Posted to commits@beam.apache.org by dh...@apache.org on 2017/04/19 19:14:35 UTC
[01/50] [abbrv] beam git commit: Changed snappy version to 1.1.4-M3
Repository: beam
Updated Branches:
refs/heads/DSL_SQL ca8760373 -> aa07a1d41
Changed snappy version to 1.1.4-M3
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/09e0f776
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/09e0f776
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/09e0f776
Branch: refs/heads/DSL_SQL
Commit: 09e0f77657d2673b8f5a78022c8f90ded51799ff
Parents: d988150
Author: Vassil Kolarov <va...@vas.io>
Authored: Wed Mar 29 15:02:36 2017 +0100
Committer: Dan Halperin <dh...@google.com>
Committed: Tue Apr 18 17:57:56 2017 -0700
----------------------------------------------------------------------
sdks/java/core/pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/09e0f776/sdks/java/core/pom.xml
----------------------------------------------------------------------
diff --git a/sdks/java/core/pom.xml b/sdks/java/core/pom.xml
index 4ba8e3b..2b12481 100644
--- a/sdks/java/core/pom.xml
+++ b/sdks/java/core/pom.xml
@@ -241,7 +241,7 @@
<dependency>
<groupId>org.xerial.snappy</groupId>
<artifactId>snappy-java</artifactId>
- <version>1.1.2.1</version>
+ <version>1.1.4-M3</version>
</dependency>
<dependency>
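
Note: the only change in this commit is the snappy-java version bump shown in the pom.xml hunk above. As an illustrative, hedged aside (not part of the commit), a compress/uncompress round trip is one quick way to sanity-check the upgraded artifact on the classpath; Snappy.compress and Snappy.uncompress are standard snappy-java API calls, and the class name here is made up for the example.

import java.nio.charset.StandardCharsets;
import org.xerial.snappy.Snappy;

public class SnappyRoundTripCheck {
  public static void main(String[] args) throws Exception {
    byte[] input = "beam snappy upgrade check".getBytes(StandardCharsets.UTF_8);
    // Compress and then uncompress with the upgraded snappy-java artifact.
    byte[] compressed = Snappy.compress(input);
    byte[] restored = Snappy.uncompress(compressed);
    if (!new String(restored, StandardCharsets.UTF_8).equals("beam snappy upgrade check")) {
      throw new AssertionError("snappy round trip failed");
    }
    System.out.println("snappy-java round trip OK, compressed size = " + compressed.length);
  }
}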
[06/50] [abbrv] beam git commit: Extracts interface from PushbackSideInputDoFnRunner
Posted by dh...@apache.org.
Extracts interface from PushbackSideInputDoFnRunner
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/7e1a2675
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/7e1a2675
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/7e1a2675
Branch: refs/heads/DSL_SQL
Commit: 7e1a2675699ef14291e8c112010be66fff4b8581
Parents: 1cc16b0
Author: Eugene Kirpichov <ki...@google.com>
Authored: Mon Apr 17 14:41:53 2017 -0700
Committer: Eugene Kirpichov <ki...@google.com>
Committed: Tue Apr 18 18:02:06 2017 -0700
----------------------------------------------------------------------
.../operators/ApexParDoOperator.java | 3 +-
.../core/PushbackSideInputDoFnRunner.java | 106 +------
.../core/SimplePushbackSideInputDoFnRunner.java | 115 ++++++++
.../core/PushbackSideInputDoFnRunnerTest.java | 282 -------------------
.../SimplePushbackSideInputDoFnRunnerTest.java | 282 +++++++++++++++++++
.../beam/runners/direct/ParDoEvaluator.java | 3 +-
.../wrappers/streaming/DoFnOperator.java | 12 +-
.../streaming/SplittableDoFnOperator.java | 2 +-
.../wrappers/streaming/WindowDoFnOperator.java | 2 +-
9 files changed, 424 insertions(+), 383 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/7e1a2675/runners/apex/src/main/java/org/apache/beam/runners/apex/translation/operators/ApexParDoOperator.java
----------------------------------------------------------------------
diff --git a/runners/apex/src/main/java/org/apache/beam/runners/apex/translation/operators/ApexParDoOperator.java b/runners/apex/src/main/java/org/apache/beam/runners/apex/translation/operators/ApexParDoOperator.java
index bad5be2..52d1d43 100644
--- a/runners/apex/src/main/java/org/apache/beam/runners/apex/translation/operators/ApexParDoOperator.java
+++ b/runners/apex/src/main/java/org/apache/beam/runners/apex/translation/operators/ApexParDoOperator.java
@@ -48,6 +48,7 @@ import org.apache.beam.runners.core.InMemoryTimerInternals;
import org.apache.beam.runners.core.KeyedWorkItem;
import org.apache.beam.runners.core.PushbackSideInputDoFnRunner;
import org.apache.beam.runners.core.SideInputHandler;
+import org.apache.beam.runners.core.SimplePushbackSideInputDoFnRunner;
import org.apache.beam.runners.core.StateInternals;
import org.apache.beam.runners.core.StateNamespace;
import org.apache.beam.runners.core.StatefulDoFnRunner;
@@ -368,7 +369,7 @@ public class ApexParDoOperator<InputT, OutputT> extends BaseOperator implements
}
pushbackDoFnRunner =
- PushbackSideInputDoFnRunner.create(doFnRunner, sideInputs, sideInputHandler);
+ SimplePushbackSideInputDoFnRunner.create(doFnRunner, sideInputs, sideInputHandler);
}
http://git-wip-us.apache.org/repos/asf/beam/blob/7e1a2675/runners/core-java/src/main/java/org/apache/beam/runners/core/PushbackSideInputDoFnRunner.java
----------------------------------------------------------------------
diff --git a/runners/core-java/src/main/java/org/apache/beam/runners/core/PushbackSideInputDoFnRunner.java b/runners/core-java/src/main/java/org/apache/beam/runners/core/PushbackSideInputDoFnRunner.java
index 4ad20b5..bab1dc7 100644
--- a/runners/core-java/src/main/java/org/apache/beam/runners/core/PushbackSideInputDoFnRunner.java
+++ b/runners/core-java/src/main/java/org/apache/beam/runners/core/PushbackSideInputDoFnRunner.java
@@ -17,113 +17,35 @@
*/
package org.apache.beam.runners.core;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Iterables;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
+import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.util.ReadyCheckingSideInputReader;
import org.apache.beam.sdk.util.TimeDomain;
import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.values.PCollectionView;
import org.joda.time.Instant;
/**
- * A {@link DoFnRunner} that can refuse to process elements that are not ready, instead returning
- * them via the {@link #processElementInReadyWindows(WindowedValue)}.
+ * Interface for runners of {@link DoFn}'s that support pushback when reading side inputs,
+ * i.e. return elements that could not be processed because they require reading a side input
+ * window that is not ready.
*/
-public class PushbackSideInputDoFnRunner<InputT, OutputT> implements DoFnRunner<InputT, OutputT> {
- private final DoFnRunner<InputT, OutputT> underlying;
- private final Collection<PCollectionView<?>> views;
- private final ReadyCheckingSideInputReader sideInputReader;
-
- private Set<BoundedWindow> notReadyWindows;
-
- public static <InputT, OutputT> PushbackSideInputDoFnRunner<InputT, OutputT> create(
- DoFnRunner<InputT, OutputT> underlying,
- Collection<PCollectionView<?>> views,
- ReadyCheckingSideInputReader sideInputReader) {
- return new PushbackSideInputDoFnRunner<>(underlying, views, sideInputReader);
- }
-
- private PushbackSideInputDoFnRunner(
- DoFnRunner<InputT, OutputT> underlying,
- Collection<PCollectionView<?>> views,
- ReadyCheckingSideInputReader sideInputReader) {
- this.underlying = underlying;
- this.views = views;
- this.sideInputReader = sideInputReader;
- }
-
- @Override
- public void startBundle() {
- notReadyWindows = new HashSet<>();
- underlying.startBundle();
- }
+public interface PushbackSideInputDoFnRunner<InputT, OutputT> {
+ /** Calls the underlying {@link DoFn.StartBundle} method. */
+ void startBundle();
/**
- * Call the underlying {@link DoFnRunner#processElement(WindowedValue)} for the provided element
+ * Call the underlying {@link DoFn.ProcessElement} method for the provided element
* for each window the element is in that is ready.
*
* @param elem the element to process in all ready windows
* @return each element that could not be processed because it requires a side input window
* that is not ready.
*/
- public Iterable<WindowedValue<InputT>> processElementInReadyWindows(WindowedValue<InputT> elem) {
- if (views.isEmpty()) {
- // When there are no side inputs, we can preserve the compressed representation.
- processElement(elem);
- return Collections.emptyList();
- }
- ImmutableList.Builder<WindowedValue<InputT>> pushedBack = ImmutableList.builder();
- for (WindowedValue<InputT> windowElem : elem.explodeWindows()) {
- BoundedWindow mainInputWindow = Iterables.getOnlyElement(windowElem.getWindows());
- if (isReady(mainInputWindow)) {
- // When there are any side inputs, we have to process the element in each window
- // individually, to disambiguate access to per-window side inputs.
- processElement(windowElem);
- } else {
- notReadyWindows.add(mainInputWindow);
- pushedBack.add(windowElem);
- }
- }
- return pushedBack.build();
- }
-
- private boolean isReady(BoundedWindow mainInputWindow) {
- if (notReadyWindows.contains(mainInputWindow)) {
- return false;
- }
- for (PCollectionView<?> view : views) {
- BoundedWindow sideInputWindow =
- view.getWindowMappingFn().getSideInputWindow(mainInputWindow);
- if (!sideInputReader.isReady(view, sideInputWindow)) {
- return false;
- }
- }
- return true;
- }
+ Iterable<WindowedValue<InputT>> processElementInReadyWindows(WindowedValue<InputT> elem);
- @Override
- public void processElement(WindowedValue<InputT> elem) {
- underlying.processElement(elem);
- }
+ /** Calls the underlying {@link DoFn.OnTimer} method. */
+ void onTimer(String timerId, BoundedWindow window, Instant timestamp,
+ TimeDomain timeDomain);
- @Override
- public void onTimer(String timerId, BoundedWindow window, Instant timestamp,
- TimeDomain timeDomain) {
- underlying.onTimer(timerId, window, timestamp, timeDomain);
- }
-
- /**
- * Call the underlying {@link DoFnRunner#finishBundle()}.
- */
- @Override
- public void finishBundle() {
- notReadyWindows = null;
- underlying.finishBundle();
- }
+ /** Calls the underlying {@link DoFn.FinishBundle} method. */
+ void finishBundle();
}
-
http://git-wip-us.apache.org/repos/asf/beam/blob/7e1a2675/runners/core-java/src/main/java/org/apache/beam/runners/core/SimplePushbackSideInputDoFnRunner.java
----------------------------------------------------------------------
diff --git a/runners/core-java/src/main/java/org/apache/beam/runners/core/SimplePushbackSideInputDoFnRunner.java b/runners/core-java/src/main/java/org/apache/beam/runners/core/SimplePushbackSideInputDoFnRunner.java
new file mode 100644
index 0000000..50d301b
--- /dev/null
+++ b/runners/core-java/src/main/java/org/apache/beam/runners/core/SimplePushbackSideInputDoFnRunner.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.core;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.util.ReadyCheckingSideInputReader;
+import org.apache.beam.sdk.util.TimeDomain;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.joda.time.Instant;
+
+/**
+ * A {@link DoFnRunner} that can refuse to process elements that are not ready, instead returning
+ * them via the {@link #processElementInReadyWindows(WindowedValue)}.
+ */
+public class SimplePushbackSideInputDoFnRunner<InputT, OutputT>
+ implements PushbackSideInputDoFnRunner<InputT, OutputT> {
+ private final DoFnRunner<InputT, OutputT> underlying;
+ private final Collection<PCollectionView<?>> views;
+ private final ReadyCheckingSideInputReader sideInputReader;
+
+ private Set<BoundedWindow> notReadyWindows;
+
+ public static <InputT, OutputT> SimplePushbackSideInputDoFnRunner<InputT, OutputT> create(
+ DoFnRunner<InputT, OutputT> underlying,
+ Collection<PCollectionView<?>> views,
+ ReadyCheckingSideInputReader sideInputReader) {
+ return new SimplePushbackSideInputDoFnRunner<>(underlying, views, sideInputReader);
+ }
+
+ private SimplePushbackSideInputDoFnRunner(
+ DoFnRunner<InputT, OutputT> underlying,
+ Collection<PCollectionView<?>> views,
+ ReadyCheckingSideInputReader sideInputReader) {
+ this.underlying = underlying;
+ this.views = views;
+ this.sideInputReader = sideInputReader;
+ }
+
+ @Override
+ public void startBundle() {
+ notReadyWindows = new HashSet<>();
+ underlying.startBundle();
+ }
+
+ @Override
+ public Iterable<WindowedValue<InputT>> processElementInReadyWindows(WindowedValue<InputT> elem) {
+ if (views.isEmpty()) {
+ // When there are no side inputs, we can preserve the compressed representation.
+ underlying.processElement(elem);
+ return Collections.emptyList();
+ }
+ ImmutableList.Builder<WindowedValue<InputT>> pushedBack = ImmutableList.builder();
+ for (WindowedValue<InputT> windowElem : elem.explodeWindows()) {
+ BoundedWindow mainInputWindow = Iterables.getOnlyElement(windowElem.getWindows());
+ if (isReady(mainInputWindow)) {
+ // When there are any side inputs, we have to process the element in each window
+ // individually, to disambiguate access to per-window side inputs.
+ underlying.processElement(windowElem);
+ } else {
+ notReadyWindows.add(mainInputWindow);
+ pushedBack.add(windowElem);
+ }
+ }
+ return pushedBack.build();
+ }
+
+ private boolean isReady(BoundedWindow mainInputWindow) {
+ if (notReadyWindows.contains(mainInputWindow)) {
+ return false;
+ }
+ for (PCollectionView<?> view : views) {
+ BoundedWindow sideInputWindow =
+ view.getWindowMappingFn().getSideInputWindow(mainInputWindow);
+ if (!sideInputReader.isReady(view, sideInputWindow)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ @Override
+ public void onTimer(String timerId, BoundedWindow window, Instant timestamp,
+ TimeDomain timeDomain) {
+ underlying.onTimer(timerId, window, timestamp, timeDomain);
+ }
+
+ @Override
+ public void finishBundle() {
+ notReadyWindows = null;
+ underlying.finishBundle();
+ }
+}
+
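
To summarize the refactoring above: PushbackSideInputDoFnRunner is now only the interface, and SimplePushbackSideInputDoFnRunner is its implementation, created through the same static create() factory as before. The following sketch (illustrative only, not part of the commit) shows roughly how a hosting runner drives the new interface per bundle and buffers pushed-back elements for retry once the side input windows become ready; PushbackBundleSketch and runBundle are hypothetical names introduced just for this example.

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.beam.runners.core.DoFnRunner;
import org.apache.beam.runners.core.PushbackSideInputDoFnRunner;
import org.apache.beam.runners.core.SimplePushbackSideInputDoFnRunner;
import org.apache.beam.sdk.util.ReadyCheckingSideInputReader;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.values.PCollectionView;

/** Illustrative sketch only; not part of this commit. */
class PushbackBundleSketch<InputT, OutputT> {
  /** Runs one bundle and returns the elements that must be retried in a later bundle. */
  List<WindowedValue<InputT>> runBundle(
      DoFnRunner<InputT, OutputT> doFnRunner,
      Collection<PCollectionView<?>> views,
      ReadyCheckingSideInputReader sideInputReader,
      Iterable<WindowedValue<InputT>> bundleElements) {
    PushbackSideInputDoFnRunner<InputT, OutputT> runner =
        SimplePushbackSideInputDoFnRunner.create(doFnRunner, views, sideInputReader);
    List<WindowedValue<InputT>> pushedBack = new ArrayList<>();
    runner.startBundle();
    for (WindowedValue<InputT> elem : bundleElements) {
      // Windows whose side inputs are not yet ready are returned here and must be
      // reprocessed by the hosting runner once those side inputs become available.
      for (WindowedValue<InputT> notReady : runner.processElementInReadyWindows(elem)) {
        pushedBack.add(notReady);
      }
    }
    runner.finishBundle();
    return pushedBack;
  }
}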
http://git-wip-us.apache.org/repos/asf/beam/blob/7e1a2675/runners/core-java/src/test/java/org/apache/beam/runners/core/PushbackSideInputDoFnRunnerTest.java
----------------------------------------------------------------------
diff --git a/runners/core-java/src/test/java/org/apache/beam/runners/core/PushbackSideInputDoFnRunnerTest.java b/runners/core-java/src/test/java/org/apache/beam/runners/core/PushbackSideInputDoFnRunnerTest.java
deleted file mode 100644
index cb057b8..0000000
--- a/runners/core-java/src/test/java/org/apache/beam/runners/core/PushbackSideInputDoFnRunnerTest.java
+++ /dev/null
@@ -1,282 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.core;
-
-import static org.hamcrest.Matchers.contains;
-import static org.hamcrest.Matchers.containsInAnyOrder;
-import static org.hamcrest.Matchers.emptyIterable;
-import static org.hamcrest.Matchers.equalTo;
-import static org.hamcrest.Matchers.is;
-import static org.junit.Assert.assertThat;
-import static org.mockito.Mockito.when;
-
-import com.google.common.collect.ImmutableList;
-import java.util.ArrayList;
-import java.util.List;
-import org.apache.beam.runners.core.TimerInternals.TimerData;
-import org.apache.beam.sdk.testing.TestPipeline;
-import org.apache.beam.sdk.transforms.Create;
-import org.apache.beam.sdk.transforms.Sum;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
-import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
-import org.apache.beam.sdk.transforms.windowing.PaneInfo;
-import org.apache.beam.sdk.transforms.windowing.Window;
-import org.apache.beam.sdk.util.IdentitySideInputWindowFn;
-import org.apache.beam.sdk.util.ReadyCheckingSideInputReader;
-import org.apache.beam.sdk.util.TimeDomain;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.hamcrest.Matchers;
-import org.joda.time.Instant;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-import org.mockito.Mock;
-import org.mockito.Mockito;
-import org.mockito.MockitoAnnotations;
-
-/**
- * Tests for {@link PushbackSideInputDoFnRunner}.
- */
-@RunWith(JUnit4.class)
-public class PushbackSideInputDoFnRunnerTest {
- @Mock private ReadyCheckingSideInputReader reader;
- private TestDoFnRunner<Integer, Integer> underlying;
- private PCollectionView<Integer> singletonView;
-
- @Rule
- public TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false);
-
- @Before
- public void setup() {
- MockitoAnnotations.initMocks(this);
- PCollection<Integer> created = p.apply(Create.of(1, 2, 3));
- singletonView =
- created
- .apply(Window.into(new IdentitySideInputWindowFn()))
- .apply(Sum.integersGlobally().asSingletonView());
-
- underlying = new TestDoFnRunner<>();
- }
-
- private PushbackSideInputDoFnRunner<Integer, Integer> createRunner(
- ImmutableList<PCollectionView<?>> views) {
- PushbackSideInputDoFnRunner<Integer, Integer> runner =
- PushbackSideInputDoFnRunner.create(underlying, views, reader);
- runner.startBundle();
- return runner;
- }
-
- @Test
- public void startFinishBundleDelegates() {
- PushbackSideInputDoFnRunner runner =
- createRunner(ImmutableList.<PCollectionView<?>>of(singletonView));
-
- assertThat(underlying.started, is(true));
- assertThat(underlying.finished, is(false));
- runner.finishBundle();
- assertThat(underlying.finished, is(true));
- }
-
- @Test
- public void processElementSideInputNotReady() {
- when(reader.isReady(Mockito.eq(singletonView), Mockito.any(BoundedWindow.class)))
- .thenReturn(false);
-
- PushbackSideInputDoFnRunner<Integer, Integer> runner =
- createRunner(ImmutableList.<PCollectionView<?>>of(singletonView));
-
- WindowedValue<Integer> oneWindow =
- WindowedValue.of(
- 2,
- new Instant(-2),
- new IntervalWindow(new Instant(-500L), new Instant(0L)),
- PaneInfo.ON_TIME_AND_ONLY_FIRING);
- Iterable<WindowedValue<Integer>> oneWindowPushback =
- runner.processElementInReadyWindows(oneWindow);
- assertThat(oneWindowPushback, containsInAnyOrder(oneWindow));
- assertThat(underlying.inputElems, Matchers.<WindowedValue<Integer>>emptyIterable());
- }
-
- @Test
- public void processElementSideInputNotReadyMultipleWindows() {
- when(reader.isReady(Mockito.eq(singletonView), Mockito.any(BoundedWindow.class)))
- .thenReturn(false);
-
- PushbackSideInputDoFnRunner<Integer, Integer> runner =
- createRunner(ImmutableList.<PCollectionView<?>>of(singletonView));
-
- WindowedValue<Integer> multiWindow =
- WindowedValue.of(
- 2,
- new Instant(-2),
- ImmutableList.of(
- new IntervalWindow(new Instant(-500L), new Instant(0L)),
- new IntervalWindow(BoundedWindow.TIMESTAMP_MIN_VALUE, new Instant(250L)),
- GlobalWindow.INSTANCE),
- PaneInfo.ON_TIME_AND_ONLY_FIRING);
- Iterable<WindowedValue<Integer>> multiWindowPushback =
- runner.processElementInReadyWindows(multiWindow);
- assertThat(multiWindowPushback, equalTo(multiWindow.explodeWindows()));
- assertThat(underlying.inputElems, Matchers.<WindowedValue<Integer>>emptyIterable());
- }
-
- @Test
- public void processElementSideInputNotReadySomeWindows() {
- when(reader.isReady(Mockito.eq(singletonView), Mockito.eq(GlobalWindow.INSTANCE)))
- .thenReturn(false);
- when(
- reader.isReady(
- Mockito.eq(singletonView),
- org.mockito.AdditionalMatchers.not(Mockito.eq(GlobalWindow.INSTANCE))))
- .thenReturn(true);
-
- PushbackSideInputDoFnRunner<Integer, Integer> runner =
- createRunner(ImmutableList.<PCollectionView<?>>of(singletonView));
-
- IntervalWindow littleWindow = new IntervalWindow(new Instant(-500L), new Instant(0L));
- IntervalWindow bigWindow =
- new IntervalWindow(BoundedWindow.TIMESTAMP_MIN_VALUE, new Instant(250L));
- WindowedValue<Integer> multiWindow =
- WindowedValue.of(
- 2,
- new Instant(-2),
- ImmutableList.of(littleWindow, bigWindow, GlobalWindow.INSTANCE),
- PaneInfo.NO_FIRING);
- Iterable<WindowedValue<Integer>> multiWindowPushback =
- runner.processElementInReadyWindows(multiWindow);
- assertThat(
- multiWindowPushback,
- containsInAnyOrder(WindowedValue.timestampedValueInGlobalWindow(2, new Instant(-2L))));
- assertThat(
- underlying.inputElems,
- containsInAnyOrder(
- WindowedValue.of(
- 2, new Instant(-2), ImmutableList.of(littleWindow), PaneInfo.NO_FIRING),
- WindowedValue.of(2, new Instant(-2), ImmutableList.of(bigWindow), PaneInfo.NO_FIRING)));
- }
-
- @Test
- public void processElementSideInputReadyAllWindows() {
- when(reader.isReady(Mockito.eq(singletonView), Mockito.any(BoundedWindow.class)))
- .thenReturn(true);
-
- ImmutableList<PCollectionView<?>> views = ImmutableList.<PCollectionView<?>>of(singletonView);
- PushbackSideInputDoFnRunner<Integer, Integer> runner = createRunner(views);
-
- WindowedValue<Integer> multiWindow =
- WindowedValue.of(
- 2,
- new Instant(-2),
- ImmutableList.of(
- new IntervalWindow(new Instant(-500L), new Instant(0L)),
- new IntervalWindow(BoundedWindow.TIMESTAMP_MIN_VALUE, new Instant(250L)),
- GlobalWindow.INSTANCE),
- PaneInfo.ON_TIME_AND_ONLY_FIRING);
- Iterable<WindowedValue<Integer>> multiWindowPushback =
- runner.processElementInReadyWindows(multiWindow);
- assertThat(multiWindowPushback, emptyIterable());
- assertThat(
- underlying.inputElems,
- containsInAnyOrder(ImmutableList.copyOf(multiWindow.explodeWindows()).toArray()));
- }
-
- @Test
- public void processElementNoSideInputs() {
- PushbackSideInputDoFnRunner<Integer, Integer> runner =
- createRunner(ImmutableList.<PCollectionView<?>>of());
-
- WindowedValue<Integer> multiWindow =
- WindowedValue.of(
- 2,
- new Instant(-2),
- ImmutableList.of(
- new IntervalWindow(new Instant(-500L), new Instant(0L)),
- new IntervalWindow(BoundedWindow.TIMESTAMP_MIN_VALUE, new Instant(250L)),
- GlobalWindow.INSTANCE),
- PaneInfo.ON_TIME_AND_ONLY_FIRING);
- Iterable<WindowedValue<Integer>> multiWindowPushback =
- runner.processElementInReadyWindows(multiWindow);
- assertThat(multiWindowPushback, emptyIterable());
- // Should preserve the compressed representation when there's no side inputs.
- assertThat(underlying.inputElems, containsInAnyOrder(multiWindow));
- }
-
- /** Tests that a call to onTimer gets delegated. */
- @Test
- public void testOnTimerCalled() {
- PushbackSideInputDoFnRunner<Integer, Integer> runner =
- createRunner(ImmutableList.<PCollectionView<?>>of());
-
- String timerId = "fooTimer";
- IntervalWindow window = new IntervalWindow(new Instant(4), new Instant(16));
- Instant timestamp = new Instant(72);
-
- // Mocking is not easily compatible with annotation analysis, so we manually record
- // the method call.
- runner.onTimer(timerId, window, new Instant(timestamp), TimeDomain.EVENT_TIME);
-
- assertThat(
- underlying.firedTimers,
- contains(
- TimerData.of(
- timerId,
- StateNamespaces.window(IntervalWindow.getCoder(), window),
- timestamp,
- TimeDomain.EVENT_TIME)));
- }
-
- private static class TestDoFnRunner<InputT, OutputT> implements DoFnRunner<InputT, OutputT> {
- List<WindowedValue<InputT>> inputElems;
- List<TimerData> firedTimers;
- private boolean started = false;
- private boolean finished = false;
-
- @Override
- public void startBundle() {
- started = true;
- inputElems = new ArrayList<>();
- firedTimers = new ArrayList<>();
- }
-
- @Override
- public void processElement(WindowedValue<InputT> elem) {
- inputElems.add(elem);
- }
-
- @Override
- public void onTimer(String timerId, BoundedWindow window, Instant timestamp,
- TimeDomain timeDomain) {
- firedTimers.add(
- TimerData.of(
- timerId,
- StateNamespaces.window(IntervalWindow.getCoder(), (IntervalWindow) window),
- timestamp,
- timeDomain));
- }
-
- @Override
- public void finishBundle() {
- finished = true;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/7e1a2675/runners/core-java/src/test/java/org/apache/beam/runners/core/SimplePushbackSideInputDoFnRunnerTest.java
----------------------------------------------------------------------
diff --git a/runners/core-java/src/test/java/org/apache/beam/runners/core/SimplePushbackSideInputDoFnRunnerTest.java b/runners/core-java/src/test/java/org/apache/beam/runners/core/SimplePushbackSideInputDoFnRunnerTest.java
new file mode 100644
index 0000000..ba3f926
--- /dev/null
+++ b/runners/core-java/src/test/java/org/apache/beam/runners/core/SimplePushbackSideInputDoFnRunnerTest.java
@@ -0,0 +1,282 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.core;
+
+import static org.hamcrest.Matchers.contains;
+import static org.hamcrest.Matchers.containsInAnyOrder;
+import static org.hamcrest.Matchers.emptyIterable;
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.is;
+import static org.junit.Assert.assertThat;
+import static org.mockito.Mockito.when;
+
+import com.google.common.collect.ImmutableList;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.beam.runners.core.TimerInternals.TimerData;
+import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.transforms.Create;
+import org.apache.beam.sdk.transforms.Sum;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
+import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
+import org.apache.beam.sdk.transforms.windowing.PaneInfo;
+import org.apache.beam.sdk.transforms.windowing.Window;
+import org.apache.beam.sdk.util.IdentitySideInputWindowFn;
+import org.apache.beam.sdk.util.ReadyCheckingSideInputReader;
+import org.apache.beam.sdk.util.TimeDomain;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.hamcrest.Matchers;
+import org.joda.time.Instant;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.MockitoAnnotations;
+
+/**
+ * Tests for {@link SimplePushbackSideInputDoFnRunner}.
+ */
+@RunWith(JUnit4.class)
+public class SimplePushbackSideInputDoFnRunnerTest {
+ @Mock private ReadyCheckingSideInputReader reader;
+ private TestDoFnRunner<Integer, Integer> underlying;
+ private PCollectionView<Integer> singletonView;
+
+ @Rule
+ public TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false);
+
+ @Before
+ public void setup() {
+ MockitoAnnotations.initMocks(this);
+ PCollection<Integer> created = p.apply(Create.of(1, 2, 3));
+ singletonView =
+ created
+ .apply(Window.into(new IdentitySideInputWindowFn()))
+ .apply(Sum.integersGlobally().asSingletonView());
+
+ underlying = new TestDoFnRunner<>();
+ }
+
+ private SimplePushbackSideInputDoFnRunner<Integer, Integer> createRunner(
+ ImmutableList<PCollectionView<?>> views) {
+ SimplePushbackSideInputDoFnRunner<Integer, Integer> runner =
+ SimplePushbackSideInputDoFnRunner.create(underlying, views, reader);
+ runner.startBundle();
+ return runner;
+ }
+
+ @Test
+ public void startFinishBundleDelegates() {
+ PushbackSideInputDoFnRunner runner =
+ createRunner(ImmutableList.<PCollectionView<?>>of(singletonView));
+
+ assertThat(underlying.started, is(true));
+ assertThat(underlying.finished, is(false));
+ runner.finishBundle();
+ assertThat(underlying.finished, is(true));
+ }
+
+ @Test
+ public void processElementSideInputNotReady() {
+ when(reader.isReady(Mockito.eq(singletonView), Mockito.any(BoundedWindow.class)))
+ .thenReturn(false);
+
+ SimplePushbackSideInputDoFnRunner<Integer, Integer> runner =
+ createRunner(ImmutableList.<PCollectionView<?>>of(singletonView));
+
+ WindowedValue<Integer> oneWindow =
+ WindowedValue.of(
+ 2,
+ new Instant(-2),
+ new IntervalWindow(new Instant(-500L), new Instant(0L)),
+ PaneInfo.ON_TIME_AND_ONLY_FIRING);
+ Iterable<WindowedValue<Integer>> oneWindowPushback =
+ runner.processElementInReadyWindows(oneWindow);
+ assertThat(oneWindowPushback, containsInAnyOrder(oneWindow));
+ assertThat(underlying.inputElems, Matchers.<WindowedValue<Integer>>emptyIterable());
+ }
+
+ @Test
+ public void processElementSideInputNotReadyMultipleWindows() {
+ when(reader.isReady(Mockito.eq(singletonView), Mockito.any(BoundedWindow.class)))
+ .thenReturn(false);
+
+ SimplePushbackSideInputDoFnRunner<Integer, Integer> runner =
+ createRunner(ImmutableList.<PCollectionView<?>>of(singletonView));
+
+ WindowedValue<Integer> multiWindow =
+ WindowedValue.of(
+ 2,
+ new Instant(-2),
+ ImmutableList.of(
+ new IntervalWindow(new Instant(-500L), new Instant(0L)),
+ new IntervalWindow(BoundedWindow.TIMESTAMP_MIN_VALUE, new Instant(250L)),
+ GlobalWindow.INSTANCE),
+ PaneInfo.ON_TIME_AND_ONLY_FIRING);
+ Iterable<WindowedValue<Integer>> multiWindowPushback =
+ runner.processElementInReadyWindows(multiWindow);
+ assertThat(multiWindowPushback, equalTo(multiWindow.explodeWindows()));
+ assertThat(underlying.inputElems, Matchers.<WindowedValue<Integer>>emptyIterable());
+ }
+
+ @Test
+ public void processElementSideInputNotReadySomeWindows() {
+ when(reader.isReady(Mockito.eq(singletonView), Mockito.eq(GlobalWindow.INSTANCE)))
+ .thenReturn(false);
+ when(
+ reader.isReady(
+ Mockito.eq(singletonView),
+ org.mockito.AdditionalMatchers.not(Mockito.eq(GlobalWindow.INSTANCE))))
+ .thenReturn(true);
+
+ SimplePushbackSideInputDoFnRunner<Integer, Integer> runner =
+ createRunner(ImmutableList.<PCollectionView<?>>of(singletonView));
+
+ IntervalWindow littleWindow = new IntervalWindow(new Instant(-500L), new Instant(0L));
+ IntervalWindow bigWindow =
+ new IntervalWindow(BoundedWindow.TIMESTAMP_MIN_VALUE, new Instant(250L));
+ WindowedValue<Integer> multiWindow =
+ WindowedValue.of(
+ 2,
+ new Instant(-2),
+ ImmutableList.of(littleWindow, bigWindow, GlobalWindow.INSTANCE),
+ PaneInfo.NO_FIRING);
+ Iterable<WindowedValue<Integer>> multiWindowPushback =
+ runner.processElementInReadyWindows(multiWindow);
+ assertThat(
+ multiWindowPushback,
+ containsInAnyOrder(WindowedValue.timestampedValueInGlobalWindow(2, new Instant(-2L))));
+ assertThat(
+ underlying.inputElems,
+ containsInAnyOrder(
+ WindowedValue.of(
+ 2, new Instant(-2), ImmutableList.of(littleWindow), PaneInfo.NO_FIRING),
+ WindowedValue.of(2, new Instant(-2), ImmutableList.of(bigWindow), PaneInfo.NO_FIRING)));
+ }
+
+ @Test
+ public void processElementSideInputReadyAllWindows() {
+ when(reader.isReady(Mockito.eq(singletonView), Mockito.any(BoundedWindow.class)))
+ .thenReturn(true);
+
+ ImmutableList<PCollectionView<?>> views = ImmutableList.<PCollectionView<?>>of(singletonView);
+ SimplePushbackSideInputDoFnRunner<Integer, Integer> runner = createRunner(views);
+
+ WindowedValue<Integer> multiWindow =
+ WindowedValue.of(
+ 2,
+ new Instant(-2),
+ ImmutableList.of(
+ new IntervalWindow(new Instant(-500L), new Instant(0L)),
+ new IntervalWindow(BoundedWindow.TIMESTAMP_MIN_VALUE, new Instant(250L)),
+ GlobalWindow.INSTANCE),
+ PaneInfo.ON_TIME_AND_ONLY_FIRING);
+ Iterable<WindowedValue<Integer>> multiWindowPushback =
+ runner.processElementInReadyWindows(multiWindow);
+ assertThat(multiWindowPushback, emptyIterable());
+ assertThat(
+ underlying.inputElems,
+ containsInAnyOrder(ImmutableList.copyOf(multiWindow.explodeWindows()).toArray()));
+ }
+
+ @Test
+ public void processElementNoSideInputs() {
+ SimplePushbackSideInputDoFnRunner<Integer, Integer> runner =
+ createRunner(ImmutableList.<PCollectionView<?>>of());
+
+ WindowedValue<Integer> multiWindow =
+ WindowedValue.of(
+ 2,
+ new Instant(-2),
+ ImmutableList.of(
+ new IntervalWindow(new Instant(-500L), new Instant(0L)),
+ new IntervalWindow(BoundedWindow.TIMESTAMP_MIN_VALUE, new Instant(250L)),
+ GlobalWindow.INSTANCE),
+ PaneInfo.ON_TIME_AND_ONLY_FIRING);
+ Iterable<WindowedValue<Integer>> multiWindowPushback =
+ runner.processElementInReadyWindows(multiWindow);
+ assertThat(multiWindowPushback, emptyIterable());
+ // Should preserve the compressed representation when there's no side inputs.
+ assertThat(underlying.inputElems, containsInAnyOrder(multiWindow));
+ }
+
+ /** Tests that a call to onTimer gets delegated. */
+ @Test
+ public void testOnTimerCalled() {
+ PushbackSideInputDoFnRunner<Integer, Integer> runner =
+ createRunner(ImmutableList.<PCollectionView<?>>of());
+
+ String timerId = "fooTimer";
+ IntervalWindow window = new IntervalWindow(new Instant(4), new Instant(16));
+ Instant timestamp = new Instant(72);
+
+ // Mocking is not easily compatible with annotation analysis, so we manually record
+ // the method call.
+ runner.onTimer(timerId, window, new Instant(timestamp), TimeDomain.EVENT_TIME);
+
+ assertThat(
+ underlying.firedTimers,
+ contains(
+ TimerData.of(
+ timerId,
+ StateNamespaces.window(IntervalWindow.getCoder(), window),
+ timestamp,
+ TimeDomain.EVENT_TIME)));
+ }
+
+ private static class TestDoFnRunner<InputT, OutputT> implements DoFnRunner<InputT, OutputT> {
+ List<WindowedValue<InputT>> inputElems;
+ List<TimerData> firedTimers;
+ private boolean started = false;
+ private boolean finished = false;
+
+ @Override
+ public void startBundle() {
+ started = true;
+ inputElems = new ArrayList<>();
+ firedTimers = new ArrayList<>();
+ }
+
+ @Override
+ public void processElement(WindowedValue<InputT> elem) {
+ inputElems.add(elem);
+ }
+
+ @Override
+ public void onTimer(String timerId, BoundedWindow window, Instant timestamp,
+ TimeDomain timeDomain) {
+ firedTimers.add(
+ TimerData.of(
+ timerId,
+ StateNamespaces.window(IntervalWindow.getCoder(), (IntervalWindow) window),
+ timestamp,
+ timeDomain));
+ }
+
+ @Override
+ public void finishBundle() {
+ finished = true;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/7e1a2675/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluator.java
----------------------------------------------------------------------
diff --git a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluator.java b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluator.java
index 131716f..bab7b2c 100644
--- a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluator.java
+++ b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluator.java
@@ -26,6 +26,7 @@ import org.apache.beam.runners.core.DoFnRunner;
import org.apache.beam.runners.core.DoFnRunners;
import org.apache.beam.runners.core.DoFnRunners.OutputManager;
import org.apache.beam.runners.core.PushbackSideInputDoFnRunner;
+import org.apache.beam.runners.core.SimplePushbackSideInputDoFnRunner;
import org.apache.beam.runners.core.TimerInternals.TimerData;
import org.apache.beam.runners.direct.DirectExecutionContext.DirectStepContext;
import org.apache.beam.runners.direct.DirectRunner.UncommittedBundle;
@@ -85,7 +86,7 @@ class ParDoEvaluator<InputT> implements TransformEvaluator<InputT> {
aggregatorChanges,
windowingStrategy);
PushbackSideInputDoFnRunner<InputT, OutputT> runner =
- PushbackSideInputDoFnRunner.create(underlying, sideInputs, sideInputReader);
+ SimplePushbackSideInputDoFnRunner.create(underlying, sideInputs, sideInputReader);
try {
runner.startBundle();
http://git-wip-us.apache.org/repos/asf/beam/blob/7e1a2675/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java
index 5496f71..8a09286 100644
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java
+++ b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java
@@ -37,6 +37,7 @@ import org.apache.beam.runners.core.ExecutionContext;
import org.apache.beam.runners.core.GroupAlsoByWindowViaWindowSetNewDoFn;
import org.apache.beam.runners.core.PushbackSideInputDoFnRunner;
import org.apache.beam.runners.core.SideInputHandler;
+import org.apache.beam.runners.core.SimplePushbackSideInputDoFnRunner;
import org.apache.beam.runners.core.StateInternals;
import org.apache.beam.runners.core.StateNamespace;
import org.apache.beam.runners.core.StateNamespaces;
@@ -119,6 +120,7 @@ public class DoFnOperator<InputT, FnOutputT, OutputT>
protected final OutputManagerFactory<OutputT> outputManagerFactory;
+ protected transient DoFnRunner<InputT, FnOutputT> doFnRunner;
protected transient PushbackSideInputDoFnRunner<InputT, FnOutputT> pushbackDoFnRunner;
protected transient SideInputHandler sideInputHandler;
@@ -269,7 +271,7 @@ public class DoFnOperator<InputT, FnOutputT, OutputT>
ExecutionContext.StepContext stepContext = createStepContext();
- DoFnRunner<InputT, FnOutputT> doFnRunner = DoFnRunners.simpleRunner(
+ doFnRunner = DoFnRunners.simpleRunner(
serializedOptions.getPipelineOptions(),
doFn,
sideInputReader,
@@ -320,7 +322,7 @@ public class DoFnOperator<InputT, FnOutputT, OutputT>
}
pushbackDoFnRunner =
- PushbackSideInputDoFnRunner.create(doFnRunner, sideInputs, sideInputHandler);
+ SimplePushbackSideInputDoFnRunner.create(doFnRunner, sideInputs, sideInputHandler);
}
@Override
@@ -362,9 +364,9 @@ public class DoFnOperator<InputT, FnOutputT, OutputT>
@Override
public final void processElement(
StreamRecord<WindowedValue<InputT>> streamRecord) throws Exception {
- pushbackDoFnRunner.startBundle();
- pushbackDoFnRunner.processElement(streamRecord.getValue());
- pushbackDoFnRunner.finishBundle();
+ doFnRunner.startBundle();
+ doFnRunner.processElement(streamRecord.getValue());
+ doFnRunner.finishBundle();
}
private void setPushedBackWatermark(long watermark) {
http://git-wip-us.apache.org/repos/asf/beam/blob/7e1a2675/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SplittableDoFnOperator.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SplittableDoFnOperator.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SplittableDoFnOperator.java
index 1a636c9..40f70e4 100644
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SplittableDoFnOperator.java
+++ b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SplittableDoFnOperator.java
@@ -142,7 +142,7 @@ public class SplittableDoFnOperator<
@Override
public void fireTimer(InternalTimer<?, TimerInternals.TimerData> timer) {
- pushbackDoFnRunner.processElement(WindowedValue.valueInGlobalWindow(
+ doFnRunner.processElement(WindowedValue.valueInGlobalWindow(
KeyedWorkItems.<String, ElementAndRestriction<InputT, RestrictionT>>timersWorkItem(
(String) stateInternals.getKey(),
Collections.singletonList(timer.getNamespace()))));
http://git-wip-us.apache.org/repos/asf/beam/blob/7e1a2675/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WindowDoFnOperator.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WindowDoFnOperator.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WindowDoFnOperator.java
index 7b899f4..9b2136c 100644
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WindowDoFnOperator.java
+++ b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WindowDoFnOperator.java
@@ -108,7 +108,7 @@ public class WindowDoFnOperator<K, InputT, OutputT>
@Override
public void fireTimer(InternalTimer<?, TimerData> timer) {
- pushbackDoFnRunner.processElement(WindowedValue.valueInGlobalWindow(
+ doFnRunner.processElement(WindowedValue.valueInGlobalWindow(
KeyedWorkItems.<K, InputT>timersWorkItem(
(K) stateInternals.getKey(),
Collections.singletonList(timer.getNamespace()))));
[32/50] [abbrv] beam git commit: [BEAM-1994] Remove Flink examples package
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/TestCountingSource.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/TestCountingSource.java b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/TestCountingSource.java
deleted file mode 100644
index 3a08088..0000000
--- a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/TestCountingSource.java
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.streaming;
-
-import static org.apache.beam.sdk.util.CoderUtils.encodeToByteArray;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.concurrent.ThreadLocalRandom;
-import javax.annotation.Nullable;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.coders.DelegateCoder;
-import org.apache.beam.sdk.coders.KvCoder;
-import org.apache.beam.sdk.coders.VarIntCoder;
-import org.apache.beam.sdk.io.UnboundedSource;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.values.KV;
-import org.joda.time.Instant;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * An unbounded source for testing the unbounded sources framework code.
- *
- * <p>Each split of this sources produces records of the form KV(split_id, i),
- * where i counts up from 0. Each record has a timestamp of i, and the watermark
- * accurately tracks these timestamps. The reader will occasionally return false
- * from {@code advance}, in order to simulate a source where not all the data is
- * available immediately.
- */
-public class TestCountingSource
- extends UnboundedSource<KV<Integer, Integer>, TestCountingSource.CounterMark> {
- private static final Logger LOG = LoggerFactory.getLogger(TestCountingSource.class);
-
- private static List<Integer> finalizeTracker;
- private final int numMessagesPerShard;
- private final int shardNumber;
- private final boolean dedup;
- private final boolean throwOnFirstSnapshot;
- private final boolean allowSplitting;
-
- /**
- * We only allow an exception to be thrown from getCheckpointMark
- * at most once. This must be static since the entire TestCountingSource
- * instance may re-serialized when the pipeline recovers and retries.
- */
- private static boolean thrown = false;
-
- public static void setFinalizeTracker(List<Integer> finalizeTracker) {
- TestCountingSource.finalizeTracker = finalizeTracker;
- }
-
- public TestCountingSource(int numMessagesPerShard) {
- this(numMessagesPerShard, 0, false, false, true);
- }
-
- public TestCountingSource withDedup() {
- return new TestCountingSource(
- numMessagesPerShard, shardNumber, true, throwOnFirstSnapshot, true);
- }
-
- private TestCountingSource withShardNumber(int shardNumber) {
- return new TestCountingSource(
- numMessagesPerShard, shardNumber, dedup, throwOnFirstSnapshot, true);
- }
-
- public TestCountingSource withThrowOnFirstSnapshot(boolean throwOnFirstSnapshot) {
- return new TestCountingSource(
- numMessagesPerShard, shardNumber, dedup, throwOnFirstSnapshot, true);
- }
-
- public TestCountingSource withoutSplitting() {
- return new TestCountingSource(
- numMessagesPerShard, shardNumber, dedup, throwOnFirstSnapshot, false);
- }
-
- private TestCountingSource(int numMessagesPerShard, int shardNumber, boolean dedup,
- boolean throwOnFirstSnapshot, boolean allowSplitting) {
- this.numMessagesPerShard = numMessagesPerShard;
- this.shardNumber = shardNumber;
- this.dedup = dedup;
- this.throwOnFirstSnapshot = throwOnFirstSnapshot;
- this.allowSplitting = allowSplitting;
- }
-
- public int getShardNumber() {
- return shardNumber;
- }
-
- @Override
- public List<TestCountingSource> split(
- int desiredNumSplits, PipelineOptions options) {
- List<TestCountingSource> splits = new ArrayList<>();
- int numSplits = allowSplitting ? desiredNumSplits : 1;
- for (int i = 0; i < numSplits; i++) {
- splits.add(withShardNumber(i));
- }
- return splits;
- }
-
- class CounterMark implements UnboundedSource.CheckpointMark {
- int current;
-
- public CounterMark(int current) {
- this.current = current;
- }
-
- @Override
- public void finalizeCheckpoint() {
- if (finalizeTracker != null) {
- finalizeTracker.add(current);
- }
- }
- }
-
- @Override
- public Coder<CounterMark> getCheckpointMarkCoder() {
- return DelegateCoder.of(
- VarIntCoder.of(),
- new DelegateCoder.CodingFunction<CounterMark, Integer>() {
- @Override
- public Integer apply(CounterMark input) {
- return input.current;
- }
- },
- new DelegateCoder.CodingFunction<Integer, CounterMark>() {
- @Override
- public CounterMark apply(Integer input) {
- return new CounterMark(input);
- }
- });
- }
-
- @Override
- public boolean requiresDeduping() {
- return dedup;
- }
-
- /**
- * Public only so that the checkpoint can be conveyed from {@link #getCheckpointMark()} to
- * {@link TestCountingSource#createReader(PipelineOptions, CounterMark)} without cast.
- */
- public class CountingSourceReader extends UnboundedReader<KV<Integer, Integer>> {
- private int current;
-
- public CountingSourceReader(int startingPoint) {
- this.current = startingPoint;
- }
-
- @Override
- public boolean start() {
- return advance();
- }
-
- @Override
- public boolean advance() {
- if (current >= numMessagesPerShard - 1) {
- return false;
- }
- // If testing dedup, occasionally insert a duplicate value;
- if (current >= 0 && dedup && ThreadLocalRandom.current().nextInt(5) == 0) {
- return true;
- }
- current++;
- return true;
- }
-
- @Override
- public KV<Integer, Integer> getCurrent() {
- return KV.of(shardNumber, current);
- }
-
- @Override
- public Instant getCurrentTimestamp() {
- return new Instant(current);
- }
-
- @Override
- public byte[] getCurrentRecordId() {
- try {
- return encodeToByteArray(KvCoder.of(VarIntCoder.of(), VarIntCoder.of()), getCurrent());
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
-
- @Override
- public void close() {}
-
- @Override
- public TestCountingSource getCurrentSource() {
- return TestCountingSource.this;
- }
-
- @Override
- public Instant getWatermark() {
- // The watermark is a promise about future elements, and the timestamps of elements are
- // strictly increasing for this source.
- return new Instant(current + 1);
- }
-
- @Override
- public CounterMark getCheckpointMark() {
- if (throwOnFirstSnapshot && !thrown) {
- thrown = true;
- LOG.error("Throwing exception while checkpointing counter");
- throw new RuntimeException("failed during checkpoint");
- }
- // The checkpoint can assume all records read, including the current, have
- // been commited.
- return new CounterMark(current);
- }
-
- @Override
- public long getSplitBacklogBytes() {
- return 7L;
- }
- }
-
- @Override
- public CountingSourceReader createReader(
- PipelineOptions options, @Nullable CounterMark checkpointMark) {
- if (checkpointMark == null) {
- LOG.debug("creating reader");
- } else {
- LOG.debug("restoring reader from checkpoint with current = {}", checkpointMark.current);
- }
- return new CountingSourceReader(checkpointMark != null ? checkpointMark.current : -1);
- }
-
- @Override
- public void validate() {}
-
- @Override
- public Coder<KV<Integer, Integer>> getDefaultOutputCoder() {
- return KvCoder.of(VarIntCoder.of(), VarIntCoder.of());
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/TopWikipediaSessionsITCase.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/TopWikipediaSessionsITCase.java b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/TopWikipediaSessionsITCase.java
deleted file mode 100644
index 9e6bba8..0000000
--- a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/TopWikipediaSessionsITCase.java
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.streaming;
-
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.common.base.Joiner;
-import java.io.Serializable;
-import java.util.Arrays;
-import org.apache.beam.runners.flink.FlinkTestPipeline;
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.io.TextIO;
-import org.apache.beam.sdk.transforms.Count;
-import org.apache.beam.sdk.transforms.Create;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.transforms.windowing.Sessions;
-import org.apache.beam.sdk.transforms.windowing.Window;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.flink.streaming.util.StreamingProgramTestBase;
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-
-
-/**
- * Session window test.
- */
-public class TopWikipediaSessionsITCase extends StreamingProgramTestBase implements Serializable {
- protected String resultPath;
-
- public TopWikipediaSessionsITCase(){
- }
-
- static final String[] EXPECTED_RESULT = new String[] {
- "user: user1 value:3",
- "user: user1 value:1",
- "user: user2 value:4",
- "user: user2 value:6",
- "user: user3 value:7",
- "user: user3 value:2"
- };
-
- @Override
- protected void preSubmit() throws Exception {
- resultPath = getTempDirPath("result");
- }
-
- @Override
- protected void postSubmit() throws Exception {
- compareResultsByLinesInMemory(Joiner.on('\n').join(EXPECTED_RESULT), resultPath);
- }
-
- @Override
- protected void testProgram() throws Exception {
-
- Pipeline p = FlinkTestPipeline.createForStreaming();
-
- Long now = (System.currentTimeMillis() + 10000) / 1000;
-
- PCollection<KV<String, Long>> output =
- p.apply(Create.of(Arrays.asList(new TableRow().set("timestamp", now).set
- ("contributor_username", "user1"), new TableRow().set("timestamp", now + 10).set
- ("contributor_username", "user3"), new TableRow().set("timestamp", now).set
- ("contributor_username", "user2"), new TableRow().set("timestamp", now).set
- ("contributor_username", "user1"), new TableRow().set("timestamp", now + 2).set
- ("contributor_username", "user1"), new TableRow().set("timestamp", now).set
- ("contributor_username", "user2"), new TableRow().set("timestamp", now + 1).set
- ("contributor_username", "user2"), new TableRow().set("timestamp", now + 5).set
- ("contributor_username", "user2"), new TableRow().set("timestamp", now + 7).set
- ("contributor_username", "user2"), new TableRow().set("timestamp", now + 8).set
- ("contributor_username", "user2"), new TableRow().set("timestamp", now + 200).set
- ("contributor_username", "user2"), new TableRow().set("timestamp", now + 230).set
- ("contributor_username", "user1"), new TableRow().set("timestamp", now + 230).set
- ("contributor_username", "user2"), new TableRow().set("timestamp", now + 240).set
- ("contributor_username", "user2"), new TableRow().set("timestamp", now + 245).set
- ("contributor_username", "user3"), new TableRow().set("timestamp", now + 235).set
- ("contributor_username", "user3"), new TableRow().set("timestamp", now + 236).set
- ("contributor_username", "user3"), new TableRow().set("timestamp", now + 237).set
- ("contributor_username", "user3"), new TableRow().set("timestamp", now + 238).set
- ("contributor_username", "user3"), new TableRow().set("timestamp", now + 239).set
- ("contributor_username", "user3"), new TableRow().set("timestamp", now + 240).set
- ("contributor_username", "user3"), new TableRow().set("timestamp", now + 241).set
- ("contributor_username", "user2"), new TableRow().set("timestamp", now)
- .set("contributor_username", "user3"))))
-
-
-
- .apply(ParDo.of(new DoFn<TableRow, String>() {
- @ProcessElement
- public void processElement(ProcessContext c) throws Exception {
- TableRow row = c.element();
- long timestamp = (Integer) row.get("timestamp");
- String userName = (String) row.get("contributor_username");
- if (userName != null) {
- // Sets the timestamp field to be used in windowing.
- c.outputWithTimestamp(userName, new Instant(timestamp * 1000L));
- }
- }
- }))
-
- .apply(Window.<String>into(Sessions.withGapDuration(Duration.standardMinutes(1))))
-
- .apply(Count.<String>perElement());
-
- PCollection<String> format = output.apply(ParDo.of(new DoFn<KV<String, Long>, String>() {
- @ProcessElement
- public void processElement(ProcessContext c) throws Exception {
- KV<String, Long> el = c.element();
- String out = "user: " + el.getKey() + " value:" + el.getValue();
- c.output(out);
- }
- }));
-
- format.apply(TextIO.Write.to(resultPath));
-
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/UnboundedSourceWrapperTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/UnboundedSourceWrapperTest.java b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/UnboundedSourceWrapperTest.java
deleted file mode 100644
index 90f95d6..0000000
--- a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/UnboundedSourceWrapperTest.java
+++ /dev/null
@@ -1,464 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.streaming;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.when;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.io.UnboundedSourceWrapper;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.io.UnboundedSource;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.options.PipelineOptionsFactory;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.values.KV;
-import org.apache.flink.api.common.ExecutionConfig;
-import org.apache.flink.api.common.accumulators.Accumulator;
-import org.apache.flink.api.common.state.ListState;
-import org.apache.flink.api.common.state.ListStateDescriptor;
-import org.apache.flink.api.common.state.OperatorStateStore;
-import org.apache.flink.configuration.Configuration;
-import org.apache.flink.runtime.execution.Environment;
-import org.apache.flink.runtime.operators.testutils.DummyEnvironment;
-import org.apache.flink.runtime.state.StateInitializationContext;
-import org.apache.flink.runtime.state.StateSnapshotContextSynchronousImpl;
-import org.apache.flink.streaming.api.TimeCharacteristic;
-import org.apache.flink.streaming.api.graph.StreamConfig;
-import org.apache.flink.streaming.api.operators.Output;
-import org.apache.flink.streaming.api.operators.StreamSource;
-import org.apache.flink.streaming.api.watermark.Watermark;
-import org.apache.flink.streaming.runtime.streamrecord.LatencyMarker;
-import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
-import org.apache.flink.streaming.runtime.tasks.StreamTask;
-import org.apache.flink.streaming.runtime.tasks.TestProcessingTimeService;
-import org.apache.flink.util.InstantiationUtil;
-import org.junit.Test;
-import org.junit.experimental.runners.Enclosed;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.mockito.Matchers;
-
-/**
- * Tests for {@link UnboundedSourceWrapper}.
- */
-@RunWith(Enclosed.class)
-public class UnboundedSourceWrapperTest {
-
- /**
- * Parameterized tests.
- */
- @RunWith(Parameterized.class)
- public static class UnboundedSourceWrapperTestWithParams {
- private final int numTasks;
- private final int numSplits;
-
- public UnboundedSourceWrapperTestWithParams(int numTasks, int numSplits) {
- this.numTasks = numTasks;
- this.numSplits = numSplits;
- }
-
- @Parameterized.Parameters
- public static Collection<Object[]> data() {
- /*
- * Parameters for initializing the tests:
- * {numTasks, numSplits}
- * The test currently assumes powers of two for some assertions.
- */
- return Arrays.asList(new Object[][]{
- {1, 1}, {1, 2}, {1, 4},
- {2, 1}, {2, 2}, {2, 4},
- {4, 1}, {4, 2}, {4, 4}
- });
- }
-
- /**
- * Creates a {@link UnboundedSourceWrapper} that has one or multiple readers per source.
- * If numSplits > numTasks, one source will manage multiple readers.
- */
- @Test
- public void testReaders() throws Exception {
- final int numElements = 20;
- final Object checkpointLock = new Object();
- PipelineOptions options = PipelineOptionsFactory.create();
-
- // this source will emit exactly NUM_ELEMENTS across all parallel readers,
- // afterwards it will stall. We check whether we also receive NUM_ELEMENTS
- // elements later.
- TestCountingSource source = new TestCountingSource(numElements);
- UnboundedSourceWrapper<KV<Integer, Integer>, TestCountingSource.CounterMark> flinkWrapper =
- new UnboundedSourceWrapper<>(options, source, numSplits);
-
- assertEquals(numSplits, flinkWrapper.getSplitSources().size());
-
- StreamSource<WindowedValue<
- KV<Integer, Integer>>,
- UnboundedSourceWrapper<
- KV<Integer, Integer>,
- TestCountingSource.CounterMark>> sourceOperator = new StreamSource<>(flinkWrapper);
-
- setupSourceOperator(sourceOperator, numTasks);
-
- try {
- sourceOperator.open();
- sourceOperator.run(checkpointLock,
- new Output<StreamRecord<WindowedValue<KV<Integer, Integer>>>>() {
- private int count = 0;
-
- @Override
- public void emitWatermark(Watermark watermark) {
- }
-
- @Override
- public void emitLatencyMarker(LatencyMarker latencyMarker) {
- }
-
- @Override
- public void collect(
- StreamRecord<WindowedValue<KV<Integer, Integer>>> windowedValueStreamRecord) {
-
- count++;
- if (count >= numElements) {
- throw new SuccessException();
- }
- }
-
- @Override
- public void close() {
-
- }
- });
- } catch (SuccessException e) {
-
- assertEquals(Math.max(1, numSplits / numTasks), flinkWrapper.getLocalSplitSources().size());
-
- // success
- return;
- }
- fail("Read terminated without producing expected number of outputs");
- }
-
- /**
- * Verify that snapshot/restore work as expected. We bring up a source and cancel
- * after seeing a certain number of elements. Then we snapshot that source,
- * bring up a completely new source that we restore from the snapshot and verify
- * that we see all expected elements in the end.
- */
- @Test
- public void testRestore() throws Exception {
- final int numElements = 20;
- final Object checkpointLock = new Object();
- PipelineOptions options = PipelineOptionsFactory.create();
-
- // this source will emit exactly NUM_ELEMENTS across all parallel readers,
- // afterwards it will stall. We check whether we also receive NUM_ELEMENTS
- // elements later.
- TestCountingSource source = new TestCountingSource(numElements);
- UnboundedSourceWrapper<KV<Integer, Integer>, TestCountingSource.CounterMark> flinkWrapper =
- new UnboundedSourceWrapper<>(options, source, numSplits);
-
- assertEquals(numSplits, flinkWrapper.getSplitSources().size());
-
- StreamSource<
- WindowedValue<KV<Integer, Integer>>,
- UnboundedSourceWrapper<
- KV<Integer, Integer>,
- TestCountingSource.CounterMark>> sourceOperator = new StreamSource<>(flinkWrapper);
-
-
- OperatorStateStore backend = mock(OperatorStateStore.class);
-
- TestingListState<KV<UnboundedSource, TestCountingSource.CounterMark>>
- listState = new TestingListState<>();
-
- when(backend.getOperatorState(Matchers.any(ListStateDescriptor.class)))
- .thenReturn(listState);
-
- StateInitializationContext initializationContext = mock(StateInitializationContext.class);
-
- when(initializationContext.getOperatorStateStore()).thenReturn(backend);
- when(initializationContext.isRestored()).thenReturn(false, true);
-
- flinkWrapper.initializeState(initializationContext);
-
- setupSourceOperator(sourceOperator, numTasks);
-
- final Set<KV<Integer, Integer>> emittedElements = new HashSet<>();
-
- boolean readFirstBatchOfElements = false;
-
- try {
- sourceOperator.open();
- sourceOperator.run(checkpointLock,
- new Output<StreamRecord<WindowedValue<KV<Integer, Integer>>>>() {
- private int count = 0;
-
- @Override
- public void emitWatermark(Watermark watermark) {
- }
-
- @Override
- public void emitLatencyMarker(LatencyMarker latencyMarker) {
- }
-
- @Override
- public void collect(
- StreamRecord<WindowedValue<KV<Integer, Integer>>> windowedValueStreamRecord) {
-
- emittedElements.add(windowedValueStreamRecord.getValue().getValue());
- count++;
- if (count >= numElements / 2) {
- throw new SuccessException();
- }
- }
-
- @Override
- public void close() {
-
- }
- });
- } catch (SuccessException e) {
- // success
- readFirstBatchOfElements = true;
- }
-
- assertTrue("Did not successfully read first batch of elements.", readFirstBatchOfElements);
-
- // draw a snapshot
- flinkWrapper.snapshotState(new StateSnapshotContextSynchronousImpl(0, 0));
-
- // test snapshot offsets
- assertEquals(flinkWrapper.getLocalSplitSources().size(),
- listState.getList().size());
- int totalEmit = 0;
- for (KV<UnboundedSource, TestCountingSource.CounterMark> kv : listState.get()) {
- totalEmit += kv.getValue().current + 1;
- }
- assertEquals(numElements / 2, totalEmit);
-
- // test that finalizeCheckpoint on CheckpointMark is called
- final ArrayList<Integer> finalizeList = new ArrayList<>();
- TestCountingSource.setFinalizeTracker(finalizeList);
- flinkWrapper.notifyCheckpointComplete(0);
- assertEquals(flinkWrapper.getLocalSplitSources().size(), finalizeList.size());
-
- // create a completely new source but restore from the snapshot
- TestCountingSource restoredSource = new TestCountingSource(numElements);
- UnboundedSourceWrapper<
- KV<Integer, Integer>, TestCountingSource.CounterMark> restoredFlinkWrapper =
- new UnboundedSourceWrapper<>(options, restoredSource, numSplits);
-
- assertEquals(numSplits, restoredFlinkWrapper.getSplitSources().size());
-
- StreamSource<
- WindowedValue<KV<Integer, Integer>>,
- UnboundedSourceWrapper<
- KV<Integer, Integer>,
- TestCountingSource.CounterMark>> restoredSourceOperator =
- new StreamSource<>(restoredFlinkWrapper);
-
- setupSourceOperator(restoredSourceOperator, numTasks);
-
- // restore snapshot
- restoredFlinkWrapper.initializeState(initializationContext);
-
- boolean readSecondBatchOfElements = false;
-
- // run again and verify that we see the other elements
- try {
- restoredSourceOperator.open();
- restoredSourceOperator.run(checkpointLock,
- new Output<StreamRecord<WindowedValue<KV<Integer, Integer>>>>() {
- private int count = 0;
-
- @Override
- public void emitWatermark(Watermark watermark) {
- }
-
- @Override
- public void emitLatencyMarker(LatencyMarker latencyMarker) {
- }
-
- @Override
- public void collect(
- StreamRecord<WindowedValue<KV<Integer, Integer>>> windowedValueStreamRecord) {
- emittedElements.add(windowedValueStreamRecord.getValue().getValue());
- count++;
- if (count >= numElements / 2) {
- throw new SuccessException();
- }
- }
-
- @Override
- public void close() {
-
- }
- });
- } catch (SuccessException e) {
- // success
- readSecondBatchOfElements = true;
- }
-
- assertEquals(Math.max(1, numSplits / numTasks), flinkWrapper.getLocalSplitSources().size());
-
- assertTrue("Did not successfully read second batch of elements.", readSecondBatchOfElements);
-
- // verify that we saw all NUM_ELEMENTS elements
- assertTrue(emittedElements.size() == numElements);
- }
-
- @Test
- public void testNullCheckpoint() throws Exception {
- final int numElements = 20;
- PipelineOptions options = PipelineOptionsFactory.create();
-
- TestCountingSource source = new TestCountingSource(numElements) {
- @Override
- public Coder<CounterMark> getCheckpointMarkCoder() {
- return null;
- }
- };
- UnboundedSourceWrapper<KV<Integer, Integer>, TestCountingSource.CounterMark> flinkWrapper =
- new UnboundedSourceWrapper<>(options, source, numSplits);
-
- OperatorStateStore backend = mock(OperatorStateStore.class);
-
- TestingListState<KV<UnboundedSource, TestCountingSource.CounterMark>>
- listState = new TestingListState<>();
-
- when(backend.getOperatorState(Matchers.any(ListStateDescriptor.class)))
- .thenReturn(listState);
-
- StateInitializationContext initializationContext = mock(StateInitializationContext.class);
-
- when(initializationContext.getOperatorStateStore()).thenReturn(backend);
- when(initializationContext.isRestored()).thenReturn(false, true);
-
- flinkWrapper.initializeState(initializationContext);
-
- StreamSource sourceOperator = new StreamSource<>(flinkWrapper);
- setupSourceOperator(sourceOperator, numTasks);
- sourceOperator.open();
-
- flinkWrapper.snapshotState(new StateSnapshotContextSynchronousImpl(0, 0));
-
- assertEquals(0, listState.getList().size());
-
- UnboundedSourceWrapper<
- KV<Integer, Integer>, TestCountingSource.CounterMark> restoredFlinkWrapper =
- new UnboundedSourceWrapper<>(options, new TestCountingSource(numElements),
- numSplits);
-
- StreamSource restoredSourceOperator = new StreamSource<>(flinkWrapper);
- setupSourceOperator(restoredSourceOperator, numTasks);
- sourceOperator.open();
-
- restoredFlinkWrapper.initializeState(initializationContext);
-
- assertEquals(Math.max(1, numSplits / numTasks), flinkWrapper.getLocalSplitSources().size());
-
- }
-
- @SuppressWarnings("unchecked")
- private static <T> void setupSourceOperator(StreamSource<T, ?> operator, int numSubTasks) {
- ExecutionConfig executionConfig = new ExecutionConfig();
- StreamConfig cfg = new StreamConfig(new Configuration());
-
- cfg.setTimeCharacteristic(TimeCharacteristic.EventTime);
-
- Environment env = new DummyEnvironment("MockTwoInputTask", numSubTasks, 0);
-
- StreamTask<?, ?> mockTask = mock(StreamTask.class);
- when(mockTask.getName()).thenReturn("Mock Task");
- when(mockTask.getCheckpointLock()).thenReturn(new Object());
- when(mockTask.getConfiguration()).thenReturn(cfg);
- when(mockTask.getEnvironment()).thenReturn(env);
- when(mockTask.getExecutionConfig()).thenReturn(executionConfig);
- when(mockTask.getAccumulatorMap())
- .thenReturn(Collections.<String, Accumulator<?, ?>>emptyMap());
- TestProcessingTimeService testProcessingTimeService = new TestProcessingTimeService();
- when(mockTask.getProcessingTimeService()).thenReturn(testProcessingTimeService);
-
- operator.setup(mockTask, cfg, (Output<StreamRecord<T>>) mock(Output.class));
- }
-
- /**
- * A special {@link RuntimeException} that we throw to signal that the test was successful.
- */
- private static class SuccessException extends RuntimeException {
- }
- }
-
- /**
- * Not parameterized tests.
- */
- public static class BasicTest {
-
- /**
- * Check serialization of a {@link UnboundedSourceWrapper}.
- */
- @Test
- public void testSerialization() throws Exception {
- final int parallelism = 1;
- final int numElements = 20;
- PipelineOptions options = PipelineOptionsFactory.create();
-
- TestCountingSource source = new TestCountingSource(numElements);
- UnboundedSourceWrapper<KV<Integer, Integer>, TestCountingSource.CounterMark> flinkWrapper =
- new UnboundedSourceWrapper<>(options, source, parallelism);
-
- InstantiationUtil.serializeObject(flinkWrapper);
- }
-
- }
-
- private static final class TestingListState<T> implements ListState<T> {
-
- private final List<T> list = new ArrayList<>();
-
- @Override
- public void clear() {
- list.clear();
- }
-
- @Override
- public Iterable<T> get() throws Exception {
- return list;
- }
-
- @Override
- public void add(T value) throws Exception {
- list.add(value);
- }
-
- public List<T> getList() {
- return list;
- }
-
- }
-
-}
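
The snapshot/restore interplay that testRestore verifies can be summarized in a few calls; the sketch below is lifted from the test flow above, with the surrounding setup (mocks, outputs, counters) omitted.

    // Checkpoint the running wrapper and finalize the checkpoint mark.
    flinkWrapper.snapshotState(new StateSnapshotContextSynchronousImpl(0, 0));
    flinkWrapper.notifyCheckpointComplete(0);

    // Bring up a completely new wrapper and restore it from the snapshotted state.
    UnboundedSourceWrapper<KV<Integer, Integer>, TestCountingSource.CounterMark> restored =
        new UnboundedSourceWrapper<>(options, new TestCountingSource(numElements), numSplits);
    restored.initializeState(initializationContext);
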
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/package-info.java b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/package-info.java
deleted file mode 100644
index 08a1e03..0000000
--- a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Internal implementation of the Beam runner for Apache Flink.
- */
-package org.apache.beam.runners.flink.streaming;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/resources/log4j-test.properties
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/resources/log4j-test.properties b/runners/flink/runner/src/test/resources/log4j-test.properties
deleted file mode 100644
index 4c74d85..0000000
--- a/runners/flink/runner/src/test/resources/log4j-test.properties
+++ /dev/null
@@ -1,27 +0,0 @@
-################################################################################
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-################################################################################
-
-# Set root logger level to OFF to not flood build logs
-# set manually to INFO for debugging purposes
-log4j.rootLogger=OFF, testlogger
-
-# A1 is set to be a ConsoleAppender.
-log4j.appender.testlogger=org.apache.log4j.ConsoleAppender
-log4j.appender.testlogger.target = System.err
-log4j.appender.testlogger.layout=org.apache.log4j.PatternLayout
-log4j.appender.testlogger.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/DefaultParallelismFactory.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/DefaultParallelismFactory.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/DefaultParallelismFactory.java
new file mode 100644
index 0000000..b745f0b
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/DefaultParallelismFactory.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import org.apache.beam.sdk.options.DefaultValueFactory;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.flink.configuration.ConfigConstants;
+import org.apache.flink.configuration.GlobalConfiguration;
+
+/**
+ * {@link DefaultValueFactory} for getting a default value for the parallelism option
+ * on {@link FlinkPipelineOptions}.
+ *
+ * <p>This will return either the default value from {@link GlobalConfiguration} or {@code 1}.
+ * A valid {@link GlobalConfiguration} is only available if the program is executed by the Flink
+ * run scripts.
+ */
+public class DefaultParallelismFactory implements DefaultValueFactory<Integer> {
+ @Override
+ public Integer create(PipelineOptions options) {
+ return GlobalConfiguration.loadConfiguration()
+ .getInteger(ConfigConstants.DEFAULT_PARALLELISM_KEY, 1);
+ }
+}
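
A DefaultValueFactory like this is normally attached to a pipeline option through the SDK's @Default.InstanceFactory annotation. The declaration below is a sketch of how FlinkPipelineOptions might wire it up; the exact option interface is assumed here and is not part of this diff.

    // Sketch (assumed declaration): the factory supplies the parallelism default
    // when the user does not pass --parallelism on the command line.
    public interface FlinkPipelineOptions extends PipelineOptions {

      @Description("The degree of parallelism to be used when executing the job.")
      @Default.InstanceFactory(DefaultParallelismFactory.class)
      Integer getParallelism();

      void setParallelism(Integer value);
    }
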
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkBatchPipelineTranslator.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkBatchPipelineTranslator.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkBatchPipelineTranslator.java
new file mode 100644
index 0000000..854b674
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkBatchPipelineTranslator.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.runners.TransformHierarchy;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.flink.api.java.DataSet;
+import org.apache.flink.api.java.ExecutionEnvironment;
+import org.apache.flink.api.java.io.DiscardingOutputFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * {@link Pipeline.PipelineVisitor} for executing a {@link Pipeline} as a
+ * Flink batch job.
+ */
+class FlinkBatchPipelineTranslator extends FlinkPipelineTranslator {
+
+ private static final Logger LOG = LoggerFactory.getLogger(FlinkBatchPipelineTranslator.class);
+
+ /**
+ * The necessary context in the case of a batch job.
+ */
+ private final FlinkBatchTranslationContext batchContext;
+
+ private int depth = 0;
+
+ public FlinkBatchPipelineTranslator(ExecutionEnvironment env, PipelineOptions options) {
+ this.batchContext = new FlinkBatchTranslationContext(env, options);
+ }
+
+ @Override
+ @SuppressWarnings("rawtypes, unchecked")
+ public void translate(Pipeline pipeline) {
+ super.translate(pipeline);
+
+ // terminate dangling DataSets
+ for (DataSet<?> dataSet: batchContext.getDanglingDataSets().values()) {
+ dataSet.output(new DiscardingOutputFormat());
+ }
+ }
+
+ // --------------------------------------------------------------------------------------------
+ // Pipeline Visitor Methods
+ // --------------------------------------------------------------------------------------------
+
+ @Override
+ public CompositeBehavior enterCompositeTransform(TransformHierarchy.Node node) {
+ LOG.info("{} enterCompositeTransform- {}", genSpaces(this.depth), node.getFullName());
+ this.depth++;
+
+ BatchTransformTranslator<?> translator = getTranslator(node);
+
+ if (translator != null) {
+ applyBatchTransform(node.getTransform(), node, translator);
+ LOG.info("{} translated- {}", genSpaces(this.depth), node.getFullName());
+ return CompositeBehavior.DO_NOT_ENTER_TRANSFORM;
+ } else {
+ return CompositeBehavior.ENTER_TRANSFORM;
+ }
+ }
+
+ @Override
+ public void leaveCompositeTransform(TransformHierarchy.Node node) {
+ this.depth--;
+ LOG.info("{} leaveCompositeTransform- {}", genSpaces(this.depth), node.getFullName());
+ }
+
+ @Override
+ public void visitPrimitiveTransform(TransformHierarchy.Node node) {
+ LOG.info("{} visitPrimitiveTransform- {}", genSpaces(this.depth), node.getFullName());
+
+ // get the transformation corresponding to the node we are
+ // currently visiting and translate it into its Flink alternative.
+ PTransform<?, ?> transform = node.getTransform();
+ BatchTransformTranslator<?> translator =
+ FlinkBatchTransformTranslators.getTranslator(transform);
+ if (translator == null) {
+ LOG.info(node.getTransform().getClass().toString());
+ throw new UnsupportedOperationException("The transform " + transform
+ + " is currently not supported.");
+ }
+ applyBatchTransform(transform, node, translator);
+ }
+
+ private <T extends PTransform<?, ?>> void applyBatchTransform(
+ PTransform<?, ?> transform,
+ TransformHierarchy.Node node,
+ BatchTransformTranslator<?> translator) {
+
+ @SuppressWarnings("unchecked")
+ T typedTransform = (T) transform;
+
+ @SuppressWarnings("unchecked")
+ BatchTransformTranslator<T> typedTranslator = (BatchTransformTranslator<T>) translator;
+
+ // create the applied PTransform on the batchContext
+ batchContext.setCurrentTransform(node.toAppliedPTransform());
+ typedTranslator.translateNode(typedTransform, batchContext);
+ }
+
+ /**
+ * A translator of a {@link PTransform}.
+ */
+ public interface BatchTransformTranslator<TransformT extends PTransform> {
+ void translateNode(TransformT transform, FlinkBatchTranslationContext context);
+ }
+
+ /**
+ * Returns a translator for the given node if one exists, otherwise null.
+ */
+ private static BatchTransformTranslator<?> getTranslator(TransformHierarchy.Node node) {
+ PTransform<?, ?> transform = node.getTransform();
+
+ // Root of the graph is null
+ if (transform == null) {
+ return null;
+ }
+
+ return FlinkBatchTransformTranslators.getTranslator(transform);
+ }
+}
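
How this translator is driven is not shown in this file; the sketch below illustrates the assumed call sequence: build a Flink ExecutionEnvironment, run the visitor over the Beam pipeline, then execute the resulting plan.

    // Sketch of driving the batch translator (environment setup is assumed,
    // not part of this diff; in the runner this happens during job submission).
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    FlinkBatchPipelineTranslator translator = new FlinkBatchPipelineTranslator(env, options);

    // Walks the transform hierarchy, translating each node into DataSet operations
    // and terminating any dangling DataSets with a discarding sink.
    translator.translate(pipeline);

    // The ExecutionEnvironment now holds the translated batch plan.
    env.execute("beam-flink-batch-job");
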
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkBatchTransformTranslators.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkBatchTransformTranslators.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkBatchTransformTranslators.java
new file mode 100644
index 0000000..ff9521c
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkBatchTransformTranslators.java
@@ -0,0 +1,723 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkState;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import org.apache.beam.runners.flink.translation.functions.FlinkAssignWindows;
+import org.apache.beam.runners.flink.translation.functions.FlinkDoFnFunction;
+import org.apache.beam.runners.flink.translation.functions.FlinkMergingNonShuffleReduceFunction;
+import org.apache.beam.runners.flink.translation.functions.FlinkMergingPartialReduceFunction;
+import org.apache.beam.runners.flink.translation.functions.FlinkMergingReduceFunction;
+import org.apache.beam.runners.flink.translation.functions.FlinkMultiOutputPruningFunction;
+import org.apache.beam.runners.flink.translation.functions.FlinkPartialReduceFunction;
+import org.apache.beam.runners.flink.translation.functions.FlinkReduceFunction;
+import org.apache.beam.runners.flink.translation.functions.FlinkStatefulDoFnFunction;
+import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
+import org.apache.beam.runners.flink.translation.types.KvKeySelector;
+import org.apache.beam.runners.flink.translation.wrappers.SourceInputFormat;
+import org.apache.beam.sdk.coders.CannotProvideCoderException;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.CoderRegistry;
+import org.apache.beam.sdk.coders.KvCoder;
+import org.apache.beam.sdk.coders.ListCoder;
+import org.apache.beam.sdk.coders.VoidCoder;
+import org.apache.beam.sdk.io.BoundedSource;
+import org.apache.beam.sdk.io.Read;
+import org.apache.beam.sdk.transforms.Combine;
+import org.apache.beam.sdk.transforms.CombineFnBase;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.Flatten;
+import org.apache.beam.sdk.transforms.GroupByKey;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.View;
+import org.apache.beam.sdk.transforms.join.RawUnionValue;
+import org.apache.beam.sdk.transforms.join.UnionCoder;
+import org.apache.beam.sdk.transforms.reflect.DoFnSignature;
+import org.apache.beam.sdk.transforms.reflect.DoFnSignatures;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
+import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
+import org.apache.beam.sdk.transforms.windowing.Window;
+import org.apache.beam.sdk.transforms.windowing.WindowFn;
+import org.apache.beam.sdk.util.Reshuffle;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.beam.sdk.values.PValue;
+import org.apache.beam.sdk.values.TupleTag;
+import org.apache.flink.api.common.functions.FilterFunction;
+import org.apache.flink.api.common.functions.FlatMapFunction;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.java.DataSet;
+import org.apache.flink.api.java.operators.DataSource;
+import org.apache.flink.api.java.operators.FlatMapOperator;
+import org.apache.flink.api.java.operators.GroupCombineOperator;
+import org.apache.flink.api.java.operators.GroupReduceOperator;
+import org.apache.flink.api.java.operators.Grouping;
+import org.apache.flink.api.java.operators.MapPartitionOperator;
+import org.apache.flink.api.java.operators.SingleInputUdfOperator;
+import org.apache.flink.util.Collector;
+
+/**
+ * Translators for transforming {@link PTransform PTransforms} to
+ * Flink {@link DataSet DataSets}.
+ */
+class FlinkBatchTransformTranslators {
+
+ // --------------------------------------------------------------------------------------------
+ // Transform Translator Registry
+ // --------------------------------------------------------------------------------------------
+
+ @SuppressWarnings("rawtypes")
+ private static final Map<
+ Class<? extends PTransform>,
+ FlinkBatchPipelineTranslator.BatchTransformTranslator> TRANSLATORS = new HashMap<>();
+
+ static {
+ TRANSLATORS.put(View.CreatePCollectionView.class, new CreatePCollectionViewTranslatorBatch());
+
+ TRANSLATORS.put(Combine.PerKey.class, new CombinePerKeyTranslatorBatch());
+ TRANSLATORS.put(GroupByKey.class, new GroupByKeyTranslatorBatch());
+ TRANSLATORS.put(Reshuffle.class, new ReshuffleTranslatorBatch());
+
+ TRANSLATORS.put(Flatten.PCollections.class, new FlattenPCollectionTranslatorBatch());
+
+ TRANSLATORS.put(Window.Assign.class, new WindowAssignTranslatorBatch());
+
+ TRANSLATORS.put(ParDo.MultiOutput.class, new ParDoTranslatorBatch());
+
+ TRANSLATORS.put(Read.Bounded.class, new ReadSourceTranslatorBatch());
+ }
+
+
+ static FlinkBatchPipelineTranslator.BatchTransformTranslator<?> getTranslator(
+ PTransform<?, ?> transform) {
+ return TRANSLATORS.get(transform.getClass());
+ }
+
+ private static class ReadSourceTranslatorBatch<T>
+ implements FlinkBatchPipelineTranslator.BatchTransformTranslator<Read.Bounded<T>> {
+
+ @Override
+ public void translateNode(Read.Bounded<T> transform, FlinkBatchTranslationContext context) {
+ String name = transform.getName();
+ BoundedSource<T> source = transform.getSource();
+ PCollection<T> output = context.getOutput(transform);
+
+ TypeInformation<WindowedValue<T>> typeInformation = context.getTypeInfo(output);
+
+ DataSource<WindowedValue<T>> dataSource = new DataSource<>(
+ context.getExecutionEnvironment(),
+ new SourceInputFormat<>(source, context.getPipelineOptions()),
+ typeInformation,
+ name);
+
+ context.setOutputDataSet(output, dataSource);
+ }
+ }
+
+ private static class WindowAssignTranslatorBatch<T>
+ implements FlinkBatchPipelineTranslator.BatchTransformTranslator<Window.Assign<T>> {
+
+ @Override
+ public void translateNode(Window.Assign<T> transform, FlinkBatchTranslationContext context) {
+ PValue input = context.getInput(transform);
+
+ TypeInformation<WindowedValue<T>> resultTypeInfo =
+ context.getTypeInfo(context.getOutput(transform));
+
+ DataSet<WindowedValue<T>> inputDataSet = context.getInputDataSet(input);
+
+ @SuppressWarnings("unchecked")
+ final WindowingStrategy<T, ? extends BoundedWindow> windowingStrategy =
+ (WindowingStrategy<T, ? extends BoundedWindow>)
+ context.getOutput(transform).getWindowingStrategy();
+
+ WindowFn<T, ? extends BoundedWindow> windowFn = windowingStrategy.getWindowFn();
+
+ FlinkAssignWindows<T, ? extends BoundedWindow> assignWindowsFunction =
+ new FlinkAssignWindows<>(windowFn);
+
+ DataSet<WindowedValue<T>> resultDataSet = inputDataSet
+ .flatMap(assignWindowsFunction)
+ .name(context.getOutput(transform).getName())
+ .returns(resultTypeInfo);
+
+ context.setOutputDataSet(context.getOutput(transform), resultDataSet);
+ }
+ }
+
+ private static class GroupByKeyTranslatorBatch<K, InputT>
+ implements FlinkBatchPipelineTranslator.BatchTransformTranslator<GroupByKey<K, InputT>> {
+
+ @Override
+ public void translateNode(
+ GroupByKey<K, InputT> transform,
+ FlinkBatchTranslationContext context) {
+
+ // for now, this is copied from the Combine.PerKey translator. Once we have the new runner API
+ // we can replace GroupByKey by a Combine.PerKey with the Concatenate CombineFn
+
+ DataSet<WindowedValue<KV<K, InputT>>> inputDataSet =
+ context.getInputDataSet(context.getInput(transform));
+
+ Combine.KeyedCombineFn<K, InputT, List<InputT>, List<InputT>> combineFn =
+ new Concatenate<InputT>().asKeyedFn();
+
+ KvCoder<K, InputT> inputCoder =
+ (KvCoder<K, InputT>) context.getInput(transform).getCoder();
+
+ Coder<List<InputT>> accumulatorCoder;
+
+ try {
+ accumulatorCoder =
+ combineFn.getAccumulatorCoder(
+ context.getInput(transform).getPipeline().getCoderRegistry(),
+ inputCoder.getKeyCoder(),
+ inputCoder.getValueCoder());
+ } catch (CannotProvideCoderException e) {
+ throw new RuntimeException(e);
+ }
+
+ WindowingStrategy<?, ?> windowingStrategy =
+ context.getInput(transform).getWindowingStrategy();
+
+ TypeInformation<WindowedValue<KV<K, List<InputT>>>> partialReduceTypeInfo =
+ new CoderTypeInformation<>(
+ WindowedValue.getFullCoder(
+ KvCoder.of(inputCoder.getKeyCoder(), accumulatorCoder),
+ windowingStrategy.getWindowFn().windowCoder()));
+
+
+ Grouping<WindowedValue<KV<K, InputT>>> inputGrouping =
+ inputDataSet.groupBy(new KvKeySelector<InputT, K>(inputCoder.getKeyCoder()));
+
+ FlinkPartialReduceFunction<K, InputT, List<InputT>, ?> partialReduceFunction;
+ FlinkReduceFunction<K, List<InputT>, List<InputT>, ?> reduceFunction;
+
+ if (windowingStrategy.getWindowFn().isNonMerging()) {
+ @SuppressWarnings("unchecked")
+ WindowingStrategy<?, BoundedWindow> boundedStrategy =
+ (WindowingStrategy<?, BoundedWindow>) windowingStrategy;
+
+ partialReduceFunction = new FlinkPartialReduceFunction<>(
+ combineFn,
+ boundedStrategy,
+ Collections.<PCollectionView<?>, WindowingStrategy<?, ?>>emptyMap(),
+ context.getPipelineOptions());
+
+ reduceFunction = new FlinkReduceFunction<>(
+ combineFn,
+ boundedStrategy,
+ Collections.<PCollectionView<?>, WindowingStrategy<?, ?>>emptyMap(),
+ context.getPipelineOptions());
+
+ } else {
+ if (!windowingStrategy.getWindowFn().windowCoder().equals(IntervalWindow.getCoder())) {
+ throw new UnsupportedOperationException(
+ "Merging WindowFn with windows other than IntervalWindow are not supported.");
+ }
+
+ @SuppressWarnings("unchecked")
+ WindowingStrategy<?, IntervalWindow> intervalStrategy =
+ (WindowingStrategy<?, IntervalWindow>) windowingStrategy;
+
+ partialReduceFunction = new FlinkMergingPartialReduceFunction<>(
+ combineFn,
+ intervalStrategy,
+ Collections.<PCollectionView<?>, WindowingStrategy<?, ?>>emptyMap(),
+ context.getPipelineOptions());
+
+ reduceFunction = new FlinkMergingReduceFunction<>(
+ combineFn,
+ intervalStrategy,
+ Collections.<PCollectionView<?>, WindowingStrategy<?, ?>>emptyMap(),
+ context.getPipelineOptions());
+ }
+
+ // Partially GroupReduce the values into the intermediate format AccumT (combine)
+ GroupCombineOperator<
+ WindowedValue<KV<K, InputT>>,
+ WindowedValue<KV<K, List<InputT>>>> groupCombine =
+ new GroupCombineOperator<>(
+ inputGrouping,
+ partialReduceTypeInfo,
+ partialReduceFunction,
+ "GroupCombine: " + transform.getName());
+
+ Grouping<WindowedValue<KV<K, List<InputT>>>> intermediateGrouping =
+ groupCombine.groupBy(new KvKeySelector<List<InputT>, K>(inputCoder.getKeyCoder()));
+
+ // Fully reduce the values and create output format VO
+ GroupReduceOperator<
+ WindowedValue<KV<K, List<InputT>>>, WindowedValue<KV<K, List<InputT>>>> outputDataSet =
+ new GroupReduceOperator<>(
+ intermediateGrouping, partialReduceTypeInfo, reduceFunction, transform.getName());
+
+ context.setOutputDataSet(context.getOutput(transform), outputDataSet);
+
+ }
+
+ }
+
+ private static class ReshuffleTranslatorBatch<K, InputT>
+ implements FlinkBatchPipelineTranslator.BatchTransformTranslator<Reshuffle<K, InputT>> {
+
+ @Override
+ public void translateNode(
+ Reshuffle<K, InputT> transform,
+ FlinkBatchTranslationContext context) {
+
+ DataSet<WindowedValue<KV<K, InputT>>> inputDataSet =
+ context.getInputDataSet(context.getInput(transform));
+
+ context.setOutputDataSet(context.getOutput(transform), inputDataSet.rebalance());
+
+ }
+
+ }
+
+ /**
+ * Combiner that combines {@code T}s into a single {@code List<T>} containing all inputs.
+ *
+ * <p>For internal use to translate {@link GroupByKey}. For a large {@link PCollection} this
+ * is expected to crash!
+ *
+ * <p>This is copied from the dataflow runner code.
+ *
+ * @param <T> the type of elements to concatenate.
+ */
+ private static class Concatenate<T> extends Combine.CombineFn<T, List<T>, List<T>> {
+ @Override
+ public List<T> createAccumulator() {
+ return new ArrayList<>();
+ }
+
+ @Override
+ public List<T> addInput(List<T> accumulator, T input) {
+ accumulator.add(input);
+ return accumulator;
+ }
+
+ @Override
+ public List<T> mergeAccumulators(Iterable<List<T>> accumulators) {
+ List<T> result = createAccumulator();
+ for (List<T> accumulator : accumulators) {
+ result.addAll(accumulator);
+ }
+ return result;
+ }
+
+ @Override
+ public List<T> extractOutput(List<T> accumulator) {
+ return accumulator;
+ }
+
+ @Override
+ public Coder<List<T>> getAccumulatorCoder(CoderRegistry registry, Coder<T> inputCoder) {
+ return ListCoder.of(inputCoder);
+ }
+
+ @Override
+ public Coder<List<T>> getDefaultOutputCoder(CoderRegistry registry, Coder<T> inputCoder) {
+ return ListCoder.of(inputCoder);
+ }
+ }
+
+
+ private static class CombinePerKeyTranslatorBatch<K, InputT, AccumT, OutputT>
+ implements FlinkBatchPipelineTranslator.BatchTransformTranslator<
+ Combine.PerKey<K, InputT, OutputT>> {
+
+ @Override
+ @SuppressWarnings("unchecked")
+ public void translateNode(
+ Combine.PerKey<K, InputT, OutputT> transform,
+ FlinkBatchTranslationContext context) {
+ DataSet<WindowedValue<KV<K, InputT>>> inputDataSet =
+ context.getInputDataSet(context.getInput(transform));
+
+ CombineFnBase.PerKeyCombineFn<K, InputT, AccumT, OutputT> combineFn =
+ (CombineFnBase.PerKeyCombineFn<K, InputT, AccumT, OutputT>) transform.getFn();
+
+ KvCoder<K, InputT> inputCoder =
+ (KvCoder<K, InputT>) context.getInput(transform).getCoder();
+
+ Coder<AccumT> accumulatorCoder;
+
+ try {
+ accumulatorCoder =
+ combineFn.getAccumulatorCoder(
+ context.getInput(transform).getPipeline().getCoderRegistry(),
+ inputCoder.getKeyCoder(),
+ inputCoder.getValueCoder());
+ } catch (CannotProvideCoderException e) {
+ throw new RuntimeException(e);
+ }
+
+ WindowingStrategy<?, ?> windowingStrategy =
+ context.getInput(transform).getWindowingStrategy();
+
+ TypeInformation<WindowedValue<KV<K, AccumT>>> partialReduceTypeInfo =
+ context.getTypeInfo(
+ KvCoder.of(inputCoder.getKeyCoder(), accumulatorCoder),
+ windowingStrategy);
+
+ Grouping<WindowedValue<KV<K, InputT>>> inputGrouping =
+ inputDataSet.groupBy(new KvKeySelector<InputT, K>(inputCoder.getKeyCoder()));
+
+ // construct a map from side input to WindowingStrategy so that
+ // the DoFn runner can map main-input windows to side input windows
+ Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
+ for (PCollectionView<?> sideInput: transform.getSideInputs()) {
+ sideInputStrategies.put(sideInput, sideInput.getWindowingStrategyInternal());
+ }
+
+ if (windowingStrategy.getWindowFn().isNonMerging()) {
+ WindowingStrategy<?, BoundedWindow> boundedStrategy =
+ (WindowingStrategy<?, BoundedWindow>) windowingStrategy;
+
+ FlinkPartialReduceFunction<K, InputT, AccumT, ?> partialReduceFunction =
+ new FlinkPartialReduceFunction<>(
+ combineFn,
+ boundedStrategy,
+ sideInputStrategies,
+ context.getPipelineOptions());
+
+ FlinkReduceFunction<K, AccumT, OutputT, ?> reduceFunction =
+ new FlinkReduceFunction<>(
+ combineFn,
+ boundedStrategy,
+ sideInputStrategies,
+ context.getPipelineOptions());
+
+ // Partially GroupReduce the values into the intermediate format AccumT (combine)
+ GroupCombineOperator<
+ WindowedValue<KV<K, InputT>>,
+ WindowedValue<KV<K, AccumT>>> groupCombine =
+ new GroupCombineOperator<>(
+ inputGrouping,
+ partialReduceTypeInfo,
+ partialReduceFunction,
+ "GroupCombine: " + transform.getName());
+
+ transformSideInputs(transform.getSideInputs(), groupCombine, context);
+
+ TypeInformation<WindowedValue<KV<K, OutputT>>> reduceTypeInfo =
+ context.getTypeInfo(context.getOutput(transform));
+
+ Grouping<WindowedValue<KV<K, AccumT>>> intermediateGrouping =
+ groupCombine.groupBy(new KvKeySelector<AccumT, K>(inputCoder.getKeyCoder()));
+
+ // Fully reduce the values and create output format OutputT
+ GroupReduceOperator<
+ WindowedValue<KV<K, AccumT>>, WindowedValue<KV<K, OutputT>>> outputDataSet =
+ new GroupReduceOperator<>(
+ intermediateGrouping, reduceTypeInfo, reduceFunction, transform.getName());
+
+ transformSideInputs(transform.getSideInputs(), outputDataSet, context);
+
+ context.setOutputDataSet(context.getOutput(transform), outputDataSet);
+
+ } else {
+ if (!windowingStrategy.getWindowFn().windowCoder().equals(IntervalWindow.getCoder())) {
+ throw new UnsupportedOperationException(
+ "Merging WindowFn with windows other than IntervalWindow are not supported.");
+ }
+
+ // for merging windows we can't do a pre-shuffle combine step since
+ // elements would not be in their correct windows for side-input access
+
+ WindowingStrategy<?, IntervalWindow> intervalStrategy =
+ (WindowingStrategy<?, IntervalWindow>) windowingStrategy;
+
+ FlinkMergingNonShuffleReduceFunction<K, InputT, AccumT, OutputT, ?> reduceFunction =
+ new FlinkMergingNonShuffleReduceFunction<>(
+ combineFn,
+ intervalStrategy,
+ sideInputStrategies,
+ context.getPipelineOptions());
+
+ TypeInformation<WindowedValue<KV<K, OutputT>>> reduceTypeInfo =
+ context.getTypeInfo(context.getOutput(transform));
+
+ Grouping<WindowedValue<KV<K, InputT>>> grouping =
+ inputDataSet.groupBy(new KvKeySelector<InputT, K>(inputCoder.getKeyCoder()));
+
+ // Fully reduce the values and create output format OutputT
+ GroupReduceOperator<
+ WindowedValue<KV<K, InputT>>, WindowedValue<KV<K, OutputT>>> outputDataSet =
+ new GroupReduceOperator<>(
+ grouping, reduceTypeInfo, reduceFunction, transform.getName());
+
+ transformSideInputs(transform.getSideInputs(), outputDataSet, context);
+
+ context.setOutputDataSet(context.getOutput(transform), outputDataSet);
+ }
+
+
+ }
+ }
+
+ private static void rejectSplittable(DoFn<?, ?> doFn) {
+ DoFnSignature signature = DoFnSignatures.getSignature(doFn.getClass());
+ if (signature.processElement().isSplittable()) {
+ throw new UnsupportedOperationException(
+ String.format(
+ "%s does not currently support splittable DoFn: %s",
+ FlinkRunner.class.getSimpleName(), doFn));
+ }
+ }
+
+ private static class ParDoTranslatorBatch<InputT, OutputT>
+ implements FlinkBatchPipelineTranslator.BatchTransformTranslator<
+ ParDo.MultiOutput<InputT, OutputT>> {
+
+ @Override
+ @SuppressWarnings("unchecked")
+ public void translateNode(
+ ParDo.MultiOutput<InputT, OutputT> transform,
+ FlinkBatchTranslationContext context) {
+ DoFn<InputT, OutputT> doFn = transform.getFn();
+ rejectSplittable(doFn);
+ DataSet<WindowedValue<InputT>> inputDataSet =
+ context.getInputDataSet(context.getInput(transform));
+
+ Map<TupleTag<?>, PValue> outputs = context.getOutputs(transform);
+
+ Map<TupleTag<?>, Integer> outputMap = Maps.newHashMap();
+ // put the main output at index 0, FlinkMultiOutputDoFnFunction expects this
+ outputMap.put(transform.getMainOutputTag(), 0);
+ int count = 1;
+ for (TupleTag<?> tag : outputs.keySet()) {
+ if (!outputMap.containsKey(tag)) {
+ outputMap.put(tag, count++);
+ }
+ }
+
+ // assume that the windowing strategy is the same for all outputs
+ WindowingStrategy<?, ?> windowingStrategy = null;
+
+ // collect all output Coders and create a UnionCoder for our tagged outputs
+ List<Coder<?>> outputCoders = Lists.newArrayList();
+ for (PValue taggedValue : outputs.values()) {
+ checkState(
+ taggedValue instanceof PCollection,
+ "Within ParDo, got a non-PCollection output %s of type %s",
+ taggedValue,
+ taggedValue.getClass().getSimpleName());
+ PCollection<?> coll = (PCollection<?>) taggedValue;
+ outputCoders.add(coll.getCoder());
+ windowingStrategy = coll.getWindowingStrategy();
+ }
+
+ if (windowingStrategy == null) {
+ throw new IllegalStateException("No outputs defined.");
+ }
+
+ UnionCoder unionCoder = UnionCoder.of(outputCoders);
+
+ TypeInformation<WindowedValue<RawUnionValue>> typeInformation =
+ new CoderTypeInformation<>(
+ WindowedValue.getFullCoder(
+ unionCoder,
+ windowingStrategy.getWindowFn().windowCoder()));
+
+ List<PCollectionView<?>> sideInputs = transform.getSideInputs();
+
+ // construct a map from side input to WindowingStrategy so that
+ // the DoFn runner can map main-input windows to side input windows
+ Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
+ for (PCollectionView<?> sideInput: sideInputs) {
+ sideInputStrategies.put(sideInput, sideInput.getWindowingStrategyInternal());
+ }
+
+ SingleInputUdfOperator<WindowedValue<InputT>, WindowedValue<RawUnionValue>, ?> outputDataSet;
+ DoFnSignature signature = DoFnSignatures.getSignature(transform.getFn().getClass());
+ if (signature.stateDeclarations().size() > 0
+ || signature.timerDeclarations().size() > 0) {
+
+ // Based on the fact that the signature is stateful, DoFnSignatures ensures
+ // that it is also keyed
+ KvCoder<?, InputT> inputCoder =
+ (KvCoder<?, InputT>) context.getInput(transform).getCoder();
+
+ FlinkStatefulDoFnFunction<?, ?, OutputT> doFnWrapper = new FlinkStatefulDoFnFunction<>(
+ (DoFn) doFn, windowingStrategy, sideInputStrategies, context.getPipelineOptions(),
+ outputMap, transform.getMainOutputTag()
+ );
+
+ Grouping<WindowedValue<InputT>> grouping =
+ inputDataSet.groupBy(new KvKeySelector(inputCoder.getKeyCoder()));
+
+ outputDataSet =
+ new GroupReduceOperator(grouping, typeInformation, doFnWrapper, transform.getName());
+
+ } else {
+ FlinkDoFnFunction<InputT, RawUnionValue> doFnWrapper =
+ new FlinkDoFnFunction(
+ doFn,
+ windowingStrategy,
+ sideInputStrategies,
+ context.getPipelineOptions(),
+ outputMap,
+ transform.getMainOutputTag());
+
+ outputDataSet = new MapPartitionOperator<>(
+ inputDataSet, typeInformation,
+ doFnWrapper, transform.getName());
+
+ }
+
+ transformSideInputs(sideInputs, outputDataSet, context);
+
+ for (Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
+ pruneOutput(
+ outputDataSet,
+ context,
+ outputMap.get(output.getKey()),
+ (PCollection) output.getValue());
+ }
+
+ }
+
+ private <T> void pruneOutput(
+ DataSet<WindowedValue<RawUnionValue>> taggedDataSet,
+ FlinkBatchTranslationContext context,
+ int integerTag,
+ PCollection<T> collection) {
+ TypeInformation<WindowedValue<T>> outputType = context.getTypeInfo(collection);
+
+ FlinkMultiOutputPruningFunction<T> pruningFunction =
+ new FlinkMultiOutputPruningFunction<>(integerTag);
+
+ FlatMapOperator<WindowedValue<RawUnionValue>, WindowedValue<T>> pruningOperator =
+ new FlatMapOperator<>(
+ taggedDataSet,
+ outputType,
+ pruningFunction,
+ collection.getName());
+
+ context.setOutputDataSet(collection, pruningOperator);
+ }
+ }
+
+ private static class FlattenPCollectionTranslatorBatch<T>
+ implements FlinkBatchPipelineTranslator.BatchTransformTranslator<
+ Flatten.PCollections<T>> {
+
+ @Override
+ @SuppressWarnings("unchecked")
+ public void translateNode(
+ Flatten.PCollections<T> transform,
+ FlinkBatchTranslationContext context) {
+
+ Map<TupleTag<?>, PValue> allInputs = context.getInputs(transform);
+ DataSet<WindowedValue<T>> result = null;
+
+ if (allInputs.isEmpty()) {
+
+ // create an empty dummy source to satisfy downstream operations
+ // we cannot create an empty source in Flink, therefore we have to
+ // add the flatMap that simply never forwards the single element
+ DataSource<String> dummySource =
+ context.getExecutionEnvironment().fromElements("dummy");
+ result = dummySource.flatMap(new FlatMapFunction<String, WindowedValue<T>>() {
+ @Override
+ public void flatMap(String s, Collector<WindowedValue<T>> collector) throws Exception {
+ // never return anything
+ }
+ }).returns(
+ new CoderTypeInformation<>(
+ WindowedValue.getFullCoder(
+ (Coder<T>) VoidCoder.of(),
+ GlobalWindow.Coder.INSTANCE)));
+ } else {
+ for (PValue taggedPc : allInputs.values()) {
+ checkArgument(
+ taggedPc instanceof PCollection,
+ "Got non-PCollection input to flatten: %s of type %s",
+ taggedPc,
+ taggedPc.getClass().getSimpleName());
+ PCollection<T> collection = (PCollection<T>) taggedPc;
+ DataSet<WindowedValue<T>> current = context.getInputDataSet(collection);
+ if (result == null) {
+ result = current;
+ } else {
+ result = result.union(current);
+ }
+ }
+ }
+
+ // insert a dummy filter, there seems to be a bug in Flink
+ // that produces duplicate elements after the union in some cases
+ // if we don't
+ result = result.filter(new FilterFunction<WindowedValue<T>>() {
+ @Override
+ public boolean filter(WindowedValue<T> tWindowedValue) throws Exception {
+ return true;
+ }
+ }).name("UnionFixFilter");
+ context.setOutputDataSet(context.getOutput(transform), result);
+ }
+ }
+
+ private static class CreatePCollectionViewTranslatorBatch<ElemT, ViewT>
+ implements FlinkBatchPipelineTranslator.BatchTransformTranslator<
+ View.CreatePCollectionView<ElemT, ViewT>> {
+
+ @Override
+ public void translateNode(
+ View.CreatePCollectionView<ElemT, ViewT> transform,
+ FlinkBatchTranslationContext context) {
+ DataSet<WindowedValue<ElemT>> inputDataSet =
+ context.getInputDataSet(context.getInput(transform));
+
+ PCollectionView<ViewT> input = transform.getView();
+
+ context.setSideInputDataSet(input, inputDataSet);
+ }
+ }
+
+ private static void transformSideInputs(
+ List<PCollectionView<?>> sideInputs,
+ SingleInputUdfOperator<?, ?, ?> outputDataSet,
+ FlinkBatchTranslationContext context) {
+ // get corresponding Flink broadcast DataSets
+ for (PCollectionView<?> input : sideInputs) {
+ DataSet<?> broadcastSet = context.getSideInputDataSet(input);
+ outputDataSet.withBroadcastSet(broadcastSet, input.getTagInternal().getId());
+ }
+ }
+
+ private FlinkBatchTransformTranslators() {}
+
+}
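
Each translator above follows the same shape: fetch the input DataSet from the context, build the equivalent Flink operation, and register the result for downstream transforms. The sketch below shows that shape for a hypothetical MyTransform; both the transform and its registration line are illustrative assumptions, not part of this diff.

    // Hypothetical translator, same structure as the built-in ones above.
    private static class MyTransformTranslatorBatch<T>
        implements FlinkBatchPipelineTranslator.BatchTransformTranslator<MyTransform<T>> {

      @Override
      public void translateNode(MyTransform<T> transform, FlinkBatchTranslationContext context) {
        // Look up the Flink DataSet produced for the input PCollection.
        DataSet<WindowedValue<T>> input = context.getInputDataSet(context.getInput(transform));

        // Apply the equivalent Flink operation; an identity map stands in for real logic.
        DataSet<WindowedValue<T>> result = input
            .map(new MapFunction<WindowedValue<T>, WindowedValue<T>>() {
              @Override
              public WindowedValue<T> map(WindowedValue<T> value) {
                return value;
              }
            })
            .returns(context.getTypeInfo(context.getOutput(transform)))
            .name(transform.getName());

        // Register the result so downstream translators can find it.
        context.setOutputDataSet(context.getOutput(transform), result);
      }
    }

    // Registered alongside the built-in translators in the static block:
    //   TRANSLATORS.put(MyTransform.class, new MyTransformTranslatorBatch());
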
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkBatchTranslationContext.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkBatchTranslationContext.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkBatchTranslationContext.java
new file mode 100644
index 0000000..98dd0fb
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkBatchTranslationContext.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import com.google.common.collect.Iterables;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.transforms.AppliedPTransform;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.beam.sdk.values.PValue;
+import org.apache.beam.sdk.values.TupleTag;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.java.DataSet;
+import org.apache.flink.api.java.ExecutionEnvironment;
+
+/**
+ * Helper for {@link FlinkBatchPipelineTranslator} and translators in
+ * {@link FlinkBatchTransformTranslators}.
+ */
+class FlinkBatchTranslationContext {
+
+ private final Map<PValue, DataSet<?>> dataSets;
+ private final Map<PCollectionView<?>, DataSet<?>> broadcastDataSets;
+
+ /**
+ * For keeping track of which DataSets don't have a successor. We
+ * need to terminate these with a discarding sink because the Beam
+ * model allows dangling operations.
+ */
+ private final Map<PValue, DataSet<?>> danglingDataSets;
+
+ private final ExecutionEnvironment env;
+ private final PipelineOptions options;
+
+ private AppliedPTransform<?, ?, ?> currentTransform;
+
+ // ------------------------------------------------------------------------
+
+ public FlinkBatchTranslationContext(ExecutionEnvironment env, PipelineOptions options) {
+ this.env = env;
+ this.options = options;
+ this.dataSets = new HashMap<>();
+ this.broadcastDataSets = new HashMap<>();
+
+ this.danglingDataSets = new HashMap<>();
+ }
+
+ // ------------------------------------------------------------------------
+
+ public Map<PValue, DataSet<?>> getDanglingDataSets() {
+ return danglingDataSets;
+ }
+
+ public ExecutionEnvironment getExecutionEnvironment() {
+ return env;
+ }
+
+ public PipelineOptions getPipelineOptions() {
+ return options;
+ }
+
+ @SuppressWarnings("unchecked")
+ public <T> DataSet<WindowedValue<T>> getInputDataSet(PValue value) {
+ // assume that the DataSet is used as an input if retrieved here
+ danglingDataSets.remove(value);
+ return (DataSet<WindowedValue<T>>) dataSets.get(value);
+ }
+
+ public <T> void setOutputDataSet(PValue value, DataSet<WindowedValue<T>> set) {
+ if (!dataSets.containsKey(value)) {
+ dataSets.put(value, set);
+ danglingDataSets.put(value, set);
+ }
+ }
+
+ /**
+ * Sets the AppliedPTransform which carries input/output.
+ * @param currentTransform the transform currently being translated
+ */
+ public void setCurrentTransform(AppliedPTransform<?, ?, ?> currentTransform) {
+ this.currentTransform = currentTransform;
+ }
+
+ @SuppressWarnings("unchecked")
+ public <T> DataSet<T> getSideInputDataSet(PCollectionView<?> value) {
+ return (DataSet<T>) broadcastDataSets.get(value);
+ }
+
+ public <ViewT, ElemT> void setSideInputDataSet(
+ PCollectionView<ViewT> value,
+ DataSet<WindowedValue<ElemT>> set) {
+ if (!broadcastDataSets.containsKey(value)) {
+ broadcastDataSets.put(value, set);
+ }
+ }
+
+ @SuppressWarnings("unchecked")
+ public <T> TypeInformation<WindowedValue<T>> getTypeInfo(PCollection<T> collection) {
+ return getTypeInfo(collection.getCoder(), collection.getWindowingStrategy());
+ }
+
+ @SuppressWarnings("unchecked")
+ public <T> TypeInformation<WindowedValue<T>> getTypeInfo(
+ Coder<T> coder,
+ WindowingStrategy<?, ?> windowingStrategy) {
+ WindowedValue.FullWindowedValueCoder<T> windowedValueCoder =
+ WindowedValue.getFullCoder(
+ coder,
+ windowingStrategy.getWindowFn().windowCoder());
+
+ return new CoderTypeInformation<>(windowedValueCoder);
+ }
+
+ Map<TupleTag<?>, PValue> getInputs(PTransform<?, ?> transform) {
+ return currentTransform.getInputs();
+ }
+
+ @SuppressWarnings("unchecked")
+ <T extends PValue> T getInput(PTransform<T, ?> transform) {
+ return (T) Iterables.getOnlyElement(currentTransform.getInputs().values());
+ }
+
+ Map<TupleTag<?>, PValue> getOutputs(PTransform<?, ?> transform) {
+ return currentTransform.getOutputs();
+ }
+
+ @SuppressWarnings("unchecked")
+ <T extends PValue> T getOutput(PTransform<?, T> transform) {
+ return (T) Iterables.getOnlyElement(currentTransform.getOutputs().values());
+ }
+}
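
A minimal, hypothetical sketch of how a batch transform translator might drive this
context: look up the input DataSet, apply a Flink operation, and register the result
for downstream translators. The translateIdentity method and the identity MapFunction
are illustrative only; the context methods used are the ones defined above, and
org.apache.flink.api.common.functions.MapFunction is assumed to be imported.

static <T> void translateIdentity(
    PTransform<PCollection<T>, PCollection<T>> transform,
    FlinkBatchTranslationContext context) {
  DataSet<WindowedValue<T>> inputDataSet =
      context.getInputDataSet(context.getInput(transform));
  TypeInformation<WindowedValue<T>> outputTypeInfo =
      context.getTypeInfo(context.getOutput(transform));

  DataSet<WindowedValue<T>> outputDataSet =
      inputDataSet
          .map(new MapFunction<WindowedValue<T>, WindowedValue<T>>() {
            @Override
            public WindowedValue<T> map(WindowedValue<T> value) {
              // A real translator would run the user's function here;
              // this sketch simply forwards each element.
              return value;
            }
          })
          .returns(outputTypeInfo)
          .name(transform.getName());

  // If no downstream translator ever calls getInputDataSet on this value, it
  // stays in danglingDataSets and is terminated with a discarding sink.
  context.setOutputDataSet(context.getOutput(transform), outputDataSet);
}
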
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkDetachedRunnerResult.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkDetachedRunnerResult.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkDetachedRunnerResult.java
new file mode 100644
index 0000000..bf4395f
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkDetachedRunnerResult.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import java.io.IOException;
+
+import org.apache.beam.sdk.AggregatorRetrievalException;
+import org.apache.beam.sdk.AggregatorValues;
+import org.apache.beam.sdk.PipelineResult;
+import org.apache.beam.sdk.metrics.MetricResults;
+import org.apache.beam.sdk.transforms.Aggregator;
+import org.joda.time.Duration;
+
+
+/**
+ * Result of a detached execution of a {@link org.apache.beam.sdk.Pipeline} with Flink.
+ * In detached execution, results and job execution are currently unavailable.
+ */
+public class FlinkDetachedRunnerResult implements PipelineResult {
+
+ FlinkDetachedRunnerResult() {}
+
+ @Override
+ public State getState() {
+ return State.UNKNOWN;
+ }
+
+ @Override
+ public <T> AggregatorValues<T> getAggregatorValues(final Aggregator<?, T> aggregator)
+ throws AggregatorRetrievalException {
+ throw new AggregatorRetrievalException(
+ "Accumulators can't be retrieved for detached Job executions.",
+ new UnsupportedOperationException());
+ }
+
+ @Override
+ public MetricResults metrics() {
+ throw new UnsupportedOperationException("The FlinkRunner does not currently support metrics.");
+ }
+
+ @Override
+ public State cancel() throws IOException {
+ throw new UnsupportedOperationException("Cancelling is not yet supported.");
+ }
+
+ @Override
+ public State waitUntilFinish() {
+ return State.UNKNOWN;
+ }
+
+ @Override
+ public State waitUntilFinish(Duration duration) {
+ return State.UNKNOWN;
+ }
+
+ @Override
+ public String toString() {
+ return "FlinkDetachedRunnerResult{}";
+ }
+}
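
A small caller-side sketch (hypothetical, not from this commit): because a detached
result reports UNKNOWN state and rejects metrics and cancellation, submission code
should avoid blocking on it. The pipeline variable is assumed to be a constructed
Beam Pipeline.

PipelineResult result = pipeline.run();
if (result instanceof FlinkDetachedRunnerResult) {
  // Nothing to wait for: getState() stays UNKNOWN, and metrics()/cancel() throw.
  System.out.println("Submitted detached Flink job: " + result);
} else {
  result.waitUntilFinish();
}
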
[09/50] [abbrv] beam git commit: This closes #2556
Posted by dh...@apache.org.
This closes #2556
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/e0df7d85
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/e0df7d85
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/e0df7d85
Branch: refs/heads/DSL_SQL
Commit: e0df7d85e80eac71f875663512bc293a0529460f
Parents: a9bcc8b 6ac3ac5
Author: Eugene Kirpichov <ki...@google.com>
Authored: Tue Apr 18 18:02:25 2017 -0700
Committer: Eugene Kirpichov <ki...@google.com>
Committed: Tue Apr 18 18:02:25 2017 -0700
----------------------------------------------------------------------
.../operators/ApexParDoOperator.java | 3 +-
.../apache/beam/runners/core/DoFnRunners.java | 32 +++
.../beam/runners/core/ProcessFnRunner.java | 127 +++++++++
.../core/PushbackSideInputDoFnRunner.java | 106 +------
.../core/SimplePushbackSideInputDoFnRunner.java | 115 ++++++++
.../beam/runners/core/SplittableParDo.java | 110 +++++---
.../core/PushbackSideInputDoFnRunnerTest.java | 282 -------------------
.../SimplePushbackSideInputDoFnRunnerTest.java | 282 +++++++++++++++++++
.../beam/runners/core/SplittableParDoTest.java | 90 +++---
...ecycleManagerRemovingTransformEvaluator.java | 6 +-
.../beam/runners/direct/ParDoEvaluator.java | 127 ++++++---
.../runners/direct/ParDoEvaluatorFactory.java | 13 +-
...littableProcessElementsEvaluatorFactory.java | 106 +++++--
.../direct/StatefulParDoEvaluatorFactory.java | 4 +-
.../direct/TransformEvaluatorRegistry.java | 4 +-
...leManagerRemovingTransformEvaluatorTest.java | 8 +-
.../beam/runners/direct/ParDoEvaluatorTest.java | 7 +-
runners/flink/runner/pom.xml | 3 +-
.../wrappers/streaming/DoFnOperator.java | 12 +-
.../streaming/SplittableDoFnOperator.java | 2 +-
.../wrappers/streaming/WindowDoFnOperator.java | 2 +-
...esSplittableParDoWithWindowedSideInputs.java | 26 ++
.../beam/sdk/transforms/SplittableDoFnTest.java | 104 +++++--
23 files changed, 993 insertions(+), 578 deletions(-)
----------------------------------------------------------------------
[15/50] [abbrv] beam git commit: Separate streaming writes into two
pluggable components - CreateTables,
and StreamingWriteTables. Also address many code review comments. Also merge
with master.
Posted by dh...@apache.org.
Separate streaming writes into two pluggable components - CreateTables, and StreamingWriteTables.
Also address many code review comments.
Also merge with master.
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/7d13061c
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/7d13061c
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/7d13061c
Branch: refs/heads/DSL_SQL
Commit: 7d13061cc36466c502bbc1f61d391743dd3739af
Parents: b486137
Author: Reuven Lax <re...@google.com>
Authored: Sun Apr 2 21:39:50 2017 -0700
Committer: Eugene Kirpichov <ki...@google.com>
Committed: Tue Apr 18 21:12:50 2017 -0700
----------------------------------------------------------------------
.../beam/sdk/io/gcp/bigquery/BatchLoads.java | 176 ++++++++++---------
.../sdk/io/gcp/bigquery/BigQueryHelpers.java | 13 ++
.../beam/sdk/io/gcp/bigquery/BigQueryIO.java | 21 ++-
.../io/gcp/bigquery/BigQueryTableSource.java | 4 +-
.../beam/sdk/io/gcp/bigquery/CreateTables.java | 95 ++++++----
.../io/gcp/bigquery/GenerateShardedTable.java | 3 +-
.../beam/sdk/io/gcp/bigquery/PrepareWrite.java | 80 +++++----
.../beam/sdk/io/gcp/bigquery/ShardedKey.java | 1 +
.../sdk/io/gcp/bigquery/StreamingInserts.java | 44 +----
.../io/gcp/bigquery/StreamingWriteTables.java | 86 +++++++++
.../sdk/io/gcp/bigquery/TableDestination.java | 1 +
.../io/gcp/bigquery/TableDestinationCoder.java | 62 +++----
.../sdk/io/gcp/bigquery/TableRowWriter.java | 14 +-
.../sdk/io/gcp/bigquery/TagWithUniqueIds.java | 14 +-
.../io/gcp/bigquery/WriteBundlesToFiles.java | 25 +--
.../sdk/io/gcp/bigquery/WritePartition.java | 127 ++++++++-----
.../beam/sdk/io/gcp/bigquery/WriteRename.java | 5 +-
.../beam/sdk/io/gcp/bigquery/WriteTables.java | 17 +-
.../sdk/io/gcp/bigquery/BigQueryIOTest.java | 66 ++++---
19 files changed, 516 insertions(+), 338 deletions(-)
----------------------------------------------------------------------
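
For orientation before the diffs below, the split described in the commit message
composes roughly as follows. This is a sketch using the transforms introduced in
this commit; the method name, create disposition, and schema function are
placeholders, and coder wiring is omitted for brevity.

static WriteResult streamingWrite(
    PCollection<KV<TableDestination, TableRow>> rows,
    SerializableFunction<TableDestination, TableSchema> schemaFunction) {
  return rows
      // Stage 1: create any missing destination tables (side effect only; the
      // rows pass through unchanged).
      .apply("CreateTables",
          new CreateTables(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED,
              schemaFunction))
      // Stage 2: shard, tag with unique ids, reshuffle, and stream the rows
      // using BigQuery's best-effort de-dup mechanism.
      .apply("StreamingWriteTables", new StreamingWriteTables());
}
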
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java
index 06fdfce..236b234 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java
@@ -18,7 +18,6 @@
package org.apache.beam.sdk.io.gcp.bigquery;
-import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import java.io.IOException;
@@ -35,7 +34,6 @@ import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.options.BigQueryOptions;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.transforms.Create;
-import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
@@ -47,6 +45,7 @@ import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.util.IOChannelFactory;
import org.apache.beam.sdk.util.IOChannelUtils;
+import org.apache.beam.sdk.util.Reshuffle;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
@@ -54,17 +53,13 @@ import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;
-
-/**
- * PTransform that uses BigQuery batch-load jobs to write a PCollection to BigQuery.
- */
+/** PTransform that uses BigQuery batch-load jobs to write a PCollection to BigQuery. */
class BatchLoads extends PTransform<PCollection<KV<TableDestination, TableRow>>, WriteResult> {
BigQueryIO.Write<?> write;
- private static class ConstantSchemaFunction implements
- SerializableFunction<TableDestination, TableSchema> {
- private final @Nullable
- ValueProvider<String> jsonSchema;
+ private static class ConstantSchemaFunction
+ implements SerializableFunction<TableDestination, TableSchema> {
+ private final @Nullable ValueProvider<String> jsonSchema;
ConstantSchemaFunction(ValueProvider<String> jsonSchema) {
this.jsonSchema = jsonSchema;
@@ -86,7 +81,6 @@ class BatchLoads extends PTransform<PCollection<KV<TableDestination, TableRow>>,
public WriteResult expand(PCollection<KV<TableDestination, TableRow>> input) {
Pipeline p = input.getPipeline();
BigQueryOptions options = p.getOptions().as(BigQueryOptions.class);
- ValueProvider<TableReference> table = write.getTableWithDefaultProject(options);
final String stepUuid = BigQueryHelpers.randomUUIDString();
@@ -94,40 +88,41 @@ class BatchLoads extends PTransform<PCollection<KV<TableDestination, TableRow>>,
String tempFilePrefix;
try {
IOChannelFactory factory = IOChannelUtils.getFactory(tempLocation);
- tempFilePrefix = factory.resolve(
- factory.resolve(tempLocation, "BigQueryWriteTemp"),
- stepUuid);
+ tempFilePrefix =
+ factory.resolve(factory.resolve(tempLocation, "BigQueryWriteTemp"), stepUuid);
} catch (IOException e) {
throw new RuntimeException(
- String.format("Failed to resolve BigQuery temp location in %s", tempLocation),
- e);
+ String.format("Failed to resolve BigQuery temp location in %s", tempLocation), e);
}
// Create a singleton job ID token at execution time. This will be used as the base for all
// load jobs issued from this instance of the transform.
PCollection<String> singleton = p.apply("Create", Create.of(tempFilePrefix));
- PCollectionView<String> jobIdTokenView = p
- .apply("TriggerIdCreation", Create.of("ignored"))
- .apply("CreateJobId", MapElements.via(
- new SimpleFunction<String, String>() {
- @Override
- public String apply(String input) {
- return stepUuid;
- }
- }))
- .apply(View.<String>asSingleton());
+ PCollectionView<String> jobIdTokenView =
+ p.apply("TriggerIdCreation", Create.of("ignored"))
+ .apply(
+ "CreateJobId",
+ MapElements.via(
+ new SimpleFunction<String, String>() {
+ @Override
+ public String apply(String input) {
+ return stepUuid;
+ }
+ }))
+ .apply(View.<String>asSingleton());
PCollection<KV<TableDestination, TableRow>> inputInGlobalWindow =
- input.apply("rewindowIntoGlobal",
+ input.apply(
+ "rewindowIntoGlobal",
Window.<KV<TableDestination, TableRow>>into(new GlobalWindows())
.triggering(DefaultTrigger.of())
.discardingFiredPanes());
// PCollection of filename, file byte size, and table destination.
- PCollection<WriteBundlesToFiles.Result> results = inputInGlobalWindow
- .apply("WriteBundlesToFiles",
- ParDo.of(new WriteBundlesToFiles(tempFilePrefix)))
- .setCoder(WriteBundlesToFiles.ResultCoder.of());
+ PCollection<WriteBundlesToFiles.Result> results =
+ inputInGlobalWindow
+ .apply("WriteBundlesToFiles", ParDo.of(new WriteBundlesToFiles(tempFilePrefix)))
+ .setCoder(WriteBundlesToFiles.ResultCoder.of());
TupleTag<KV<ShardedKey<TableDestination>, List<String>>> multiPartitionsTag =
new TupleTag<KV<ShardedKey<TableDestination>, List<String>>>("multiPartitionsTag") {};
@@ -136,20 +131,23 @@ class BatchLoads extends PTransform<PCollection<KV<TableDestination, TableRow>>,
// Turn the list of files and record counts in a PCollectionView that can be used as a
// side input.
- PCollectionView<Iterable<WriteBundlesToFiles.Result>> resultsView = results
- .apply("ResultsView", View.<WriteBundlesToFiles.Result>asIterable());
+ PCollectionView<Iterable<WriteBundlesToFiles.Result>> resultsView =
+ results.apply("ResultsView", View.<WriteBundlesToFiles.Result>asIterable());
// This transform will look at the set of files written for each table, and if any table has
// too many files or bytes, will partition that table's files into multiple partitions for
// loading.
- PCollectionTuple partitions = singleton.apply("WritePartition",
- ParDo.of(new WritePartition(
- write.getJsonTableRef(),
- write.getTableDescription(),
- resultsView,
- multiPartitionsTag,
- singlePartitionTag))
- .withSideInputs(resultsView)
- .withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));
+ PCollectionTuple partitions =
+ singleton.apply(
+ "WritePartition",
+ ParDo.of(
+ new WritePartition(
+ write.getJsonTableRef(),
+ write.getTableDescription(),
+ resultsView,
+ multiPartitionsTag,
+ singlePartitionTag))
+ .withSideInputs(resultsView)
+ .withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));
// Since BigQueryIO.java does not yet have support for per-table schemas, inject a constant
// schema function here. If no schema is specified, this function will return null.
@@ -158,55 +156,69 @@ class BatchLoads extends PTransform<PCollection<KV<TableDestination, TableRow>>,
new ConstantSchemaFunction(write.getJsonSchema());
Coder<KV<ShardedKey<TableDestination>, List<String>>> partitionsCoder =
- KvCoder.of(ShardedKeyCoder.of(TableDestinationCoder.of()),
- ListCoder.of(StringUtf8Coder.of()));
+ KvCoder.of(
+ ShardedKeyCoder.of(TableDestinationCoder.of()), ListCoder.of(StringUtf8Coder.of()));
// If WriteBundlesToFiles produced more than MAX_NUM_FILES files or MAX_SIZE_BYTES bytes, then
// the import needs to be split into multiple partitions, and those partitions will be
// specified in multiPartitionsTag.
- PCollection<KV<TableDestination, String>> tempTables = partitions.get(multiPartitionsTag)
- .setCoder(partitionsCoder)
- // What's this GroupByKey for? Is this so we have a deterministic temp tables? If so, maybe
- // Reshuffle is better here.
- .apply("MultiPartitionsGroupByKey",
- GroupByKey.<ShardedKey<TableDestination>, List<String>>create())
- .apply("MultiPartitionsWriteTables", ParDo.of(new WriteTables(
- false,
- write.getBigQueryServices(),
- jobIdTokenView,
- tempFilePrefix,
- WriteDisposition.WRITE_EMPTY,
- CreateDisposition.CREATE_IF_NEEDED,
- schemaFunction))
- .withSideInputs(jobIdTokenView));
+ PCollection<KV<TableDestination, String>> tempTables =
+ partitions
+ .get(multiPartitionsTag)
+ .setCoder(partitionsCoder)
+ // Reshuffle will distribute this among multiple workers, and also guard against
+ // reexecution of the WritePartitions step once WriteTables has begun.
+ .apply(
+ "MultiPartitionsReshuffle",
+ Reshuffle.<ShardedKey<TableDestination>, List<String>>of())
+ .apply(
+ "MultiPartitionsWriteTables",
+ ParDo.of(
+ new WriteTables(
+ false,
+ write.getBigQueryServices(),
+ jobIdTokenView,
+ tempFilePrefix,
+ WriteDisposition.WRITE_EMPTY,
+ CreateDisposition.CREATE_IF_NEEDED,
+ schemaFunction))
+ .withSideInputs(jobIdTokenView));
// This view maps each final table destination to the set of temporary partitioned tables
// the PCollection was loaded into.
- PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView = tempTables
- .apply("TempTablesView", View.<TableDestination, String>asMultimap());
-
- singleton.apply("WriteRename", ParDo
- .of(new WriteRename(
- write.getBigQueryServices(),
- jobIdTokenView,
- write.getWriteDisposition(),
- write.getCreateDisposition(),
- tempTablesView))
- .withSideInputs(tempTablesView, jobIdTokenView));
+ PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView =
+ tempTables.apply("TempTablesView", View.<TableDestination, String>asMultimap());
+
+ singleton.apply(
+ "WriteRename",
+ ParDo.of(
+ new WriteRename(
+ write.getBigQueryServices(),
+ jobIdTokenView,
+ write.getWriteDisposition(),
+ write.getCreateDisposition(),
+ tempTablesView))
+ .withSideInputs(tempTablesView, jobIdTokenView));
// Write single partition to final table
- partitions.get(singlePartitionTag)
+ partitions
+ .get(singlePartitionTag)
.setCoder(partitionsCoder)
- .apply("SinglePartitionGroupByKey",
- GroupByKey.<ShardedKey<TableDestination>, List<String>>create())
- .apply("SinglePartitionWriteTables", ParDo.of(new WriteTables(
- true,
- write.getBigQueryServices(),
- jobIdTokenView,
- tempFilePrefix,
- write.getWriteDisposition(),
- write.getCreateDisposition(),
- schemaFunction))
- .withSideInputs(jobIdTokenView));
+ // Reshuffle will distribute this among multiple workers, and also guard against
+ // reexecution of the WritePartitions step once WriteTables has begun.
+ .apply(
+ "SinglePartitionsReshuffle", Reshuffle.<ShardedKey<TableDestination>, List<String>>of())
+ .apply(
+ "SinglePartitionWriteTables",
+ ParDo.of(
+ new WriteTables(
+ true,
+ write.getBigQueryServices(),
+ jobIdTokenView,
+ tempFilePrefix,
+ write.getWriteDisposition(),
+ write.getCreateDisposition(),
+ schemaFunction))
+ .withSideInputs(jobIdTokenView));
return WriteResult.in(input.getPipeline());
}
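
The Reshuffle applications above act as a stable-input barrier in front of the
side-effecting WriteTables/WriteRename steps. A minimal sketch of that general
pattern, assuming the usual Beam imports; the step names and the no-op DoFn body
are placeholders.

static void writeStably(PCollection<KV<String, Long>> partitioned) {
  partitioned
      // Materializes the keyed values, so retries of the downstream step see a
      // stable input even if the upstream step is nondeterministic or re-executed.
      .apply("Checkpoint", Reshuffle.<String, Long>of())
      .apply("SideEffect", ParDo.of(
          new DoFn<KV<String, Long>, Void>() {
            @ProcessElement
            public void processElement(ProcessContext c) {
              // A real pipeline would issue the load job (or other external
              // call) here, protected by the checkpoint above.
            }
          }));
}
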
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryHelpers.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryHelpers.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryHelpers.java
index 846103d..e04361c 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryHelpers.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryHelpers.java
@@ -26,6 +26,7 @@ import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.hadoop.util.ApiErrorExtractor;
import com.google.common.annotations.VisibleForTesting;
+import com.google.common.hash.Hashing;
import java.io.IOException;
import java.util.ArrayList;
@@ -234,6 +235,18 @@ public class BigQueryHelpers {
}
}
+ // Create a unique job id for a table load.
+ static String createJobId(String prefix, TableDestination tableDestination, int partition) {
+ // Job ID must be different for each partition of each table.
+ String destinationHash =
+ Hashing.murmur3_128().hashUnencodedChars(tableDestination.toString()).toString();
+ if (partition >= 0) {
+ return String.format("%s_%s_%05d", prefix, destinationHash, partition);
+ } else {
+ return String.format("%s_%s", prefix, destinationHash);
+ }
+ }
+
@VisibleForTesting
static class JsonSchemaToTableSchema
implements SerializableFunction<String, TableSchema> {
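
A worked sketch of the job-id scheme implemented by createJobId above. The prefix
and table spec literal are invented for illustration; the literal stands in for
tableDestination.toString().

String prefix = "beam_load_20170418";
String destinationHash =
    Hashing.murmur3_128()
        .hashUnencodedChars("myproject:mydataset.mytable")
        .toString();                                   // 32 lowercase hex chars

// partition >= 0: one id per (table, partition) pair
String partitionedId = String.format("%s_%s_%05d", prefix, destinationHash, 3);
// e.g. beam_load_20170418_<32-hex-hash>_00003

// partition < 0: one id per table
String singleId = String.format("%s_%s", prefix, destinationHash);
// e.g. beam_load_20170418_<32-hex-hash>
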
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
index 54a25c7..3f5947e 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
@@ -61,7 +61,6 @@ import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
import org.apache.beam.sdk.runners.PipelineRunner;
import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.util.IOChannelFactory;
@@ -445,7 +444,8 @@ public class BigQueryIO {
// Note that a table or query check can fail if the table or dataset are created by
// earlier stages of the pipeline or if a query depends on earlier stages of a pipeline.
// For these cases the withoutValidation method can be used to disable the check.
- if (getValidate() && table != null) {
+ if (getValidate() && table != null && table.isAccessible() && table.get().getProjectId()
+ != null) {
checkState(table.isAccessible(), "Cannot call validate if table is dynamically set.");
// Check for source table presence for early failure notification.
DatasetService datasetService = getBigQueryServices().getDatasetService(bqOptions);
@@ -650,6 +650,7 @@ public class BigQueryIO {
public static <T> Write<T> write() {
return new AutoValue_BigQueryIO_Write.Builder<T>()
.setValidate(true)
+ .setTableDescription("")
.setBigQueryServices(new BigQueryServicesImpl())
.setCreateDisposition(Write.CreateDisposition.CREATE_IF_NEEDED)
.setWriteDisposition(Write.WriteDisposition.WRITE_EMPTY)
@@ -690,7 +691,8 @@ public class BigQueryIO {
@Nullable abstract ValueProvider<String> getJsonSchema();
abstract CreateDisposition getCreateDisposition();
abstract WriteDisposition getWriteDisposition();
- @Nullable abstract String getTableDescription();
+ /** Table description. Default is empty. */
+ abstract String getTableDescription();
/** An option to indicate if table validation is desired. Default is true. */
abstract boolean getValidate();
abstract BigQueryServices getBigQueryServices();
@@ -805,9 +807,6 @@ public class BigQueryIO {
public Write<T> to(ValueProvider<String> tableSpec) {
ensureToNotCalledYet();
String tableDescription = getTableDescription();
- if (tableDescription == null) {
- tableDescription = "";
- }
return toBuilder()
.setJsonTableRef(
NestedValueProvider.of(
@@ -911,7 +910,7 @@ public class BigQueryIO {
public void validate(PCollection<T> input) {
BigQueryOptions options = input.getPipeline().getOptions().as(BigQueryOptions.class);
- // Exactly one of the table and table reference can be configured.
+ // We must have a destination to write to!
checkState(getTableFunction() != null,
"must set the table reference of a BigQueryIO.Write transform");
@@ -972,8 +971,8 @@ public class BigQueryIO {
@Override
public WriteResult expand(PCollection<T> input) {
PCollection<KV<TableDestination, TableRow>> rowsWithDestination =
- input.apply("PrepareWrite", ParDo.of(
- new PrepareWrite<T>(getTableFunction(), getFormatFunction())))
+ input.apply("PrepareWrite", new PrepareWrite<T>(
+ getTableFunction(), getFormatFunction()))
.setCoder(KvCoder.of(TableDestinationCoder.of(), TableRowJsonCoder.of()));
@@ -1013,8 +1012,8 @@ public class BigQueryIO {
.withLabel("Table WriteDisposition"))
.addIfNotDefault(DisplayData.item("validation", getValidate())
.withLabel("Validation Enabled"), true)
- .addIfNotNull(DisplayData.item("tableDescription", getTableDescription())
- .withLabel("Table Description"));
+ .addIfNotDefault(DisplayData.item("tableDescription", getTableDescription())
+ .withLabel("Table Description"), "");
}
/** Returns the table schema. */
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryTableSource.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryTableSource.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryTableSource.java
index 22aba64..a28da92 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryTableSource.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryTableSource.java
@@ -109,8 +109,8 @@ class BigQueryTableSource extends BigQuerySourceBase {
@Override
public synchronized long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
if (tableSizeBytes.get() == null) {
- TableReference table = BigQueryIO.JSON_FACTORY.fromString(jsonTable.get(),
- TableReference.class);
+ TableReference table = setDefaultProjectIfAbsent(options.as(BigQueryOptions.class),
+ BigQueryIO.JSON_FACTORY.fromString(jsonTable.get(), TableReference.class));
Long numBytes = bqServices.getDatasetService(options.as(BigQueryOptions.class))
.getTable(table).getNumBytes();
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/CreateTables.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/CreateTables.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/CreateTables.java
index e216553..a78f32d 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/CreateTables.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/CreateTables.java
@@ -1,68 +1,94 @@
/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.beam.sdk.io.gcp.bigquery;
import com.google.api.services.bigquery.model.Table;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
+import com.google.common.annotations.VisibleForTesting;
import java.io.IOException;
import java.util.Collections;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
+
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService;
import org.apache.beam.sdk.options.BigQueryOptions;
import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.values.KV;
-
+import org.apache.beam.sdk.values.PCollection;
/**
- * Creates any tables needed before performing streaming writes to the tables. This is a
- * side-effect {l@ink DoFn}, and returns the original collection unchanged.
+ * Creates any tables needed before performing streaming writes to the tables. This is a
+ * side-effecting {@link PTransform} that returns the original collection unchanged.
*/
-public class CreateTables extends DoFn<KV<TableDestination, TableRow>,
- KV<TableDestination, TableRow>> {
+public class CreateTables
+ extends PTransform<
+ PCollection<KV<TableDestination, TableRow>>, PCollection<KV<TableDestination, TableRow>>> {
private final CreateDisposition createDisposition;
private final BigQueryServices bqServices;
private final SerializableFunction<TableDestination, TableSchema> schemaFunction;
-
- /** The list of tables created so far, so we don't try the creation
- each time.
- * TODO: We should put a bound on memory usage of this. Use guava cache instead.
+ /**
+ * The list of tables created so far, so we don't try the creation each time.
+ *
+ * <p>TODO: We should put a bound on memory usage of this. Use guava cache instead.
*/
private static Set<String> createdTables =
Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
- public CreateTables(CreateDisposition createDisposition, BigQueryServices bqServices,
- SerializableFunction<TableDestination, TableSchema> schemaFunction) {
+ public CreateTables(
+ CreateDisposition createDisposition,
+ SerializableFunction<TableDestination, TableSchema> schemaFunction) {
+ this(createDisposition, new BigQueryServicesImpl(), schemaFunction);
+ }
+
+ private CreateTables(
+ CreateDisposition createDisposition,
+ BigQueryServices bqServices,
+ SerializableFunction<TableDestination, TableSchema> schemaFunction) {
this.createDisposition = createDisposition;
this.bqServices = bqServices;
this.schemaFunction = schemaFunction;
}
- @ProcessElement
- public void processElement(ProcessContext context) throws InterruptedException, IOException {
- BigQueryOptions options = context.getPipelineOptions().as(BigQueryOptions.class);
- possibleCreateTable(options, context.element().getKey());
- context.output(context.element());
+ CreateTables withTestServices(BigQueryServices bqServices) {
+ return new CreateTables(createDisposition, bqServices, schemaFunction);
+ }
+
+ @Override
+ public PCollection<KV<TableDestination, TableRow>> expand(
+ PCollection<KV<TableDestination, TableRow>> input) {
+ return input.apply(
+ ParDo.of(
+ new DoFn<KV<TableDestination, TableRow>, KV<TableDestination, TableRow>>() {
+ @ProcessElement
+ public void processElement(ProcessContext context)
+ throws InterruptedException, IOException {
+ BigQueryOptions options = context.getPipelineOptions().as(BigQueryOptions.class);
+ possibleCreateTable(options, context.element().getKey());
+ context.output(context.element());
+ }
+ }));
}
private void possibleCreateTable(BigQueryOptions options, TableDestination tableDestination)
@@ -70,8 +96,7 @@ public class CreateTables extends DoFn<KV<TableDestination, TableRow>,
String tableSpec = tableDestination.getTableSpec();
TableReference tableReference = tableDestination.getTableReference();
String tableDescription = tableDestination.getTableDescription();
- if (createDisposition != createDisposition.CREATE_NEVER
- && !createdTables.contains(tableSpec)) {
+ if (createDisposition != createDisposition.CREATE_NEVER && !createdTables.contains(tableSpec)) {
synchronized (createdTables) {
// Another thread may have succeeded in creating the table in the meanwhile, so
// check again. This check isn't needed for correctness, but we add it to prevent
@@ -92,6 +117,8 @@ public class CreateTables extends DoFn<KV<TableDestination, TableRow>,
}
}
+ /** This method is used by the testing fake to clear static state. */
+ @VisibleForTesting
static void clearCreatedTables() {
synchronized (createdTables) {
createdTables.clear();
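
The creation guard above does a cheap unsynchronized check and then re-checks under
a lock, so concurrent bundles in one JVM don't issue duplicate create calls. A
generic sketch of the same idiom with placeholder names:

import java.util.Collections;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

class OncePerKey {
  private static final Set<String> done =
      Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());

  static void ensure(String key, Runnable createAction) {
    if (!done.contains(key)) {          // fast path, no lock
      synchronized (done) {
        if (!done.contains(key)) {      // re-check while holding the lock
          createAction.run();           // e.g. issue the CreateTable RPC
          done.add(key);
        }
      }
    }
  }
}
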
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/GenerateShardedTable.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/GenerateShardedTable.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/GenerateShardedTable.java
index da3a70a..90d41a0 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/GenerateShardedTable.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/GenerateShardedTable.java
@@ -39,8 +39,7 @@ class GenerateShardedTable extends DoFn<KV<TableDestination, TableRow>,
@ProcessElement
public void processElement(ProcessContext context, BoundedWindow window) throws IOException {
ThreadLocalRandom randomGenerator = ThreadLocalRandom.current();
- // We output on keys 0-50 to ensure that there's enough batching for
- // BigQuery.
+ // We output on keys in [0, numShards).
String tableSpec = context.element().getKey().getTableSpec();
context.output(KV.of(ShardedKey.of(tableSpec, randomGenerator.nextInt(0, numShards)),
context.element().getValue()));
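
A tiny illustrative sketch of the key produced above (table spec and shard count
are made up; ThreadLocalRandom comes from java.util.concurrent):

int numShards = 50;
String tableSpec = "myproject:mydataset.mytable";
int shard = ThreadLocalRandom.current().nextInt(0, numShards);   // in [0, numShards)
ShardedKey<String> key = ShardedKey.of(tableSpec, shard);
// At most numShards distinct keys exist per table, which keeps downstream
// batching effective while still spreading the load across workers.
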
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/PrepareWrite.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/PrepareWrite.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/PrepareWrite.java
index 7712417..a8bdb43 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/PrepareWrite.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/PrepareWrite.java
@@ -1,20 +1,20 @@
/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.beam.sdk.io.gcp.bigquery;
import com.google.api.services.bigquery.model.TableReference;
@@ -23,6 +23,8 @@ import com.google.common.base.Strings;
import java.io.IOException;
import org.apache.beam.sdk.options.BigQueryOptions;
import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.values.KV;
@@ -30,37 +32,49 @@ import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.ValueInSingleWindow;
/**
- * Prepare an input {@link PCollection} for writing to BigQuery. Use the table-reference
- * function to determine which tables each element is written to, and format the element into a
- * {@link TableRow} using the user-supplied format function.
+ * Prepare an input {@link PCollection} for writing to BigQuery. Use the table function to determine
+ * which tables each element is written to, and format the element into a {@link TableRow} using the
+ * user-supplied format function.
*/
-public class PrepareWrite<T> extends DoFn<T, KV<TableDestination, TableRow>> {
+public class PrepareWrite<T>
+ extends PTransform<PCollection<T>, PCollection<KV<TableDestination, TableRow>>> {
private SerializableFunction<ValueInSingleWindow<T>, TableDestination> tableFunction;
private SerializableFunction<T, TableRow> formatFunction;
- public PrepareWrite(SerializableFunction<ValueInSingleWindow<T>, TableDestination> tableFunction,
- SerializableFunction<T, TableRow> formatFunction) {
+ public PrepareWrite(
+ SerializableFunction<ValueInSingleWindow<T>, TableDestination> tableFunction,
+ SerializableFunction<T, TableRow> formatFunction) {
this.tableFunction = tableFunction;
this.formatFunction = formatFunction;
}
- @ProcessElement
- public void processElement(ProcessContext context, BoundedWindow window) throws IOException {
- TableDestination tableDestination = tableSpecFromWindowedValue(
- context.getPipelineOptions().as(BigQueryOptions.class),
- ValueInSingleWindow.of(context.element(), context.timestamp(), window, context.pane()));
- TableRow tableRow = formatFunction.apply(context.element());
- context.output(KV.of(tableDestination, tableRow));
+ @Override
+ public PCollection<KV<TableDestination, TableRow>> expand(PCollection<T> input) {
+ return input.apply(
+ ParDo.of(
+ new DoFn<T, KV<TableDestination, TableRow>>() {
+ @ProcessElement
+ public void processElement(ProcessContext context, BoundedWindow window)
+ throws IOException {
+ TableDestination tableDestination =
+ tableSpecFromWindowedValue(
+ context.getPipelineOptions().as(BigQueryOptions.class),
+ ValueInSingleWindow.of(
+ context.element(), context.timestamp(), window, context.pane()));
+ TableRow tableRow = formatFunction.apply(context.element());
+ context.output(KV.of(tableDestination, tableRow));
+ }
+ }));
}
- private TableDestination tableSpecFromWindowedValue(BigQueryOptions options,
- ValueInSingleWindow<T> value) {
+ private TableDestination tableSpecFromWindowedValue(
+ BigQueryOptions options, ValueInSingleWindow<T> value) {
TableDestination tableDestination = tableFunction.apply(value);
TableReference tableReference = tableDestination.getTableReference();
if (Strings.isNullOrEmpty(tableReference.getProjectId())) {
tableReference.setProjectId(options.getProject());
- tableDestination = new TableDestination(tableReference,
- tableDestination.getTableDescription());
+ tableDestination =
+ new TableDestination(tableReference, tableDestination.getTableDescription());
}
return tableDestination;
}
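
A hypothetical usage sketch of PrepareWrite, assuming a PCollection<String> named
lines: every element is routed to one fixed table and formatted as a single-column
row. The table spec, field name, and description are placeholders; BigQueryIO
itself applies this transform internally, as the BigQueryIO.java diff above shows.

PCollection<KV<TableDestination, TableRow>> routed =
    lines
        .apply(
            "PrepareWrite",
            new PrepareWrite<String>(
                new SerializableFunction<ValueInSingleWindow<String>, TableDestination>() {
                  @Override
                  public TableDestination apply(ValueInSingleWindow<String> value) {
                    return new TableDestination("myproject:mydataset.lines", "example table");
                  }
                },
                new SerializableFunction<String, TableRow>() {
                  @Override
                  public TableRow apply(String line) {
                    return new TableRow().set("line", line);
                  }
                }))
        .setCoder(KvCoder.of(TableDestinationCoder.of(), TableRowJsonCoder.of()));
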
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/ShardedKey.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/ShardedKey.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/ShardedKey.java
index 09b4fbf..c2b739f 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/ShardedKey.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/ShardedKey.java
@@ -25,6 +25,7 @@ import java.util.Objects;
* A key and a shard number.
*/
class ShardedKey<K> implements Serializable {
+ private static final long serialVersionUID = 1L;
private final K key;
private final int shardNumber;
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingInserts.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingInserts.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingInserts.java
index ced1d66..efd9c31 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingInserts.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingInserts.java
@@ -22,15 +22,10 @@ import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import javax.annotation.Nullable;
import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.coders.KvCoder;
-import org.apache.beam.sdk.coders.StringUtf8Coder;
-import org.apache.beam.sdk.coders.TableRowJsonCoder;
import org.apache.beam.sdk.coders.VoidCoder;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write;
import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SerializableFunction;
-import org.apache.beam.sdk.util.Reshuffle;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
@@ -38,8 +33,8 @@ import org.apache.beam.sdk.values.PCollection;
* PTransform that performs streaming BigQuery write. To increase consistency,
* it leverages BigQuery best effort de-dup mechanism.
*/
-class StreamingInserts extends PTransform<PCollection<KV<TableDestination, TableRow>>,
- WriteResult> {
+public class StreamingInserts extends
+ PTransform<PCollection<KV<TableDestination, TableRow>>, WriteResult> {
private final Write<?> write;
private static class ConstantSchemaFunction implements
@@ -74,36 +69,11 @@ class StreamingInserts extends PTransform<PCollection<KV<TableDestination, Table
SerializableFunction<TableDestination, TableSchema> schemaFunction =
new ConstantSchemaFunction(write.getSchema());
- // A naive implementation would be to simply stream data directly to BigQuery.
- // However, this could occasionally lead to duplicated data, e.g., when
- // a VM that runs this code is restarted and the code is re-run.
+ PCollection<KV<TableDestination, TableRow>> writes = input
+ .apply("CreateTables", new CreateTables(write.getCreateDisposition(), schemaFunction)
+ .withTestServices(write.getBigQueryServices()));
- // The above risk is mitigated in this implementation by relying on
- // BigQuery built-in best effort de-dup mechanism.
-
- // To use this mechanism, each input TableRow is tagged with a generated
- // unique id, which is then passed to BigQuery and used to ignore duplicates.
- PCollection<KV<ShardedKey<String>, TableRowInfo>> tagged = input
- .apply("CreateTables", ParDo.of(new CreateTables(write.getCreateDisposition(),
- write.getBigQueryServices(), schemaFunction)))
- // We create 50 keys per BigQuery table to generate output on. This is few enough that we
- // get good batching into BigQuery's insert calls, and enough that we can max out the
- // streaming insert quota.
- .apply("ShardTableWrites", ParDo.of(new GenerateShardedTable(50)))
- .setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowJsonCoder.of()))
- .apply("TagWithUniqueIds", ParDo.of(new TagWithUniqueIds()));
-
- // To prevent having the same TableRow processed more than once with regenerated
- // different unique ids, this implementation relies on "checkpointing", which is
- // achieved as a side effect of having StreamingWriteFn immediately follow a GBK,
- // performed by Reshuffle.
- tagged
- .setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowInfoCoder.of()))
- .apply(Reshuffle.<ShardedKey<String>, TableRowInfo>of())
- .apply("StreamingWrite",
- ParDo.of(
- new StreamingWriteFn(write.getBigQueryServices())));
-
- return WriteResult.in(input.getPipeline());
+ return writes.apply(new StreamingWriteTables()
+ .withTestServices(write.getBigQueryServices()));
}
}
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingWriteTables.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingWriteTables.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingWriteTables.java
new file mode 100644
index 0000000..4ddc1df
--- /dev/null
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingWriteTables.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.io.gcp.bigquery;
+
+import com.google.api.services.bigquery.model.TableRow;
+import org.apache.beam.sdk.coders.KvCoder;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.coders.TableRowJsonCoder;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.util.Reshuffle;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+
+/**
+ * This transform takes in key-value pairs of {@link TableRow} entries and the
+ * {@link TableDestination} it should be written to. The BigQuery streaming-write service is used
+ * to stream these writes to the appropriate table.
+ *
+ * <p>This transform assumes that all destination tables already exist by the time it sees a write
+ * for that table.
+ */
+public class StreamingWriteTables extends PTransform<
+ PCollection<KV<TableDestination, TableRow>>, WriteResult> {
+ private BigQueryServices bigQueryServices;
+
+ public StreamingWriteTables() {
+ this(new BigQueryServicesImpl());
+ }
+
+ private StreamingWriteTables(BigQueryServices bigQueryServices) {
+ this.bigQueryServices = bigQueryServices;
+ }
+
+ StreamingWriteTables withTestServices(BigQueryServices bigQueryServices) {
+ return new StreamingWriteTables(bigQueryServices);
+ }
+
+ @Override
+ public WriteResult expand(PCollection<KV<TableDestination, TableRow>> input) {
+ // A naive implementation would be to simply stream data directly to BigQuery.
+ // However, this could occasionally lead to duplicated data, e.g., when
+ // a VM that runs this code is restarted and the code is re-run.
+
+ // The above risk is mitigated in this implementation by relying on
+ // BigQuery built-in best effort de-dup mechanism.
+
+ // To use this mechanism, each input TableRow is tagged with a generated
+ // unique id, which is then passed to BigQuery and used to ignore duplicates.
+ // We create 50 keys per BigQuery table to generate output on. This is few enough that we
+ // get good batching into BigQuery's insert calls, and enough that we can max out the
+ // streaming insert quota.
+ PCollection<KV<ShardedKey<String>, TableRowInfo>> tagged =
+ input.apply("ShardTableWrites", ParDo.of
+ (new GenerateShardedTable(50)))
+ .setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowJsonCoder.of()))
+ .apply("TagWithUniqueIds", ParDo.of(new TagWithUniqueIds()));
+
+ // To prevent having the same TableRow processed more than once with regenerated
+ // different unique ids, this implementation relies on "checkpointing", which is
+ // achieved as a side effect of having StreamingWriteFn immediately follow a GBK,
+ // performed by Reshuffle.
+ tagged
+ .setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowInfoCoder.of()))
+ .apply(Reshuffle.<ShardedKey<String>, TableRowInfo>of())
+ .apply("StreamingWrite",
+ ParDo.of(
+ new StreamingWriteFn(bigQueryServices)));
+ return WriteResult.in(input.getPipeline());
+ }
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
index 36e1401..962e2cd 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
@@ -27,6 +27,7 @@ import java.util.Objects;
* Encapsulates a BigQuery table destination.
*/
public class TableDestination implements Serializable {
+ private static final long serialVersionUID = 1L;
private final String tableSpec;
private final String tableDescription;
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestinationCoder.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestinationCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestinationCoder.java
index fa24700..262a00d 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestinationCoder.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestinationCoder.java
@@ -1,20 +1,20 @@
/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.beam.sdk.io.gcp.bigquery;
@@ -26,20 +26,18 @@ import org.apache.beam.sdk.coders.AtomicCoder;
import org.apache.beam.sdk.coders.CoderException;
import org.apache.beam.sdk.coders.StringUtf8Coder;
-/**
- * A coder for {@link TableDestination} objects.
- */
+/** A coder for {@link TableDestination} objects. */
public class TableDestinationCoder extends AtomicCoder<TableDestination> {
private static final TableDestinationCoder INSTANCE = new TableDestinationCoder();
-
+ private static final StringUtf8Coder stringCoder = StringUtf8Coder.of();
@JsonCreator
public static TableDestinationCoder of() {
- return INSTANCE;
- }
+ return INSTANCE;
+ }
@Override
- public void encode(TableDestination value, OutputStream outStream, Context context)
+ public void encode(TableDestination value, OutputStream outStream, Context context)
throws IOException {
if (value == null) {
throw new CoderException("cannot encode a null value");
@@ -50,15 +48,13 @@ public class TableDestinationCoder extends AtomicCoder<TableDestination> {
@Override
public TableDestination decode(InputStream inStream, Context context) throws IOException {
- return new TableDestination(
- stringCoder.decode(inStream, context.nested()),
- stringCoder.decode(inStream, context.nested()));
- }
-
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- return;
- }
+ return new TableDestination(
+ stringCoder.decode(inStream, context.nested()),
+ stringCoder.decode(inStream, context.nested()));
+ }
- StringUtf8Coder stringCoder = StringUtf8Coder.of();
+ @Override
+ public void verifyDeterministic() throws NonDeterministicException {
+ return;
+ }
}
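
As the diff above shows, TableDestinationCoder serializes a TableDestination as two nested UTF-8 strings (the table spec and the description) through a shared static StringUtf8Coder. A minimal round-trip sketch, assuming the two-argument TableDestination constructor and a getTableSpec() accessor that are not shown in this excerpt:

import org.apache.beam.sdk.io.gcp.bigquery.TableDestination;
import org.apache.beam.sdk.io.gcp.bigquery.TableDestinationCoder;
import org.apache.beam.sdk.util.CoderUtils;

public class TableDestinationCoderRoundTrip {
  public static void main(String[] args) throws Exception {
    TableDestination original =
        new TableDestination("project-id:dataset_id.table_id", "example description");
    // Encode to bytes and back; both fields travel as nested UTF-8 strings.
    byte[] bytes = CoderUtils.encodeToByteArray(TableDestinationCoder.of(), original);
    TableDestination copy = CoderUtils.decodeFromByteArray(TableDestinationCoder.of(), bytes);
    System.out.println(copy.getTableSpec());
  }
}
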
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowWriter.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowWriter.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowWriter.java
index ee8f466..91ef404 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowWriter.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowWriter.java
@@ -32,9 +32,7 @@ import org.apache.beam.sdk.util.MimeTypes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-/**
- * Writes {@TableRow} objects out to a file. Used when doing batch load jobs into BigQuery.
- */
+/** Writes {@link TableRow} objects out to a file. Used when doing batch load jobs into BigQuery. */
class TableRowWriter {
private static final Logger LOG = LoggerFactory.getLogger(BigQueryIO.class);
@@ -47,16 +45,18 @@ class TableRowWriter {
protected String mimeType = MimeTypes.TEXT;
private CountingOutputStream out;
- public class Result {
- String filename;
- long byteSize;
+ public static final class Result {
+ final String filename;
+ final long byteSize;
+
public Result(String filename, long byteSize) {
this.filename = filename;
this.byteSize = byteSize;
}
}
+
TableRowWriter(String basename) {
- this.tempFilePrefix = basename;
+ this.tempFilePrefix = basename;
}
public final void open(String uId) throws Exception {
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIds.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIds.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIds.java
index 7379784..284691e 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIds.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIds.java
@@ -28,15 +28,14 @@ import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.values.KV;
/**
- * Fn that tags each table row with a unique id and destination table.
- * To avoid calling UUID.randomUUID() for each element, which can be costly,
- * a randomUUID is generated only once per bucket of data. The actual unique
- * id is created by concatenating this randomUUID with a sequential number.
+ * Fn that tags each table row with a unique id and destination table. To avoid calling
+ * UUID.randomUUID() for each element, which can be costly, a randomUUID is generated only once per
+ * bucket of data. The actual unique id is created by concatenating this randomUUID with a
+ * sequential number.
*/
@VisibleForTesting
class TagWithUniqueIds
extends DoFn<KV<ShardedKey<String>, TableRow>, KV<ShardedKey<String>, TableRowInfo>> {
-
private transient String randomUUID;
private transient long sequenceNo = 0L;
@@ -51,8 +50,9 @@ class TagWithUniqueIds
String uniqueId = randomUUID + sequenceNo++;
// We output on keys 0-50 to ensure that there's enough batching for
// BigQuery.
- context.output(KV.of(context.element().getKey(),
- new TableRowInfo(context.element().getValue(), uniqueId)));
+ context.output(
+ KV.of(
+ context.element().getKey(), new TableRowInfo(context.element().getValue(), uniqueId)));
}
@Override
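
The javadoc above describes the id scheme: one random UUID per bundle plus a per-element sequence number, so UUID.randomUUID() is not called for every element. A standalone sketch of that pattern (the class and method names here are invented for illustration; the real DoFn is the one in the diff):

import java.util.UUID;

class BundleScopedIdGenerator {
  private String bundlePrefix;  // regenerated once at the start of each bundle
  private long sequenceNo = 0L;

  void startBundle() {
    bundlePrefix = UUID.randomUUID().toString();
    sequenceNo = 0L;
  }

  String nextId() {
    // Cheap per-element id: shared random prefix plus a sequential suffix.
    return bundlePrefix + sequenceNo++;
  }
}
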
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java
index 869e68a..a25cc90 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java
@@ -51,10 +51,11 @@ class WriteBundlesToFiles extends DoFn<KV<TableDestination, TableRow>, WriteBund
* The result of the {@link WriteBundlesToFiles} transform. Corresponds to a single output file,
* and encapsulates the table it is destined to as well as the file byte size.
*/
- public static class Result implements Serializable {
- public String filename;
- public Long fileByteSize;
- public TableDestination tableDestination;
+ public static final class Result implements Serializable {
+ private static final long serialVersionUID = 1L;
+ public final String filename;
+ public final Long fileByteSize;
+ public final TableDestination tableDestination;
public Result(String filename, Long fileByteSize, TableDestination tableDestination) {
this.filename = filename;
@@ -68,6 +69,9 @@ class WriteBundlesToFiles extends DoFn<KV<TableDestination, TableRow>, WriteBund
*/
public static class ResultCoder extends AtomicCoder<Result> {
private static final ResultCoder INSTANCE = new ResultCoder();
+ private static final StringUtf8Coder stringCoder = StringUtf8Coder.of();
+ private static final VarLongCoder longCoder = VarLongCoder.of();
+ private static final TableDestinationCoder tableDestinationCoder = TableDestinationCoder.of();
public static ResultCoder of() {
return INSTANCE;
@@ -87,18 +91,15 @@ class WriteBundlesToFiles extends DoFn<KV<TableDestination, TableRow>, WriteBund
@Override
public Result decode(InputStream inStream, Context context)
throws IOException {
- return new Result(stringCoder.decode(inStream, context.nested()),
- longCoder.decode(inStream, context.nested()),
- tableDestinationCoder.decode(inStream, context.nested()));
+ String filename = stringCoder.decode(inStream, context.nested());
+ long fileByteSize = longCoder.decode(inStream, context.nested());
+ TableDestination tableDestination = tableDestinationCoder.decode(inStream, context.nested());
+ return new Result(filename, fileByteSize, tableDestination);
}
@Override
public void verifyDeterministic() throws NonDeterministicException {
}
-
- StringUtf8Coder stringCoder = StringUtf8Coder.of();
- VarLongCoder longCoder = VarLongCoder.of();
- TableDestinationCoder tableDestinationCoder = TableDestinationCoder.of();
}
WriteBundlesToFiles(String tempFilePrefix) {
@@ -107,6 +108,8 @@ class WriteBundlesToFiles extends DoFn<KV<TableDestination, TableRow>, WriteBund
@StartBundle
public void startBundle(Context c) {
+ // This must be done for each bundle, as by default the {@link DoFn} might be reused between
+ // bundles.
this.writers = Maps.newHashMap();
}
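
The new comment above is about DoFn instance reuse: per-bundle state (here, the writers map) has to be re-created in @StartBundle rather than in the constructor. A small illustrative DoFn following the same rule, with hypothetical names that are not part of the commit:

import java.util.HashMap;
import java.util.Map;
import org.apache.beam.sdk.transforms.DoFn;

class PerBundleCountFn extends DoFn<String, String> {
  // Rebuilt in every bundle; the same DoFn instance may serve many bundles.
  private transient Map<String, Integer> counts;

  @StartBundle
  public void startBundle(Context c) {
    counts = new HashMap<>();
  }

  @ProcessElement
  public void processElement(ProcessContext c) {
    Integer current = counts.get(c.element());
    counts.put(c.element(), current == null ? 1 : current + 1);
    c.output(c.element());
  }
}
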
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java
index 9c48b82..9414909 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java
@@ -44,7 +44,65 @@ class WritePartition extends DoFn<String, KV<ShardedKey<TableDestination>, List<
private TupleTag<KV<ShardedKey<TableDestination>, List<String>>> multiPartitionsTag;
private TupleTag<KV<ShardedKey<TableDestination>, List<String>>> singlePartitionTag;
- public WritePartition(
+ private static class PartitionData {
+ private int numFiles = 0;
+ private long byteSize = 0;
+ private List<String> filenames = Lists.newArrayList();
+
+ int getNumFiles() {
+ return numFiles;
+ }
+
+ void addFiles(int numFiles) {
+ this.numFiles += numFiles;
+ }
+
+ long getByteSize() {
+ return byteSize;
+ }
+
+ void addBytes(long numBytes) {
+ this.byteSize += numBytes;
+ }
+
+ List<String> getFilenames() {
+ return filenames;
+ }
+
+ void addFilename(String filename) {
+ filenames.add(filename);
+ }
+
+ // Check to see whether we can add to this partition without exceeding the maximum partition
+ // size.
+ boolean canAccept(int numFiles, long numBytes) {
+ return this.numFiles + numFiles <= Write.MAX_NUM_FILES
+ && this.byteSize + numBytes <= Write.MAX_SIZE_BYTES;
+ }
+ }
+
+ private static class DestinationData {
+ private List<PartitionData> partitions = Lists.newArrayList();
+
+ DestinationData() {
+ // Always start out with a single empty partition.
+ partitions.add(new PartitionData());
+ }
+
+ List<PartitionData> getPartitions() {
+ return partitions;
+ }
+
+ PartitionData getLatestPartition() {
+ return partitions.get(partitions.size() - 1);
+ }
+
+ void addPartition(PartitionData partition) {
+ partitions.add(partition);
+ }
+ }
+
+ WritePartition(
ValueProvider<String> singletonOutputJsonTableRef,
String singletonOutputTableDescription,
PCollectionView<Iterable<WriteBundlesToFiles.Result>> resultsView,
@@ -76,54 +134,41 @@ class WritePartition extends DoFn<String, KV<ShardedKey<TableDestination>, List<
}
- long partitionId = 0;
- Map<TableDestination, Integer> currNumFilesMap = Maps.newHashMap();
- Map<TableDestination, Long> currSizeBytesMap = Maps.newHashMap();
- Map<TableDestination, List<List<String>>> currResultsMap = Maps.newHashMap();
- for (int i = 0; i < results.size(); ++i) {
- WriteBundlesToFiles.Result fileResult = results.get(i);
+ Map<TableDestination, DestinationData> currentResults = Maps.newHashMap();
+ for (WriteBundlesToFiles.Result fileResult : results) {
TableDestination tableDestination = fileResult.tableDestination;
- List<List<String>> partitions = currResultsMap.get(tableDestination);
- if (partitions == null) {
- partitions = Lists.newArrayList();
- partitions.add(Lists.<String>newArrayList());
- currResultsMap.put(tableDestination, partitions);
+ DestinationData destinationData = currentResults.get(tableDestination);
+ if (destinationData == null) {
+ destinationData = new DestinationData();
+ currentResults.put(tableDestination, destinationData);
}
- int currNumFiles = getOrDefault(currNumFilesMap, tableDestination, 0);
- long currSizeBytes = getOrDefault(currSizeBytesMap, tableDestination, 0L);
- if (currNumFiles + 1 > Write.MAX_NUM_FILES
- || currSizeBytes + fileResult.fileByteSize > Write.MAX_SIZE_BYTES) {
- // Add a new partition for this table.
- partitions.add(Lists.<String>newArrayList());
- // c.sideOutput(multiPartitionsTag, KV.of(++partitionId, currResults));
- currNumFiles = 0;
- currSizeBytes = 0;
- currNumFilesMap.remove(tableDestination);
- currSizeBytesMap.remove(tableDestination);
+
+ PartitionData latestPartition = destinationData.getLatestPartition();
+ if (!latestPartition.canAccept(1, fileResult.fileByteSize)) {
+ // Too much data, roll over to a new partition.
+ latestPartition = new PartitionData();
+ destinationData.addPartition(latestPartition);
}
- currNumFilesMap.put(tableDestination, currNumFiles + 1);
- currSizeBytesMap.put(tableDestination, currSizeBytes + fileResult.fileByteSize);
- // Always add to the most recent partition for this table.
- partitions.get(partitions.size() - 1).add(fileResult.filename);
+ latestPartition.addFilename(fileResult.filename);
+ latestPartition.addFiles(1);
+ latestPartition.addBytes(fileResult.fileByteSize);
}
- for (Map.Entry<TableDestination, List<List<String>>> entry : currResultsMap.entrySet()) {
+ // Now that we've figured out which tables and partitions to write out, emit this information
+ // to the next stage.
+ for (Map.Entry<TableDestination, DestinationData> entry : currentResults.entrySet()) {
TableDestination tableDestination = entry.getKey();
- List<List<String>> partitions = entry.getValue();
+ DestinationData destinationData = entry.getValue();
+ // In the fast-path case where we only output one table, the transform loads it directly
+ // to the final table. In this case, we output on a special TupleTag so the enclosing
+ // transform knows to skip the rename step.
TupleTag<KV<ShardedKey<TableDestination>, List<String>>> outputTag =
- (partitions.size() == 1) ? singlePartitionTag : multiPartitionsTag;
- for (int i = 0; i < partitions.size(); ++i) {
- c.output(outputTag, KV.of(ShardedKey.of(tableDestination, i + 1), partitions.get(i)));
+ (destinationData.getPartitions().size() == 1) ? singlePartitionTag : multiPartitionsTag;
+ for (int i = 0; i < destinationData.getPartitions().size(); ++i) {
+ PartitionData partitionData = destinationData.getPartitions().get(i);
+ c.output(outputTag, KV.of(ShardedKey.of(tableDestination, i + 1),
+ partitionData.getFilenames()));
}
}
}
-
- private <T> T getOrDefault(Map<TableDestination, T> map, TableDestination tableDestination,
- T defaultValue) {
- if (map.containsKey(tableDestination)) {
- return map.get(tableDestination);
- } else {
- return defaultValue;
- }
- }
}
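
The rewritten loop above rolls over to a new PartitionData whenever adding another file would exceed Write.MAX_NUM_FILES or Write.MAX_SIZE_BYTES. A tiny standalone simulation of that rollover rule, using made-up limits (the real constants are not shown in this excerpt):

import java.util.ArrayList;
import java.util.List;

public class PartitionRolloverDemo {
  static final int MAX_NUM_FILES = 3;
  static final long MAX_SIZE_BYTES = 100L;

  public static void main(String[] args) {
    long[] fileSizes = {40, 40, 40, 10, 10};  // the third file would exceed the byte limit
    List<List<Long>> partitions = new ArrayList<>();
    partitions.add(new ArrayList<Long>());
    int numFiles = 0;
    long byteSize = 0;
    for (long size : fileSizes) {
      if (numFiles + 1 > MAX_NUM_FILES || byteSize + size > MAX_SIZE_BYTES) {
        partitions.add(new ArrayList<Long>());  // roll over to a new partition
        numFiles = 0;
        byteSize = 0;
      }
      partitions.get(partitions.size() - 1).add(size);
      numFiles++;
      byteSize += size;
    }
    System.out.println(partitions);  // [[40, 40], [40, 10, 10]]
  }
}
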
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteRename.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteRename.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteRename.java
index 752e7d3..9b1c989 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteRename.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteRename.java
@@ -89,8 +89,9 @@ class WriteRename extends DoFn<String, Void> {
}
// Make sure each destination table gets a unique job id.
- String jobIdPrefix = String.format(
- c.sideInput(jobIdToken) + "0x%08x", finalTableDestination.hashCode());
+ String jobIdPrefix = BigQueryHelpers.createJobId(
+ c.sideInput(jobIdToken), finalTableDestination, -1);
+
copy(
bqServices.getJobService(c.getPipelineOptions().as(BigQueryOptions.class)),
bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class)),
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteTables.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteTables.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteTables.java
index f7fe87b..4a6cd2b 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteTables.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteTables.java
@@ -57,11 +57,15 @@ import org.slf4j.LoggerFactory;
/**
* Writes partitions to BigQuery tables.
*
- * <p>The input is a list of files corresponding to a partition of a table. These files are
+ * <p>The input is a list of files corresponding to each partition of a table. These files are
loaded into a temporary table (or into the final table if there is only one partition). The output
- * is a {@link KV} mapping the final table to the temporary tables for each partition of that table.
+ * is a {@link KV} mapping each final table to a list of the temporary tables containing its data.
+ *
+ * <p>In the case where all the data in the files fit into a single load job, this transform loads
+ * the data directly into the final table, skipping temporary tables. In this case, the output
+ * {@link KV} maps the final table to itself.
*/
-class WriteTables extends DoFn<KV<ShardedKey<TableDestination>, Iterable<List<String>>>,
+class WriteTables extends DoFn<KV<ShardedKey<TableDestination>, List<String>>,
KV<TableDestination, String>> {
private static final Logger LOG = LoggerFactory.getLogger(WriteTables.class);
@@ -94,10 +98,9 @@ class WriteTables extends DoFn<KV<ShardedKey<TableDestination>, Iterable<List<St
public void processElement(ProcessContext c) throws Exception {
TableDestination tableDestination = c.element().getKey().getKey();
Integer partition = c.element().getKey().getShardNumber();
- List<String> partitionFiles = Lists.newArrayList(c.element().getValue()).get(0);
- // Job ID must be different for each partition of each table.
- String jobIdPrefix = String.format(
- c.sideInput(jobIdToken) + "_0x%08x_%05d", tableDestination.hashCode(), partition);
+ List<String> partitionFiles = Lists.newArrayList(c.element().getValue());
+ String jobIdPrefix = BigQueryHelpers.createJobId(
+ c.sideInput(jobIdToken), tableDestination, partition);
TableReference ref = tableDestination.getTableReference();
if (!singlePartition) {
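
Both this hunk and the WriteRename hunk above replace an inline String.format with BigQueryHelpers.createJobId, so each (table, partition) pair gets its own deterministic job id. The helper's body is not part of this mail; the following is only a guess at its shape, mirroring the format string it replaces:

import org.apache.beam.sdk.io.gcp.bigquery.TableDestination;

class JobIdSketch {
  // Hypothetical sketch only; the real BigQueryHelpers.createJobId is not shown in this excerpt.
  static String createJobId(String prefix, TableDestination tableDestination, int partition) {
    // Deterministic per (table, partition), so retried work reuses the same BigQuery job id.
    return String.format("%s_0x%08x_%05d", prefix, tableDestination.hashCode(), partition);
  }
}
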
http://git-wip-us.apache.org/repos/asf/beam/blob/7d13061c/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
index f10be13..d0004e4 100644
--- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
@@ -71,6 +71,7 @@ import org.apache.beam.sdk.coders.AtomicCoder;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.Coder.Context;
import org.apache.beam.sdk.coders.CoderException;
+import org.apache.beam.sdk.coders.IterableCoder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.TableRowJsonCoder;
@@ -122,7 +123,6 @@ import org.apache.beam.sdk.util.WindowingStrategy;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.beam.sdk.values.PDone;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TypeDescriptor;
import org.apache.beam.sdk.values.ValueInSingleWindow;
@@ -607,13 +607,11 @@ public class BigQueryIOTest implements Serializable {
}
@Test
- @Category(NeedsRunner.class)
public void testStreamingWriteWithDynamicTables() throws Exception {
testWriteWithDynamicTables(true);
}
@Test
- @Category(NeedsRunner.class)
public void testBatchWriteWithDynamicTables() throws Exception {
testWriteWithDynamicTables(false);
}
@@ -842,7 +840,7 @@ public class BigQueryIOTest implements Serializable {
BigQueryIO.writeTableRows().to("foo.com:project:somedataset.sometable");
checkWriteObject(
write, "foo.com:project", "somedataset", "sometable",
- null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY, null);
+ null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY, "");
}
@Test
@@ -894,7 +892,7 @@ public class BigQueryIOTest implements Serializable {
null,
CreateDisposition.CREATE_IF_NEEDED,
WriteDisposition.WRITE_EMPTY,
- null,
+ "",
false);
}
@@ -905,7 +903,7 @@ public class BigQueryIOTest implements Serializable {
checkWriteObject(
write, null, "somedataset", "sometable",
null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY,
- null);
+ "");
}
@Test
@@ -917,7 +915,7 @@ public class BigQueryIOTest implements Serializable {
BigQueryIO.Write<TableRow> write = BigQueryIO.writeTableRows().to(table);
checkWriteObject(
write, "foo.com:project", "somedataset", "sometable",
- null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY, null);
+ null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY, "");
}
@Test
@@ -927,7 +925,7 @@ public class BigQueryIOTest implements Serializable {
BigQueryIO.<TableRow>write().to("foo.com:project:somedataset.sometable").withSchema(schema);
checkWriteObject(
write, "foo.com:project", "somedataset", "sometable",
- schema, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY, null);
+ schema, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY, "");
}
@Test
@@ -937,7 +935,7 @@ public class BigQueryIOTest implements Serializable {
.withCreateDisposition(CreateDisposition.CREATE_NEVER);
checkWriteObject(
write, "foo.com:project", "somedataset", "sometable",
- null, CreateDisposition.CREATE_NEVER, WriteDisposition.WRITE_EMPTY, null);
+ null, CreateDisposition.CREATE_NEVER, WriteDisposition.WRITE_EMPTY, "");
}
@Test
@@ -947,7 +945,7 @@ public class BigQueryIOTest implements Serializable {
.withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED);
checkWriteObject(
write, "foo.com:project", "somedataset", "sometable",
- null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY, null);
+ null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY, "");
}
@Test
@@ -957,7 +955,7 @@ public class BigQueryIOTest implements Serializable {
.withWriteDisposition(WriteDisposition.WRITE_TRUNCATE);
checkWriteObject(
write, "foo.com:project", "somedataset", "sometable",
- null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_TRUNCATE, null);
+ null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_TRUNCATE, "");
}
@Test
@@ -967,7 +965,7 @@ public class BigQueryIOTest implements Serializable {
.withWriteDisposition(WriteDisposition.WRITE_APPEND);
checkWriteObject(
write, "foo.com:project", "somedataset", "sometable",
- null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_APPEND, null);
+ null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_APPEND, "");
}
@Test
@@ -977,7 +975,7 @@ public class BigQueryIOTest implements Serializable {
.withWriteDisposition(WriteDisposition.WRITE_EMPTY);
checkWriteObject(
write, "foo.com:project", "somedataset", "sometable",
- null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY, null);
+ null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY, "");
}
@Test
@@ -1359,7 +1357,6 @@ public class BigQueryIOTest implements Serializable {
SourceTestUtils.assertSplitAtFractionBehavior(
bqSource, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
-
List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
assertEquals(2, sources.size());
BoundedSource<TableRow> actual = sources.get(0);
@@ -1626,9 +1623,11 @@ public class BigQueryIOTest implements Serializable {
TupleTag<KV<ShardedKey<TableDestination>, List<String>>> singlePartitionTag =
new TupleTag<KV<ShardedKey<TableDestination>, List<String>>>("singlePartitionTag") {};
+ PCollection<WriteBundlesToFiles.Result> filesPCollection =
+ p.apply(Create.of(files).withType(new TypeDescriptor<WriteBundlesToFiles.Result>() {}));
PCollectionView<Iterable<WriteBundlesToFiles.Result>> resultsView =
PCollectionViews.iterableView(
- p,
+ filesPCollection,
WindowingStrategy.globalDefault(),
WriteBundlesToFiles.ResultCoder.of());
@@ -1699,14 +1698,12 @@ public class BigQueryIOTest implements Serializable {
Path baseDir = Files.createTempDirectory(tempFolder, "testWriteTables");
- List<KV<ShardedKey<TableDestination>, Iterable<List<String>>>> partitions =
- Lists.newArrayList();
+ List<KV<ShardedKey<TableDestination>, List<String>>> partitions = Lists.newArrayList();
for (int i = 0; i < numTables; ++i) {
String tableName = String.format("project-id:dataset-id.table%05d", i);
TableDestination tableDestination = new TableDestination(tableName, tableName);
for (int j = 0; j < numPartitions; ++j) {
- String tempTableId = String.format(
- jobIdToken + "_0x%08x_%05d", tableDestination.hashCode(), j);
+ String tempTableId = BigQueryHelpers.createJobId(jobIdToken, tableDestination, j);
List<String> filesPerPartition = Lists.newArrayList();
for (int k = 0; k < numFilesPerPartition; ++k) {
String filename = Paths.get(baseDir.toString(),
@@ -1721,7 +1718,7 @@ public class BigQueryIOTest implements Serializable {
filesPerPartition.add(filename);
}
partitions.add(KV.of(ShardedKey.of(tableDestination, j),
- (Iterable<List<String>>) Collections.singleton(filesPerPartition)));
+ filesPerPartition));
List<String> expectedTables = expectedTempTables.get(tableDestination);
if (expectedTables == null) {
@@ -1735,11 +1732,6 @@ public class BigQueryIOTest implements Serializable {
}
}
- PCollection<String> expectedTempTablesPCollection = p.apply(Create.of(expectedTempTables));
- PCollectionView<Iterable<String>> tempTablesView = PCollectionViews.iterableView(
- expectedTempTablesPCollection,
- WindowingStrategy.globalDefault(),
- StringUtf8Coder.of());
PCollection<String> jobIdTokenCollection = p.apply("CreateJobId", Create.of("jobId"));
PCollectionView<String> jobIdTokenView =
jobIdTokenCollection.apply(View.<String>asSingleton());
@@ -1753,10 +1745,10 @@ public class BigQueryIOTest implements Serializable {
CreateDisposition.CREATE_IF_NEEDED,
null);
- DoFnTester<KV<ShardedKey<TableDestination>, Iterable<List<String>>>,
+ DoFnTester<KV<ShardedKey<TableDestination>, List<String>>,
KV<TableDestination, String>> tester = DoFnTester.of(writeTables);
tester.setSideInput(jobIdTokenView, GlobalWindow.INSTANCE, jobIdToken);
- for (KV<ShardedKey<TableDestination>, Iterable<List<String>>> partition : partitions) {
+ for (KV<ShardedKey<TableDestination>, List<String>> partition : partitions) {
tester.processElement(partition);
}
@@ -1848,11 +1840,27 @@ public class BigQueryIOTest implements Serializable {
}
}
+ PCollection<KV<TableDestination, String>> tempTablesPCollection =
+ p.apply(Create.of(tempTables)
+ .withCoder(KvCoder.of(TableDestinationCoder.of(),
+ IterableCoder.of(StringUtf8Coder.of()))))
+ .apply(ParDo.of(new DoFn<KV<TableDestination, Iterable<String>>,
+ KV<TableDestination, String>>() {
+ @ProcessElement
+ public void processElement(ProcessContext c) {
+ TableDestination tableDestination = c.element().getKey();
+ for (String tempTable : c.element().getValue()) {
+ c.output(KV.of(tableDestination, tempTable));
+ }
+ }
+ }));
+
PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView =
PCollectionViews.multimapView(
- p,
+ tempTablesPCollection,
WindowingStrategy.globalDefault(),
- KvCoder.of(TableDestinationCoder.of(), StringUtf8Coder.of()));
+ KvCoder.of(TableDestinationCoder.of(),
+ StringUtf8Coder.of()));
PCollection<String> jobIdTokenCollection = p.apply("CreateJobId", Create.of("jobId"));
PCollectionView<String> jobIdTokenView =
[10/50] [abbrv] beam git commit: Add PrepareWrite transform.
Posted by dh...@apache.org.
Add PrepareWrite transform.
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/67a5f827
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/67a5f827
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/67a5f827
Branch: refs/heads/DSL_SQL
Commit: 67a5f82706e52fe025b63aa2e9652368f22c8344
Parents: c939a43
Author: Reuven Lax <re...@google.com>
Authored: Tue Mar 28 12:53:27 2017 -0700
Committer: Eugene Kirpichov <ki...@google.com>
Committed: Tue Apr 18 21:12:49 2017 -0700
----------------------------------------------------------------------
.../beam/sdk/io/gcp/bigquery/PrepareWrite.java | 58 ++++++++++++++++++++
.../sdk/io/gcp/bigquery/TableDestination.java | 7 +++
.../gcp/bigquery/TagWithUniqueIdsAndTable.java | 15 ++---
3 files changed, 69 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/67a5f827/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/PrepareWrite.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/PrepareWrite.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/PrepareWrite.java
new file mode 100644
index 0000000..0c08e18
--- /dev/null
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/PrepareWrite.java
@@ -0,0 +1,58 @@
+package org.apache.beam.sdk.io.gcp.bigquery;
+
+import com.google.api.services.bigquery.model.TableReference;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.common.base.Strings;
+import java.io.IOException;
+import org.apache.beam.sdk.options.BigQueryOptions;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.SerializableFunction;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.ValueInSingleWindow;
+
+/**
+ * Prepares an input {@link PCollection<T>} for writing to BigQuery. Uses the table-reference
+ * function to determine which table each element is written to, and formats the element into a
+ * {@link TableRow} using the user-supplied format function.
+ */
+public class PrepareWrite<T> extends PTransform<PCollection<T>, PCollection<KV<String, TableRow>>> {
+ private static final String NAME = "PrepareWrite";
+ private SerializableFunction<ValueInSingleWindow<T>, TableReference> tableRefFunction;
+ private SerializableFunction<T, TableRow> formatFunction;
+
+ public PrepareWrite(SerializableFunction<ValueInSingleWindow<T>, TableReference> tableRefFunction,
+ SerializableFunction<T, TableRow> formatFunction) {
+ super(NAME);
+ this.tableRefFunction = tableRefFunction;
+ this.formatFunction = formatFunction;
+ }
+
+ @Override
+ public PCollection<KV<String, TableRow>> expand(PCollection<T> input) {
+ PCollection<KV<String, TableRow>> elementsByTable =
+ input.apply(ParDo.of(new DoFn<T, KV<String, TableRow>>() {
+ @ProcessElement
+ public void processElement(ProcessContext context, BoundedWindow window) throws IOException {
+ String tableSpec = tableSpecFromWindowedValue(
+ context.getPipelineOptions().as(BigQueryOptions.class),
+ ValueInSingleWindow.of(context.element(), context.timestamp(), window, context.pane()));
+ TableRow tableRow = formatFunction.apply(context.element());
+ context.output(KV.of(tableSpec, tableRow));
+ }
+ }));
+ return elementsByTable;
+ }
+
+ private String tableSpecFromWindowedValue(BigQueryOptions options,
+ ValueInSingleWindow<T> value) {
+ TableReference table = tableRefFunction.apply(value);
+ if (Strings.isNullOrEmpty(table.getProjectId())) {
+ table.setProjectId(options.getProject());
+ }
+ return BigQueryHelpers.toTableSpec(table);
+ }
+}
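
A hypothetical usage sketch of the new transform; the pipeline, element type, table names, and field names below are invented for illustration and are not part of the commit:

import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.bigquery.PrepareWrite;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.ValueInSingleWindow;

public class PrepareWriteExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create();
    PCollection<KV<String, TableRow>> tagged =
        p.apply(Create.of("alpha", "beta"))
         .apply(new PrepareWrite<String>(
             new SerializableFunction<ValueInSingleWindow<String>, TableReference>() {
               @Override
               public TableReference apply(ValueInSingleWindow<String> value) {
                 // Route each word to a per-letter table.
                 return new TableReference()
                     .setProjectId("my-project")
                     .setDatasetId("my_dataset")
                     .setTableId("words_" + value.getValue().substring(0, 1));
               }
             },
             new SerializableFunction<String, TableRow>() {
               @Override
               public TableRow apply(String word) {
                 return new TableRow().set("word", word);
               }
             }));
    p.run();
  }
}
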
http://git-wip-us.apache.org/repos/asf/beam/blob/67a5f827/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
new file mode 100644
index 0000000..3cbbf3b
--- /dev/null
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
@@ -0,0 +1,7 @@
+package org.apache.beam.sdk.io.gcp.bigquery;
+
+/**
+ * Encapsulates a BigQuery table destination.
+ */
+public class TableDestination {
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/67a5f827/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIdsAndTable.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIdsAndTable.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIdsAndTable.java
index 8d7d1e6..4e50f7c 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIdsAndTable.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIdsAndTable.java
@@ -73,9 +73,9 @@ class TagWithUniqueIdsAndTable<T>
public void processElement(ProcessContext context, BoundedWindow window) throws IOException {
String uniqueId = randomUUID + sequenceNo++;
ThreadLocalRandom randomGenerator = ThreadLocalRandom.current();
- String tableSpec = tableSpecFromWindowedValue(
- context.getPipelineOptions().as(BigQueryOptions.class),
- ValueInSingleWindow.of(context.element(), context.timestamp(), window, context.pane()));
+ String tableSpec = tableSpecFromWindowedValue(
+ context.getPipelineOptions().as(BigQueryOptions.class),
+ ValueInSingleWindow.of(context.element(), context.timestamp(), window, context.pane()));
// We output on keys 0-50 to ensure that there's enough batching for
// BigQuery.
context.output(KV.of(ShardedKey.of(tableSpec, randomGenerator.nextInt(0, 50)),
@@ -97,12 +97,5 @@ class TagWithUniqueIdsAndTable<T>
}
- private String tableSpecFromWindowedValue(BigQueryOptions options,
- ValueInSingleWindow<T> value) {
- TableReference table = write.getTableRefFunction().apply(value);
- if (Strings.isNullOrEmpty(table.getProjectId())) {
- table.setProjectId(options.getProject());
- }
- return BigQueryHelpers.toTableSpec(table);
- }
+
}
[34/50] [abbrv] beam git commit: [BEAM-1994] Remove Flink examples package
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkKeyGroupStateInternals.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkKeyGroupStateInternals.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkKeyGroupStateInternals.java
deleted file mode 100644
index 24b340e..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkKeyGroupStateInternals.java
+++ /dev/null
@@ -1,487 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming.state;
-
-import static org.apache.flink.util.Preconditions.checkArgument;
-
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import org.apache.beam.runners.core.StateInternals;
-import org.apache.beam.runners.core.StateNamespace;
-import org.apache.beam.runners.core.StateTag;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.coders.Coder.Context;
-import org.apache.beam.sdk.coders.CoderException;
-import org.apache.beam.sdk.coders.ListCoder;
-import org.apache.beam.sdk.coders.StringUtf8Coder;
-import org.apache.beam.sdk.transforms.Combine;
-import org.apache.beam.sdk.transforms.CombineWithContext;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
-import org.apache.beam.sdk.util.CoderUtils;
-import org.apache.beam.sdk.util.state.BagState;
-import org.apache.beam.sdk.util.state.CombiningState;
-import org.apache.beam.sdk.util.state.MapState;
-import org.apache.beam.sdk.util.state.ReadableState;
-import org.apache.beam.sdk.util.state.SetState;
-import org.apache.beam.sdk.util.state.State;
-import org.apache.beam.sdk.util.state.StateContext;
-import org.apache.beam.sdk.util.state.StateContexts;
-import org.apache.beam.sdk.util.state.ValueState;
-import org.apache.beam.sdk.util.state.WatermarkHoldState;
-import org.apache.flink.api.java.tuple.Tuple2;
-import org.apache.flink.runtime.state.KeyGroupsList;
-import org.apache.flink.runtime.state.KeyedStateBackend;
-import org.apache.flink.streaming.api.operators.HeapInternalTimerService;
-import org.apache.flink.util.InstantiationUtil;
-import org.apache.flink.util.Preconditions;
-
-/**
- * {@link StateInternals} that uses {@link KeyGroupCheckpointedOperator}
- * to checkpoint state.
- *
- * <p>Note:
- * Ignore index of key.
- * Just implement BagState.
- *
- * <p>Reference from {@link HeapInternalTimerService} to the local key-group range.
- */
-public class FlinkKeyGroupStateInternals<K> implements StateInternals<K> {
-
- private final Coder<K> keyCoder;
- private final KeyGroupsList localKeyGroupRange;
- private KeyedStateBackend keyedStateBackend;
- private final int localKeyGroupRangeStartIdx;
-
- // stateName -> namespace -> (valueCoder, value)
- private final Map<String, Tuple2<Coder<?>, Map<String, ?>>>[] stateTables;
-
- public FlinkKeyGroupStateInternals(
- Coder<K> keyCoder,
- KeyedStateBackend keyedStateBackend) {
- this.keyCoder = keyCoder;
- this.keyedStateBackend = keyedStateBackend;
- this.localKeyGroupRange = keyedStateBackend.getKeyGroupRange();
- // find the starting index of the local key-group range
- int startIdx = Integer.MAX_VALUE;
- for (Integer keyGroupIdx : localKeyGroupRange) {
- startIdx = Math.min(keyGroupIdx, startIdx);
- }
- this.localKeyGroupRangeStartIdx = startIdx;
- stateTables = (Map<String, Tuple2<Coder<?>, Map<String, ?>>>[])
- new Map[localKeyGroupRange.getNumberOfKeyGroups()];
- for (int i = 0; i < stateTables.length; i++) {
- stateTables[i] = new HashMap<>();
- }
- }
-
- @Override
- public K getKey() {
- ByteBuffer keyBytes = (ByteBuffer) keyedStateBackend.getCurrentKey();
- try {
- return CoderUtils.decodeFromByteArray(keyCoder, keyBytes.array());
- } catch (CoderException e) {
- throw new RuntimeException("Error decoding key.", e);
- }
- }
-
- @Override
- public <T extends State> T state(
- final StateNamespace namespace,
- StateTag<? super K, T> address) {
-
- return state(namespace, address, StateContexts.nullContext());
- }
-
- @Override
- public <T extends State> T state(
- final StateNamespace namespace,
- StateTag<? super K, T> address,
- final StateContext<?> context) {
-
- return address.bind(new StateTag.StateBinder<K>() {
-
- @Override
- public <T> ValueState<T> bindValue(
- StateTag<? super K, ValueState<T>> address,
- Coder<T> coder) {
- throw new UnsupportedOperationException(
- String.format("%s is not supported", ValueState.class.getSimpleName()));
- }
-
- @Override
- public <T> BagState<T> bindBag(
- StateTag<? super K, BagState<T>> address,
- Coder<T> elemCoder) {
-
- return new FlinkKeyGroupBagState<>(address, namespace, elemCoder);
- }
-
- @Override
- public <T> SetState<T> bindSet(
- StateTag<? super K, SetState<T>> address,
- Coder<T> elemCoder) {
- throw new UnsupportedOperationException(
- String.format("%s is not supported", SetState.class.getSimpleName()));
- }
-
- @Override
- public <KeyT, ValueT> MapState<KeyT, ValueT> bindMap(
- StateTag<? super K, MapState<KeyT, ValueT>> spec,
- Coder<KeyT> mapKeyCoder, Coder<ValueT> mapValueCoder) {
- throw new UnsupportedOperationException(
- String.format("%s is not supported", MapState.class.getSimpleName()));
- }
-
- @Override
- public <InputT, AccumT, OutputT>
- CombiningState<InputT, AccumT, OutputT>
- bindCombiningValue(
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- Combine.CombineFn<InputT, AccumT, OutputT> combineFn) {
- throw new UnsupportedOperationException("bindCombiningValue is not supported.");
- }
-
- @Override
- public <InputT, AccumT, OutputT>
- CombiningState<InputT, AccumT, OutputT> bindKeyedCombiningValue(
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- final Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn) {
- throw new UnsupportedOperationException("bindKeyedCombiningValue is not supported.");
-
- }
-
- @Override
- public <InputT, AccumT, OutputT>
- CombiningState<InputT, AccumT, OutputT> bindKeyedCombiningValueWithContext(
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- CombineWithContext.KeyedCombineFnWithContext<
- ? super K, InputT, AccumT, OutputT> combineFn) {
- throw new UnsupportedOperationException(
- "bindKeyedCombiningValueWithContext is not supported.");
- }
-
- @Override
- public <W extends BoundedWindow> WatermarkHoldState<W> bindWatermark(
- StateTag<? super K, WatermarkHoldState<W>> address,
- OutputTimeFn<? super W> outputTimeFn) {
- throw new UnsupportedOperationException(
- String.format("%s is not supported", CombiningState.class.getSimpleName()));
- }
- });
- }
-
- /**
- * Reference from {@link Combine.CombineFn}.
- *
- * <p>Accumulators are stored in each KeyGroup, call addInput() when a element comes,
- * call extractOutput() to produce the desired value when need to read data.
- */
- interface KeyGroupCombiner<InputT, AccumT, OutputT> {
-
- /**
- * Returns a new, mutable accumulator value, representing the accumulation
- * of zero input values.
- */
- AccumT createAccumulator();
-
- /**
- * Adds the given input value to the given accumulator, returning the
- * new accumulator value.
- */
- AccumT addInput(AccumT accumulator, InputT input);
-
- /**
- * Returns the output value that is the result of all accumulators from KeyGroups
- * that are assigned to this operator.
- */
- OutputT extractOutput(Iterable<AccumT> accumulators);
- }
-
- private abstract class AbstractKeyGroupState<InputT, AccumT, OutputT> {
-
- private String stateName;
- private String namespace;
- private Coder<AccumT> coder;
- private KeyGroupCombiner<InputT, AccumT, OutputT> keyGroupCombiner;
-
- AbstractKeyGroupState(
- String stateName,
- String namespace,
- Coder<AccumT> coder,
- KeyGroupCombiner<InputT, AccumT, OutputT> keyGroupCombiner) {
- this.stateName = stateName;
- this.namespace = namespace;
- this.coder = coder;
- this.keyGroupCombiner = keyGroupCombiner;
- }
-
- /**
- * Choose keyGroup of input and addInput to accumulator.
- */
- void addInput(InputT input) {
- int keyGroupIdx = keyedStateBackend.getCurrentKeyGroupIndex();
- int localIdx = getIndexForKeyGroup(keyGroupIdx);
- Map<String, Tuple2<Coder<?>, Map<String, ?>>> stateTable = stateTables[localIdx];
- Tuple2<Coder<?>, Map<String, ?>> tuple2 = stateTable.get(stateName);
- if (tuple2 == null) {
- tuple2 = new Tuple2<>();
- tuple2.f0 = coder;
- tuple2.f1 = new HashMap<>();
- stateTable.put(stateName, tuple2);
- }
- Map<String, AccumT> map = (Map<String, AccumT>) tuple2.f1;
- AccumT accumulator = map.get(namespace);
- if (accumulator == null) {
- accumulator = keyGroupCombiner.createAccumulator();
- }
- accumulator = keyGroupCombiner.addInput(accumulator, input);
- map.put(namespace, accumulator);
- }
-
- /**
- * Get all accumulators and invoke extractOutput().
- */
- OutputT extractOutput() {
- List<AccumT> accumulators = new ArrayList<>(stateTables.length);
- for (Map<String, Tuple2<Coder<?>, Map<String, ?>>> stateTable : stateTables) {
- Tuple2<Coder<?>, Map<String, ?>> tuple2 = stateTable.get(stateName);
- if (tuple2 != null) {
- AccumT accumulator = (AccumT) tuple2.f1.get(namespace);
- if (accumulator != null) {
- accumulators.add(accumulator);
- }
- }
- }
- return keyGroupCombiner.extractOutput(accumulators);
- }
-
- /**
- * Find the first accumulator and return immediately.
- */
- boolean isEmptyInternal() {
- for (Map<String, Tuple2<Coder<?>, Map<String, ?>>> stateTable : stateTables) {
- Tuple2<Coder<?>, Map<String, ?>> tuple2 = stateTable.get(stateName);
- if (tuple2 != null) {
- AccumT accumulator = (AccumT) tuple2.f1.get(namespace);
- if (accumulator != null) {
- return false;
- }
- }
- }
- return true;
- }
-
- /**
- * Clear accumulators and clean empty map.
- */
- void clearInternal() {
- for (Map<String, Tuple2<Coder<?>, Map<String, ?>>> stateTable : stateTables) {
- Tuple2<Coder<?>, Map<String, ?>> tuple2 = stateTable.get(stateName);
- if (tuple2 != null) {
- tuple2.f1.remove(namespace);
- if (tuple2.f1.size() == 0) {
- stateTable.remove(stateName);
- }
- }
- }
- }
-
- }
-
- private int getIndexForKeyGroup(int keyGroupIdx) {
- checkArgument(localKeyGroupRange.contains(keyGroupIdx),
- "Key Group " + keyGroupIdx + " does not belong to the local range.");
- return keyGroupIdx - this.localKeyGroupRangeStartIdx;
- }
-
- private class KeyGroupBagCombiner<T> implements KeyGroupCombiner<T, List<T>, Iterable<T>> {
-
- @Override
- public List<T> createAccumulator() {
- return new ArrayList<>();
- }
-
- @Override
- public List<T> addInput(List<T> accumulator, T input) {
- accumulator.add(input);
- return accumulator;
- }
-
- @Override
- public Iterable<T> extractOutput(Iterable<List<T>> accumulators) {
- List<T> result = new ArrayList<>();
- // maybe can return an unmodifiable view.
- for (List<T> list : accumulators) {
- result.addAll(list);
- }
- return result;
- }
- }
-
- private class FlinkKeyGroupBagState<T> extends AbstractKeyGroupState<T, List<T>, Iterable<T>>
- implements BagState<T> {
-
- private final StateNamespace namespace;
- private final StateTag<? super K, BagState<T>> address;
-
- FlinkKeyGroupBagState(
- StateTag<? super K, BagState<T>> address,
- StateNamespace namespace,
- Coder<T> coder) {
- super(address.getId(), namespace.stringKey(), ListCoder.of(coder),
- new KeyGroupBagCombiner<T>());
- this.namespace = namespace;
- this.address = address;
- }
-
- @Override
- public void add(T input) {
- addInput(input);
- }
-
- @Override
- public BagState<T> readLater() {
- return this;
- }
-
- @Override
- public Iterable<T> read() {
- Iterable<T> result = extractOutput();
- return result != null ? result : Collections.<T>emptyList();
- }
-
- @Override
- public ReadableState<Boolean> isEmpty() {
- return new ReadableState<Boolean>() {
- @Override
- public Boolean read() {
- try {
- return isEmptyInternal();
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
-
- }
-
- @Override
- public ReadableState<Boolean> readLater() {
- return this;
- }
- };
- }
-
- @Override
- public void clear() {
- clearInternal();
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- FlinkKeyGroupBagState<?> that = (FlinkKeyGroupBagState<?>) o;
-
- return namespace.equals(that.namespace) && address.equals(that.address);
-
- }
-
- @Override
- public int hashCode() {
- int result = namespace.hashCode();
- result = 31 * result + address.hashCode();
- return result;
- }
- }
-
- /**
- * Snapshots the state {@code (stateName -> (valueCoder && (namespace -> value)))} for a given
- * {@code keyGroupIdx}.
- *
- * @param keyGroupIdx the id of the key-group to be put in the snapshot.
- * @param out the stream to write to.
- */
- public void snapshotKeyGroupState(int keyGroupIdx, DataOutputStream out) throws Exception {
- int localIdx = getIndexForKeyGroup(keyGroupIdx);
- Map<String, Tuple2<Coder<?>, Map<String, ?>>> stateTable = stateTables[localIdx];
- Preconditions.checkState(stateTable.size() <= Short.MAX_VALUE,
- "Too many States: " + stateTable.size() + ". Currently at most "
- + Short.MAX_VALUE + " states are supported");
- out.writeShort(stateTable.size());
- for (Map.Entry<String, Tuple2<Coder<?>, Map<String, ?>>> entry : stateTable.entrySet()) {
- out.writeUTF(entry.getKey());
- Coder coder = entry.getValue().f0;
- InstantiationUtil.serializeObject(out, coder);
- Map<String, ?> map = entry.getValue().f1;
- out.writeInt(map.size());
- for (Map.Entry<String, ?> entry1 : map.entrySet()) {
- StringUtf8Coder.of().encode(entry1.getKey(), out, Context.NESTED);
- coder.encode(entry1.getValue(), out, Context.NESTED);
- }
- }
- }
-
- /**
- * Restore the state {@code (stateName -> (valueCoder && (namespace -> value)))}
- * for a given {@code keyGroupIdx}.
- *
- * @param keyGroupIdx the id of the key-group to be put in the snapshot.
- * @param in the stream to read from.
- * @param userCodeClassLoader the class loader that will be used to deserialize
- * the valueCoder.
- */
- public void restoreKeyGroupState(int keyGroupIdx, DataInputStream in,
- ClassLoader userCodeClassLoader) throws Exception {
- int localIdx = getIndexForKeyGroup(keyGroupIdx);
- Map<String, Tuple2<Coder<?>, Map<String, ?>>> stateTable = stateTables[localIdx];
- int numStates = in.readShort();
- for (int i = 0; i < numStates; ++i) {
- String stateName = in.readUTF();
- Coder coder = InstantiationUtil.deserializeObject(in, userCodeClassLoader);
- Tuple2<Coder<?>, Map<String, ?>> tuple2 = stateTable.get(stateName);
- if (tuple2 == null) {
- tuple2 = new Tuple2<>();
- tuple2.f0 = coder;
- tuple2.f1 = new HashMap<>();
- stateTable.put(stateName, tuple2);
- }
- Map<String, Object> map = (Map<String, Object>) tuple2.f1;
- int mapSize = in.readInt();
- for (int j = 0; j < mapSize; j++) {
- String namespace = StringUtf8Coder.of().decode(in, Context.NESTED);
- Object value = coder.decode(in, Context.NESTED);
- map.put(namespace, value);
- }
- }
- }
-
-}
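
The KeyGroupCombiner interface deleted above documents the same create/add/extract accumulator pattern as Combine.CombineFn: build a partial result per key group, then merge all partials when the value is read. A self-contained sketch of that pattern with a simple sum (illustrative only, not Beam or Flink API):

import java.util.Arrays;

public class SumCombinerDemo {
  static Long createAccumulator() {
    return 0L;
  }

  static Long addInput(Long accumulator, long input) {
    return accumulator + input;
  }

  static Long extractOutput(Iterable<Long> accumulators) {
    long total = 0L;
    for (Long acc : accumulators) {
      total += acc;  // merge the per-key-group partial sums
    }
    return total;
  }

  public static void main(String[] args) {
    Long group0 = addInput(addInput(createAccumulator(), 1), 2);  // key-group 0
    Long group1 = addInput(createAccumulator(), 4);               // key-group 1
    System.out.println(extractOutput(Arrays.asList(group0, group1)));  // 7
  }
}
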
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkSplitStateInternals.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkSplitStateInternals.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkSplitStateInternals.java
deleted file mode 100644
index 2bf0bf1..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkSplitStateInternals.java
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming.state;
-
-import com.google.common.collect.Iterators;
-import java.util.Collections;
-import org.apache.beam.runners.core.StateInternals;
-import org.apache.beam.runners.core.StateNamespace;
-import org.apache.beam.runners.core.StateTag;
-import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.transforms.Combine;
-import org.apache.beam.sdk.transforms.CombineWithContext;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
-import org.apache.beam.sdk.util.state.BagState;
-import org.apache.beam.sdk.util.state.CombiningState;
-import org.apache.beam.sdk.util.state.MapState;
-import org.apache.beam.sdk.util.state.ReadableState;
-import org.apache.beam.sdk.util.state.SetState;
-import org.apache.beam.sdk.util.state.State;
-import org.apache.beam.sdk.util.state.StateContext;
-import org.apache.beam.sdk.util.state.StateContexts;
-import org.apache.beam.sdk.util.state.ValueState;
-import org.apache.beam.sdk.util.state.WatermarkHoldState;
-import org.apache.flink.api.common.ExecutionConfig;
-import org.apache.flink.api.common.state.ListStateDescriptor;
-import org.apache.flink.runtime.state.OperatorStateBackend;
-
-/**
- * {@link StateInternals} that uses a Flink {@link OperatorStateBackend}
- * to manage the split-distribute state.
- *
- * <p>Elements in ListState will be redistributed in round robin fashion
- * to operators when restarting with a different parallelism.
- *
- * <p>Note:
- * Ignore index of key and namespace.
- * Just implement BagState.
- */
-public class FlinkSplitStateInternals<K> implements StateInternals<K> {
-
- private final OperatorStateBackend stateBackend;
-
- public FlinkSplitStateInternals(OperatorStateBackend stateBackend) {
- this.stateBackend = stateBackend;
- }
-
- @Override
- public K getKey() {
- return null;
- }
-
- @Override
- public <T extends State> T state(
- final StateNamespace namespace,
- StateTag<? super K, T> address) {
-
- return state(namespace, address, StateContexts.nullContext());
- }
-
- @Override
- public <T extends State> T state(
- final StateNamespace namespace,
- StateTag<? super K, T> address,
- final StateContext<?> context) {
-
- return address.bind(new StateTag.StateBinder<K>() {
-
- @Override
- public <T> ValueState<T> bindValue(
- StateTag<? super K, ValueState<T>> address,
- Coder<T> coder) {
- throw new UnsupportedOperationException(
- String.format("%s is not supported", ValueState.class.getSimpleName()));
- }
-
- @Override
- public <T> BagState<T> bindBag(
- StateTag<? super K, BagState<T>> address,
- Coder<T> elemCoder) {
-
- return new FlinkSplitBagState<>(stateBackend, address, namespace, elemCoder);
- }
-
- @Override
- public <T> SetState<T> bindSet(
- StateTag<? super K, SetState<T>> address,
- Coder<T> elemCoder) {
- throw new UnsupportedOperationException(
- String.format("%s is not supported", SetState.class.getSimpleName()));
- }
-
- @Override
- public <KeyT, ValueT> MapState<KeyT, ValueT> bindMap(
- StateTag<? super K, MapState<KeyT, ValueT>> spec,
- Coder<KeyT> mapKeyCoder, Coder<ValueT> mapValueCoder) {
- throw new UnsupportedOperationException(
- String.format("%s is not supported", MapState.class.getSimpleName()));
- }
-
- @Override
- public <InputT, AccumT, OutputT>
- CombiningState<InputT, AccumT, OutputT>
- bindCombiningValue(
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- Combine.CombineFn<InputT, AccumT, OutputT> combineFn) {
- throw new UnsupportedOperationException("bindCombiningValue is not supported.");
- }
-
- @Override
- public <InputT, AccumT, OutputT>
- CombiningState<InputT, AccumT, OutputT> bindKeyedCombiningValue(
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- final Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn) {
- throw new UnsupportedOperationException("bindKeyedCombiningValue is not supported.");
-
- }
-
- @Override
- public <InputT, AccumT, OutputT>
- CombiningState<InputT, AccumT, OutputT> bindKeyedCombiningValueWithContext(
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- CombineWithContext.KeyedCombineFnWithContext<
- ? super K, InputT, AccumT, OutputT> combineFn) {
- throw new UnsupportedOperationException(
- "bindKeyedCombiningValueWithContext is not supported.");
- }
-
- @Override
- public <W extends BoundedWindow> WatermarkHoldState<W> bindWatermark(
- StateTag<? super K, WatermarkHoldState<W>> address,
- OutputTimeFn<? super W> outputTimeFn) {
- throw new UnsupportedOperationException(
- String.format("%s is not supported", CombiningState.class.getSimpleName()));
- }
- });
- }
-
- private static class FlinkSplitBagState<K, T> implements BagState<T> {
-
- private final ListStateDescriptor<T> descriptor;
- private OperatorStateBackend flinkStateBackend;
- private final StateNamespace namespace;
- private final StateTag<? super K, BagState<T>> address;
-
- FlinkSplitBagState(
- OperatorStateBackend flinkStateBackend,
- StateTag<? super K, BagState<T>> address,
- StateNamespace namespace,
- Coder<T> coder) {
- this.flinkStateBackend = flinkStateBackend;
- this.namespace = namespace;
- this.address = address;
-
- CoderTypeInformation<T> typeInfo =
- new CoderTypeInformation<>(coder);
-
- descriptor = new ListStateDescriptor<>(address.getId(),
- typeInfo.createSerializer(new ExecutionConfig()));
- }
-
- @Override
- public void add(T input) {
- try {
- flinkStateBackend.getOperatorState(descriptor).add(input);
- } catch (Exception e) {
- throw new RuntimeException("Error updating state.", e);
- }
- }
-
- @Override
- public BagState<T> readLater() {
- return this;
- }
-
- @Override
- public Iterable<T> read() {
- try {
- Iterable<T> result = flinkStateBackend.getOperatorState(descriptor).get();
- return result != null ? result : Collections.<T>emptyList();
- } catch (Exception e) {
- throw new RuntimeException("Error updating state.", e);
- }
- }
-
- @Override
- public ReadableState<Boolean> isEmpty() {
- return new ReadableState<Boolean>() {
- @Override
- public Boolean read() {
- try {
- Iterable<T> result = flinkStateBackend.getOperatorState(descriptor).get();
- // PartitionableListState.get() returns an empty collection when there is no element,
- // unlike keyed ListState, which returns null.
- return result == null || Iterators.size(result.iterator()) == 0;
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
-
- }
-
- @Override
- public ReadableState<Boolean> readLater() {
- return this;
- }
- };
- }
-
- @Override
- public void clear() {
- try {
- flinkStateBackend.getOperatorState(descriptor).clear();
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- FlinkSplitBagState<?, ?> that = (FlinkSplitBagState<?, ?>) o;
-
- return namespace.equals(that.namespace) && address.equals(that.address);
-
- }
-
- @Override
- public int hashCode() {
- int result = namespace.hashCode();
- result = 31 * result + address.hashCode();
- return result;
- }
- }
-
-}
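For readers following this removal: the class deleted above adapted Flink's non-keyed operator ListState to Beam's BagState, so that bag contents are redistributed round-robin when the job is rescaled. The sketch below is an editorial illustration distilled from the deleted code, not part of the Beam codebase or of this commit; the class name, the descriptor name "my-bag", and the String element type are assumptions made only for the example.

    // Minimal sketch, assuming the OperatorStateBackend calls used by the
    // deleted class: a bag of elements backed by operator (non-keyed) ListState.
    // The descriptor name "my-bag" and the String element type are illustrative.
    import java.util.Collections;
    import org.apache.flink.api.common.state.ListStateDescriptor;
    import org.apache.flink.api.common.typeutils.base.StringSerializer;
    import org.apache.flink.runtime.state.OperatorStateBackend;

    class SplitBagSketch {
      private final OperatorStateBackend backend;
      private final ListStateDescriptor<String> descriptor =
          new ListStateDescriptor<>("my-bag", StringSerializer.INSTANCE);

      SplitBagSketch(OperatorStateBackend backend) {
        this.backend = backend;
      }

      void add(String value) throws Exception {
        // Elements land in operator list state, which Flink redistributes
        // round-robin across instances when restarting with a new parallelism.
        backend.getOperatorState(descriptor).add(value);
      }

      Iterable<String> read() throws Exception {
        Iterable<String> result = backend.getOperatorState(descriptor).get();
        return result != null ? result : Collections.<String>emptyList();
      }

      void clear() throws Exception {
        backend.getOperatorState(descriptor).clear();
      }
    }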
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkStateInternals.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkStateInternals.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkStateInternals.java
deleted file mode 100644
index 4f961e5..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkStateInternals.java
+++ /dev/null
@@ -1,1053 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming.state;
-
-import com.google.common.collect.Lists;
-import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
-import org.apache.beam.runners.core.StateInternals;
-import org.apache.beam.runners.core.StateNamespace;
-import org.apache.beam.runners.core.StateTag;
-import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.coders.CoderException;
-import org.apache.beam.sdk.coders.InstantCoder;
-import org.apache.beam.sdk.transforms.Combine;
-import org.apache.beam.sdk.transforms.CombineWithContext;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
-import org.apache.beam.sdk.util.CoderUtils;
-import org.apache.beam.sdk.util.CombineContextFactory;
-import org.apache.beam.sdk.util.state.BagState;
-import org.apache.beam.sdk.util.state.CombiningState;
-import org.apache.beam.sdk.util.state.MapState;
-import org.apache.beam.sdk.util.state.ReadableState;
-import org.apache.beam.sdk.util.state.SetState;
-import org.apache.beam.sdk.util.state.State;
-import org.apache.beam.sdk.util.state.StateContext;
-import org.apache.beam.sdk.util.state.StateContexts;
-import org.apache.beam.sdk.util.state.ValueState;
-import org.apache.beam.sdk.util.state.WatermarkHoldState;
-import org.apache.flink.api.common.state.ListStateDescriptor;
-import org.apache.flink.api.common.state.ValueStateDescriptor;
-import org.apache.flink.api.common.typeutils.base.StringSerializer;
-import org.apache.flink.runtime.state.KeyedStateBackend;
-import org.joda.time.Instant;
-
-/**
- * {@link StateInternals} that uses a Flink {@link KeyedStateBackend} to manage state.
- *
- * <p>Note: In the Flink streaming runner the key is always encoded
- * using a {@link Coder} and stored in a {@link ByteBuffer}.
- */
-public class FlinkStateInternals<K> implements StateInternals<K> {
-
- private final KeyedStateBackend<ByteBuffer> flinkStateBackend;
- private Coder<K> keyCoder;
-
- // on recovery, these will not be properly set because we don't
- // know which watermark hold states there are in the Flink State Backend
- private final Map<String, Instant> watermarkHolds = new HashMap<>();
-
- public FlinkStateInternals(KeyedStateBackend<ByteBuffer> flinkStateBackend, Coder<K> keyCoder) {
- this.flinkStateBackend = flinkStateBackend;
- this.keyCoder = keyCoder;
- }
-
- /**
- * Returns the minimum over all watermark holds.
- */
- public Instant watermarkHold() {
- long min = Long.MAX_VALUE;
- for (Instant hold: watermarkHolds.values()) {
- min = Math.min(min, hold.getMillis());
- }
- return new Instant(min);
- }
-
- @Override
- public K getKey() {
- ByteBuffer keyBytes = flinkStateBackend.getCurrentKey();
- try {
- return CoderUtils.decodeFromByteArray(keyCoder, keyBytes.array());
- } catch (CoderException e) {
- throw new RuntimeException("Error decoding key.", e);
- }
- }
-
- @Override
- public <T extends State> T state(
- final StateNamespace namespace,
- StateTag<? super K, T> address) {
-
- return state(namespace, address, StateContexts.nullContext());
- }
-
- @Override
- public <T extends State> T state(
- final StateNamespace namespace,
- StateTag<? super K, T> address,
- final StateContext<?> context) {
-
- return address.bind(new StateTag.StateBinder<K>() {
-
- @Override
- public <T> ValueState<T> bindValue(
- StateTag<? super K, ValueState<T>> address,
- Coder<T> coder) {
-
- return new FlinkValueState<>(flinkStateBackend, address, namespace, coder);
- }
-
- @Override
- public <T> BagState<T> bindBag(
- StateTag<? super K, BagState<T>> address,
- Coder<T> elemCoder) {
-
- return new FlinkBagState<>(flinkStateBackend, address, namespace, elemCoder);
- }
-
- @Override
- public <T> SetState<T> bindSet(
- StateTag<? super K, SetState<T>> address,
- Coder<T> elemCoder) {
- throw new UnsupportedOperationException(
- String.format("%s is not supported", SetState.class.getSimpleName()));
- }
-
- @Override
- public <KeyT, ValueT> MapState<KeyT, ValueT> bindMap(
- StateTag<? super K, MapState<KeyT, ValueT>> spec,
- Coder<KeyT> mapKeyCoder, Coder<ValueT> mapValueCoder) {
- throw new UnsupportedOperationException(
- String.format("%s is not supported", MapState.class.getSimpleName()));
- }
-
- @Override
- public <InputT, AccumT, OutputT>
- CombiningState<InputT, AccumT, OutputT>
- bindCombiningValue(
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- Combine.CombineFn<InputT, AccumT, OutputT> combineFn) {
-
- return new FlinkCombiningState<>(
- flinkStateBackend, address, combineFn, namespace, accumCoder);
- }
-
- @Override
- public <InputT, AccumT, OutputT>
- CombiningState<InputT, AccumT, OutputT> bindKeyedCombiningValue(
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- final Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn) {
- return new FlinkKeyedCombiningState<>(
- flinkStateBackend,
- address,
- combineFn,
- namespace,
- accumCoder,
- FlinkStateInternals.this);
- }
-
- @Override
- public <InputT, AccumT, OutputT>
- CombiningState<InputT, AccumT, OutputT> bindKeyedCombiningValueWithContext(
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- CombineWithContext.KeyedCombineFnWithContext<
- ? super K, InputT, AccumT, OutputT> combineFn) {
- return new FlinkCombiningStateWithContext<>(
- flinkStateBackend,
- address,
- combineFn,
- namespace,
- accumCoder,
- FlinkStateInternals.this,
- CombineContextFactory.createFromStateContext(context));
- }
-
- @Override
- public <W extends BoundedWindow> WatermarkHoldState<W> bindWatermark(
- StateTag<? super K, WatermarkHoldState<W>> address,
- OutputTimeFn<? super W> outputTimeFn) {
-
- return new FlinkWatermarkHoldState<>(
- flinkStateBackend, FlinkStateInternals.this, address, namespace, outputTimeFn);
- }
- });
- }
-
- private static class FlinkValueState<K, T> implements ValueState<T> {
-
- private final StateNamespace namespace;
- private final StateTag<? super K, ValueState<T>> address;
- private final ValueStateDescriptor<T> flinkStateDescriptor;
- private final KeyedStateBackend<ByteBuffer> flinkStateBackend;
-
- FlinkValueState(
- KeyedStateBackend<ByteBuffer> flinkStateBackend,
- StateTag<? super K, ValueState<T>> address,
- StateNamespace namespace,
- Coder<T> coder) {
-
- this.namespace = namespace;
- this.address = address;
- this.flinkStateBackend = flinkStateBackend;
-
- CoderTypeInformation<T> typeInfo = new CoderTypeInformation<>(coder);
-
- flinkStateDescriptor = new ValueStateDescriptor<>(address.getId(), typeInfo, null);
- }
-
- @Override
- public void write(T input) {
- try {
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor).update(input);
- } catch (Exception e) {
- throw new RuntimeException("Error updating state.", e);
- }
- }
-
- @Override
- public ValueState<T> readLater() {
- return this;
- }
-
- @Override
- public T read() {
- try {
- return flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor).value();
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
- }
-
- @Override
- public void clear() {
- try {
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor).clear();
- } catch (Exception e) {
- throw new RuntimeException("Error clearing state.", e);
- }
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- FlinkValueState<?, ?> that = (FlinkValueState<?, ?>) o;
-
- return namespace.equals(that.namespace) && address.equals(that.address);
-
- }
-
- @Override
- public int hashCode() {
- int result = namespace.hashCode();
- result = 31 * result + address.hashCode();
- return result;
- }
- }
-
- private static class FlinkBagState<K, T> implements BagState<T> {
-
- private final StateNamespace namespace;
- private final StateTag<? super K, BagState<T>> address;
- private final ListStateDescriptor<T> flinkStateDescriptor;
- private final KeyedStateBackend<ByteBuffer> flinkStateBackend;
-
- FlinkBagState(
- KeyedStateBackend<ByteBuffer> flinkStateBackend,
- StateTag<? super K, BagState<T>> address,
- StateNamespace namespace,
- Coder<T> coder) {
-
- this.namespace = namespace;
- this.address = address;
- this.flinkStateBackend = flinkStateBackend;
-
- CoderTypeInformation<T> typeInfo = new CoderTypeInformation<>(coder);
-
- flinkStateDescriptor = new ListStateDescriptor<>(address.getId(), typeInfo);
- }
-
- @Override
- public void add(T input) {
- try {
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor).add(input);
- } catch (Exception e) {
- throw new RuntimeException("Error adding to bag state.", e);
- }
- }
-
- @Override
- public BagState<T> readLater() {
- return this;
- }
-
- @Override
- public Iterable<T> read() {
- try {
- Iterable<T> result = flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor).get();
-
- return result != null ? result : Collections.<T>emptyList();
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
- }
-
- @Override
- public ReadableState<Boolean> isEmpty() {
- return new ReadableState<Boolean>() {
- @Override
- public Boolean read() {
- try {
- Iterable<T> result = flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor).get();
- return result == null;
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
-
- }
-
- @Override
- public ReadableState<Boolean> readLater() {
- return this;
- }
- };
- }
-
- @Override
- public void clear() {
- try {
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor).clear();
- } catch (Exception e) {
- throw new RuntimeException("Error clearing state.", e);
- }
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- FlinkBagState<?, ?> that = (FlinkBagState<?, ?>) o;
-
- return namespace.equals(that.namespace) && address.equals(that.address);
-
- }
-
- @Override
- public int hashCode() {
- int result = namespace.hashCode();
- result = 31 * result + address.hashCode();
- return result;
- }
- }
-
- private static class FlinkCombiningState<K, InputT, AccumT, OutputT>
- implements CombiningState<InputT, AccumT, OutputT> {
-
- private final StateNamespace namespace;
- private final StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address;
- private final Combine.CombineFn<InputT, AccumT, OutputT> combineFn;
- private final ValueStateDescriptor<AccumT> flinkStateDescriptor;
- private final KeyedStateBackend<ByteBuffer> flinkStateBackend;
-
- FlinkCombiningState(
- KeyedStateBackend<ByteBuffer> flinkStateBackend,
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- Combine.CombineFn<InputT, AccumT, OutputT> combineFn,
- StateNamespace namespace,
- Coder<AccumT> accumCoder) {
-
- this.namespace = namespace;
- this.address = address;
- this.combineFn = combineFn;
- this.flinkStateBackend = flinkStateBackend;
-
- CoderTypeInformation<AccumT> typeInfo = new CoderTypeInformation<>(accumCoder);
-
- flinkStateDescriptor = new ValueStateDescriptor<>(address.getId(), typeInfo, null);
- }
-
- @Override
- public CombiningState<InputT, AccumT, OutputT> readLater() {
- return this;
- }
-
- @Override
- public void add(InputT value) {
- try {
- org.apache.flink.api.common.state.ValueState<AccumT> state =
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor);
-
- AccumT current = state.value();
- if (current == null) {
- current = combineFn.createAccumulator();
- }
- current = combineFn.addInput(current, value);
- state.update(current);
- } catch (Exception e) {
- throw new RuntimeException("Error adding to state." , e);
- }
- }
-
- @Override
- public void addAccum(AccumT accum) {
- try {
- org.apache.flink.api.common.state.ValueState<AccumT> state =
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor);
-
- AccumT current = state.value();
- if (current == null) {
- state.update(accum);
- } else {
- current = combineFn.mergeAccumulators(Lists.newArrayList(current, accum));
- state.update(current);
- }
- } catch (Exception e) {
- throw new RuntimeException("Error adding to state.", e);
- }
- }
-
- @Override
- public AccumT getAccum() {
- try {
- return flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor).value();
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
- }
-
- @Override
- public AccumT mergeAccumulators(Iterable<AccumT> accumulators) {
- return combineFn.mergeAccumulators(accumulators);
- }
-
- @Override
- public OutputT read() {
- try {
- org.apache.flink.api.common.state.ValueState<AccumT> state =
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor);
-
- AccumT accum = state.value();
- if (accum != null) {
- return combineFn.extractOutput(accum);
- } else {
- return combineFn.extractOutput(combineFn.createAccumulator());
- }
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
- }
-
- @Override
- public ReadableState<Boolean> isEmpty() {
- return new ReadableState<Boolean>() {
- @Override
- public Boolean read() {
- try {
- return flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor).value() == null;
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
-
- }
-
- @Override
- public ReadableState<Boolean> readLater() {
- return this;
- }
- };
- }
-
- @Override
- public void clear() {
- try {
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor).clear();
- } catch (Exception e) {
- throw new RuntimeException("Error clearing state.", e);
- }
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- FlinkCombiningState<?, ?, ?, ?> that =
- (FlinkCombiningState<?, ?, ?, ?>) o;
-
- return namespace.equals(that.namespace) && address.equals(that.address);
-
- }
-
- @Override
- public int hashCode() {
- int result = namespace.hashCode();
- result = 31 * result + address.hashCode();
- return result;
- }
- }
-
- private static class FlinkKeyedCombiningState<K, InputT, AccumT, OutputT>
- implements CombiningState<InputT, AccumT, OutputT> {
-
- private final StateNamespace namespace;
- private final StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address;
- private final Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn;
- private final ValueStateDescriptor<AccumT> flinkStateDescriptor;
- private final KeyedStateBackend<ByteBuffer> flinkStateBackend;
- private final FlinkStateInternals<K> flinkStateInternals;
-
- FlinkKeyedCombiningState(
- KeyedStateBackend<ByteBuffer> flinkStateBackend,
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn,
- StateNamespace namespace,
- Coder<AccumT> accumCoder,
- FlinkStateInternals<K> flinkStateInternals) {
-
- this.namespace = namespace;
- this.address = address;
- this.combineFn = combineFn;
- this.flinkStateBackend = flinkStateBackend;
- this.flinkStateInternals = flinkStateInternals;
-
- CoderTypeInformation<AccumT> typeInfo = new CoderTypeInformation<>(accumCoder);
-
- flinkStateDescriptor = new ValueStateDescriptor<>(address.getId(), typeInfo, null);
- }
-
- @Override
- public CombiningState<InputT, AccumT, OutputT> readLater() {
- return this;
- }
-
- @Override
- public void add(InputT value) {
- try {
- org.apache.flink.api.common.state.ValueState<AccumT> state =
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor);
-
- AccumT current = state.value();
- if (current == null) {
- current = combineFn.createAccumulator(flinkStateInternals.getKey());
- }
- current = combineFn.addInput(flinkStateInternals.getKey(), current, value);
- state.update(current);
- } catch (Exception e) {
- throw new RuntimeException("Error adding to state." , e);
- }
- }
-
- @Override
- public void addAccum(AccumT accum) {
- try {
- org.apache.flink.api.common.state.ValueState<AccumT> state =
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor);
-
- AccumT current = state.value();
- if (current == null) {
- state.update(accum);
- } else {
- current = combineFn.mergeAccumulators(
- flinkStateInternals.getKey(),
- Lists.newArrayList(current, accum));
- state.update(current);
- }
- } catch (Exception e) {
- throw new RuntimeException("Error adding to state.", e);
- }
- }
-
- @Override
- public AccumT getAccum() {
- try {
- return flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor).value();
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
- }
-
- @Override
- public AccumT mergeAccumulators(Iterable<AccumT> accumulators) {
- return combineFn.mergeAccumulators(flinkStateInternals.getKey(), accumulators);
- }
-
- @Override
- public OutputT read() {
- try {
- org.apache.flink.api.common.state.ValueState<AccumT> state =
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor);
-
- AccumT accum = state.value();
- if (accum != null) {
- return combineFn.extractOutput(flinkStateInternals.getKey(), accum);
- } else {
- return combineFn.extractOutput(
- flinkStateInternals.getKey(),
- combineFn.createAccumulator(flinkStateInternals.getKey()));
- }
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
- }
-
- @Override
- public ReadableState<Boolean> isEmpty() {
- return new ReadableState<Boolean>() {
- @Override
- public Boolean read() {
- try {
- return flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor).value() == null;
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
-
- }
-
- @Override
- public ReadableState<Boolean> readLater() {
- return this;
- }
- };
- }
-
- @Override
- public void clear() {
- try {
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor).clear();
- } catch (Exception e) {
- throw new RuntimeException("Error clearing state.", e);
- }
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- FlinkKeyedCombiningState<?, ?, ?, ?> that =
- (FlinkKeyedCombiningState<?, ?, ?, ?>) o;
-
- return namespace.equals(that.namespace) && address.equals(that.address);
-
- }
-
- @Override
- public int hashCode() {
- int result = namespace.hashCode();
- result = 31 * result + address.hashCode();
- return result;
- }
- }
-
- private static class FlinkCombiningStateWithContext<K, InputT, AccumT, OutputT>
- implements CombiningState<InputT, AccumT, OutputT> {
-
- private final StateNamespace namespace;
- private final StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address;
- private final CombineWithContext.KeyedCombineFnWithContext<
- ? super K, InputT, AccumT, OutputT> combineFn;
- private final ValueStateDescriptor<AccumT> flinkStateDescriptor;
- private final KeyedStateBackend<ByteBuffer> flinkStateBackend;
- private final FlinkStateInternals<K> flinkStateInternals;
- private final CombineWithContext.Context context;
-
- FlinkCombiningStateWithContext(
- KeyedStateBackend<ByteBuffer> flinkStateBackend,
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- CombineWithContext.KeyedCombineFnWithContext<
- ? super K, InputT, AccumT, OutputT> combineFn,
- StateNamespace namespace,
- Coder<AccumT> accumCoder,
- FlinkStateInternals<K> flinkStateInternals,
- CombineWithContext.Context context) {
-
- this.namespace = namespace;
- this.address = address;
- this.combineFn = combineFn;
- this.flinkStateBackend = flinkStateBackend;
- this.flinkStateInternals = flinkStateInternals;
- this.context = context;
-
- CoderTypeInformation<AccumT> typeInfo = new CoderTypeInformation<>(accumCoder);
-
- flinkStateDescriptor = new ValueStateDescriptor<>(address.getId(), typeInfo, null);
- }
-
- @Override
- public CombiningState<InputT, AccumT, OutputT> readLater() {
- return this;
- }
-
- @Override
- public void add(InputT value) {
- try {
- org.apache.flink.api.common.state.ValueState<AccumT> state =
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor);
-
- AccumT current = state.value();
- if (current == null) {
- current = combineFn.createAccumulator(flinkStateInternals.getKey(), context);
- }
- current = combineFn.addInput(flinkStateInternals.getKey(), current, value, context);
- state.update(current);
- } catch (Exception e) {
- throw new RuntimeException("Error adding to state." , e);
- }
- }
-
- @Override
- public void addAccum(AccumT accum) {
- try {
- org.apache.flink.api.common.state.ValueState<AccumT> state =
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor);
-
- AccumT current = state.value();
- if (current == null) {
- state.update(accum);
- } else {
- current = combineFn.mergeAccumulators(
- flinkStateInternals.getKey(),
- Lists.newArrayList(current, accum),
- context);
- state.update(current);
- }
- } catch (Exception e) {
- throw new RuntimeException("Error adding to state.", e);
- }
- }
-
- @Override
- public AccumT getAccum() {
- try {
- return flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor).value();
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
- }
-
- @Override
- public AccumT mergeAccumulators(Iterable<AccumT> accumulators) {
- return combineFn.mergeAccumulators(flinkStateInternals.getKey(), accumulators, context);
- }
-
- @Override
- public OutputT read() {
- try {
- org.apache.flink.api.common.state.ValueState<AccumT> state =
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor);
-
- AccumT accum = state.value();
- return combineFn.extractOutput(flinkStateInternals.getKey(), accum, context);
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
- }
-
- @Override
- public ReadableState<Boolean> isEmpty() {
- return new ReadableState<Boolean>() {
- @Override
- public Boolean read() {
- try {
- return flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor).value() == null;
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
-
- }
-
- @Override
- public ReadableState<Boolean> readLater() {
- return this;
- }
- };
- }
-
- @Override
- public void clear() {
- try {
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor).clear();
- } catch (Exception e) {
- throw new RuntimeException("Error clearing state.", e);
- }
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- FlinkCombiningStateWithContext<?, ?, ?, ?> that =
- (FlinkCombiningStateWithContext<?, ?, ?, ?>) o;
-
- return namespace.equals(that.namespace) && address.equals(that.address);
-
- }
-
- @Override
- public int hashCode() {
- int result = namespace.hashCode();
- result = 31 * result + address.hashCode();
- return result;
- }
- }
-
- private static class FlinkWatermarkHoldState<K, W extends BoundedWindow>
- implements WatermarkHoldState<W> {
- private final StateTag<? super K, WatermarkHoldState<W>> address;
- private final OutputTimeFn<? super W> outputTimeFn;
- private final StateNamespace namespace;
- private final KeyedStateBackend<ByteBuffer> flinkStateBackend;
- private final FlinkStateInternals<K> flinkStateInternals;
- private final ValueStateDescriptor<Instant> flinkStateDescriptor;
-
- public FlinkWatermarkHoldState(
- KeyedStateBackend<ByteBuffer> flinkStateBackend,
- FlinkStateInternals<K> flinkStateInternals,
- StateTag<? super K, WatermarkHoldState<W>> address,
- StateNamespace namespace,
- OutputTimeFn<? super W> outputTimeFn) {
- this.address = address;
- this.outputTimeFn = outputTimeFn;
- this.namespace = namespace;
- this.flinkStateBackend = flinkStateBackend;
- this.flinkStateInternals = flinkStateInternals;
-
- CoderTypeInformation<Instant> typeInfo = new CoderTypeInformation<>(InstantCoder.of());
- flinkStateDescriptor = new ValueStateDescriptor<>(address.getId(), typeInfo, null);
- }
-
- @Override
- public OutputTimeFn<? super W> getOutputTimeFn() {
- return outputTimeFn;
- }
-
- @Override
- public WatermarkHoldState<W> readLater() {
- return this;
- }
-
- @Override
- public ReadableState<Boolean> isEmpty() {
- return new ReadableState<Boolean>() {
- @Override
- public Boolean read() {
- try {
- return flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor).value() == null;
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
- }
-
- @Override
- public ReadableState<Boolean> readLater() {
- return this;
- }
- };
-
- }
-
- @Override
- public void add(Instant value) {
- try {
- org.apache.flink.api.common.state.ValueState<Instant> state =
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor);
-
- Instant current = state.value();
- if (current == null) {
- state.update(value);
- flinkStateInternals.watermarkHolds.put(namespace.stringKey(), value);
- } else {
- Instant combined = outputTimeFn.combine(current, value);
- state.update(combined);
- flinkStateInternals.watermarkHolds.put(namespace.stringKey(), combined);
- }
- } catch (Exception e) {
- throw new RuntimeException("Error updating state.", e);
- }
- }
-
- @Override
- public Instant read() {
- try {
- org.apache.flink.api.common.state.ValueState<Instant> state =
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor);
- return state.value();
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
- }
-
- @Override
- public void clear() {
- flinkStateInternals.watermarkHolds.remove(namespace.stringKey());
- try {
- org.apache.flink.api.common.state.ValueState<Instant> state =
- flinkStateBackend.getPartitionedState(
- namespace.stringKey(),
- StringSerializer.INSTANCE,
- flinkStateDescriptor);
- state.clear();
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- FlinkWatermarkHoldState<?, ?> that = (FlinkWatermarkHoldState<?, ?>) o;
-
- if (!address.equals(that.address)) {
- return false;
- }
- if (!outputTimeFn.equals(that.outputTimeFn)) {
- return false;
- }
- return namespace.equals(that.namespace);
-
- }
-
- @Override
- public int hashCode() {
- int result = address.hashCode();
- result = 31 * result + outputTimeFn.hashCode();
- result = 31 * result + namespace.hashCode();
- return result;
- }
- }
-}
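As context for the larger file removed above: FlinkStateInternals mapped each Beam state cell to a Flink keyed state, using the Beam StateNamespace's string key as the Flink namespace and one StateDescriptor per state tag, with the current key held encoded in a ByteBuffer. The sketch below is an editorial illustration of that addressing scheme, not part of this commit; the descriptor name "my-value" and the String value type are assumptions.

    // Minimal sketch, assuming the KeyedStateBackend calls used by the deleted
    // class: one Flink ValueState per (namespace string, descriptor) pair.
    // The descriptor name "my-value" and the String value type are illustrative.
    import java.nio.ByteBuffer;
    import org.apache.flink.api.common.state.ValueStateDescriptor;
    import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
    import org.apache.flink.api.common.typeutils.base.StringSerializer;
    import org.apache.flink.runtime.state.KeyedStateBackend;

    class KeyedValueSketch {
      private final KeyedStateBackend<ByteBuffer> backend;
      private final ValueStateDescriptor<String> descriptor =
          new ValueStateDescriptor<>("my-value", BasicTypeInfo.STRING_TYPE_INFO, null);

      KeyedValueSketch(KeyedStateBackend<ByteBuffer> backend) {
        this.backend = backend;
      }

      void write(String beamNamespaceKey, String value) throws Exception {
        // The Beam namespace's string key doubles as the Flink state namespace.
        backend.getPartitionedState(beamNamespaceKey, StringSerializer.INSTANCE, descriptor)
            .update(value);
      }

      String read(String beamNamespaceKey) throws Exception {
        return backend
            .getPartitionedState(beamNamespaceKey, StringSerializer.INSTANCE, descriptor)
            .value();
      }
    }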
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/KeyGroupCheckpointedOperator.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/KeyGroupCheckpointedOperator.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/KeyGroupCheckpointedOperator.java
deleted file mode 100644
index b38a520..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/KeyGroupCheckpointedOperator.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming.state;
-
-import java.io.DataOutputStream;
-
-/**
- * This interface is used to checkpoint key-groups state.
- */
-public interface KeyGroupCheckpointedOperator extends KeyGroupRestoringOperator {
- /**
- * Snapshots the state for a given {@code keyGroupIndex}.
- *
- * <p>AbstractStreamOperator would call this hook in
- * AbstractStreamOperator.snapshotState() while iterating over the key groups.
- * @param keyGroupIndex the id of the key-group to be put in the snapshot.
- * @param out the stream to write to.
- */
- void snapshotKeyGroupState(int keyGroupIndex, DataOutputStream out) throws Exception;
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/KeyGroupRestoringOperator.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/KeyGroupRestoringOperator.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/KeyGroupRestoringOperator.java
deleted file mode 100644
index 2bdfc6e..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/KeyGroupRestoringOperator.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming.state;
-
-import java.io.DataInputStream;
-
-/**
- * This interface is used to restore key-groups state.
- */
-public interface KeyGroupRestoringOperator {
- /**
- * Restore the state for a given {@code keyGroupIndex}.
- * @param keyGroupIndex the id of the key-group whose state is to be restored.
- * @param in the stream to read from.
- */
- void restoreKeyGroupState(int keyGroupIndex, DataInputStream in) throws Exception;
-}
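The two small interfaces removed above describe how an operator takes part in per-key-group checkpointing: snapshotKeyGroupState is called while the key groups are being snapshotted, and restoreKeyGroupState is called on recovery. The sketch below is an editorial illustration of a possible implementor, not taken from the Beam codebase; the per-key-group counter and its meaning are assumptions, and the class is assumed to live in the same package as the interfaces.

    // Minimal sketch of a hypothetical implementor: one int of state is written
    // per key group on snapshot and read back on restore. The counter map is
    // purely illustrative.
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.util.HashMap;
    import java.util.Map;

    class KeyGroupCountingOperatorSketch implements KeyGroupCheckpointedOperator {

      private final Map<Integer, Integer> countsPerKeyGroup = new HashMap<>();

      @Override
      public void snapshotKeyGroupState(int keyGroupIndex, DataOutputStream out)
          throws Exception {
        Integer count = countsPerKeyGroup.get(keyGroupIndex);
        out.writeInt(count == null ? 0 : count);
      }

      @Override
      public void restoreKeyGroupState(int keyGroupIndex, DataInputStream in)
          throws Exception {
        countsPerKeyGroup.put(keyGroupIndex, in.readInt());
      }
    }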
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/package-info.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/package-info.java
deleted file mode 100644
index 0004e9e..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Internal state implementation of the Beam runner for Apache Flink.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming.state;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/resources/log4j.properties b/runners/flink/runner/src/main/resources/log4j.properties
deleted file mode 100644
index 4b6a708..0000000
--- a/runners/flink/runner/src/main/resources/log4j.properties
+++ /dev/null
@@ -1,23 +0,0 @@
-################################################################################
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-################################################################################
-
-log4j.rootLogger=OFF,console
-log4j.appender.console=org.apache.log4j.ConsoleAppender
-log4j.appender.console.target=System.err
-log4j.appender.console.layout=org.apache.log4j.PatternLayout
-log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/EncodedValueComparatorTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/EncodedValueComparatorTest.java b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/EncodedValueComparatorTest.java
deleted file mode 100644
index 10d6d9d..0000000
--- a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/EncodedValueComparatorTest.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import org.apache.beam.runners.flink.translation.types.EncodedValueComparator;
-import org.apache.beam.runners.flink.translation.types.EncodedValueTypeInformation;
-import org.apache.beam.sdk.coders.CoderException;
-import org.apache.beam.sdk.coders.StringUtf8Coder;
-import org.apache.beam.sdk.util.CoderUtils;
-import org.apache.flink.api.common.ExecutionConfig;
-import org.apache.flink.api.common.typeutils.ComparatorTestBase;
-import org.apache.flink.api.common.typeutils.TypeComparator;
-import org.apache.flink.api.common.typeutils.TypeSerializer;
-import org.junit.Assert;
-
-/**
- * Test for {@link EncodedValueComparator}.
- */
-public class EncodedValueComparatorTest extends ComparatorTestBase<byte[]> {
-
- @Override
- protected TypeComparator<byte[]> createComparator(boolean ascending) {
- return new EncodedValueTypeInformation().createComparator(ascending, new ExecutionConfig());
- }
-
- @Override
- protected TypeSerializer<byte[]> createSerializer() {
- return new EncodedValueTypeInformation().createSerializer(new ExecutionConfig());
- }
-
- @Override
- protected void deepEquals(String message, byte[] should, byte[] is) {
- Assert.assertArrayEquals(message, should, is);
- }
-
- @Override
- protected byte[][] getSortedTestData() {
- StringUtf8Coder coder = StringUtf8Coder.of();
-
- try {
- return new byte[][]{
- CoderUtils.encodeToByteArray(coder, ""),
- CoderUtils.encodeToByteArray(coder, "Lorem Ipsum Dolor Omit Longer"),
- CoderUtils.encodeToByteArray(coder, "aaaa"),
- CoderUtils.encodeToByteArray(coder, "abcd"),
- CoderUtils.encodeToByteArray(coder, "abce"),
- CoderUtils.encodeToByteArray(coder, "abdd"),
- CoderUtils.encodeToByteArray(coder, "accd"),
- CoderUtils.encodeToByteArray(coder, "bbcd")
- };
- } catch (CoderException e) {
- throw new RuntimeException("Could not encode values.", e);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/FlinkRunnerRegistrarTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/FlinkRunnerRegistrarTest.java b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/FlinkRunnerRegistrarTest.java
deleted file mode 100644
index d9d174c..0000000
--- a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/FlinkRunnerRegistrarTest.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.beam.runners.flink;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.options.PipelineOptionsFactory;
-import org.junit.Test;
-
-/**
- * Tests the proper registration of the Flink runner.
- */
-public class FlinkRunnerRegistrarTest {
-
- @Test
- public void testFullName() {
- String[] args =
- new String[] {String.format("--runner=%s", FlinkRunner.class.getName())};
- PipelineOptions opts = PipelineOptionsFactory.fromArgs(args).create();
- assertEquals(opts.getRunner(), FlinkRunner.class);
- }
-
- @Test
- public void testClassName() {
- String[] args =
- new String[] {String.format("--runner=%s", FlinkRunner.class.getSimpleName())};
- PipelineOptions opts = PipelineOptionsFactory.fromArgs(args).create();
- assertEquals(opts.getRunner(), FlinkRunner.class);
- }
-
-}
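The test removed above checks that both the simple and the fully qualified class name of FlinkRunner resolve through --runner. The equivalent programmatic selection, shown below, is an editorial sketch rather than part of this commit; the option values are illustrative.

    // Minimal sketch: selecting the Flink runner in code instead of via args.
    import org.apache.beam.runners.flink.FlinkRunner;
    import org.apache.beam.sdk.options.PipelineOptions;
    import org.apache.beam.sdk.options.PipelineOptionsFactory;

    public class FlinkRunnerSelectionSketch {
      public static void main(String[] args) {
        PipelineOptions opts = PipelineOptionsFactory.create();
        // Equivalent to passing --runner=FlinkRunner on the command line.
        opts.setRunner(FlinkRunner.class);
      }
    }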
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/FlinkTestPipeline.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/FlinkTestPipeline.java b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/FlinkTestPipeline.java
deleted file mode 100644
index d6240c4..0000000
--- a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/FlinkTestPipeline.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.PipelineResult;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.runners.PipelineRunner;
-
-/**
- * {@link org.apache.beam.sdk.Pipeline} for testing Beam pipelines on the
- * {@link FlinkRunner}.
- */
-public class FlinkTestPipeline extends Pipeline {
-
- /**
- * Creates and returns a new test pipeline for batch execution.
- *
- * <p>Use {@link org.apache.beam.sdk.testing.PAssert} to add tests, then call
- * {@link Pipeline#run} to execute the pipeline and check the tests.
- */
- public static FlinkTestPipeline createForBatch() {
- return create(false);
- }
-
- /**
- * Creates and returns a new test pipeline for streaming execution.
- *
- * <p>Use {@link org.apache.beam.sdk.testing.PAssert} to add tests, then call
- * {@link Pipeline#run} to execute the pipeline and check the tests.
- *
- * @return The Test Pipeline
- */
- public static FlinkTestPipeline createForStreaming() {
- return create(true);
- }
-
- /**
- * Creates and returns a new test pipeline for streaming or batch execution.
- *
- * <p>Use {@link org.apache.beam.sdk.testing.PAssert} to add tests, then call
- * {@link Pipeline#run} to execute the pipeline and check the tests.
- *
- * @param streaming <code>true</code> for streaming mode, <code>false</code> for batch.
- * @return The Test Pipeline.
- */
- private static FlinkTestPipeline create(boolean streaming) {
- TestFlinkRunner flinkRunner = TestFlinkRunner.create(streaming);
- return new FlinkTestPipeline(flinkRunner, flinkRunner.getPipelineOptions());
- }
-
- private FlinkTestPipeline(PipelineRunner<? extends PipelineResult> runner,
- PipelineOptions options) {
- super(runner, options);
- }
-}
-
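The helper removed above wires a TestFlinkRunner into a Pipeline so that PAssert checks are verified when the pipeline runs. A typical usage, shown below, is an editorial sketch and not part of this commit; the element values are illustrative.

    // Minimal usage sketch for the deleted FlinkTestPipeline helper.
    import org.apache.beam.runners.flink.FlinkTestPipeline;
    import org.apache.beam.sdk.testing.PAssert;
    import org.apache.beam.sdk.transforms.Create;
    import org.apache.beam.sdk.values.PCollection;

    public class FlinkTestPipelineUsageSketch {
      public static void main(String[] args) {
        FlinkTestPipeline p = FlinkTestPipeline.createForBatch();
        PCollection<String> words = p.apply(Create.of("hello", "world"));
        // PAssert registers assertions that are checked when the pipeline runs.
        PAssert.that(words).containsInAnyOrder("hello", "world");
        p.run();
      }
    }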
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/PipelineOptionsTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/PipelineOptionsTest.java b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/PipelineOptionsTest.java
deleted file mode 100644
index 06187f6..0000000
--- a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/PipelineOptionsTest.java
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.util.Collections;
-import java.util.HashMap;
-import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.DoFnOperator;
-import org.apache.beam.sdk.coders.StringUtf8Coder;
-import org.apache.beam.sdk.options.Default;
-import org.apache.beam.sdk.options.Description;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.options.PipelineOptionsFactory;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
-import org.apache.beam.sdk.transforms.windowing.PaneInfo;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.util.WindowingStrategy;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.beam.sdk.values.TupleTag;
-import org.apache.commons.lang3.SerializationUtils;
-import org.apache.flink.api.common.ExecutionConfig;
-import org.apache.flink.api.common.typeinfo.TypeHint;
-import org.apache.flink.api.common.typeinfo.TypeInformation;
-import org.apache.flink.runtime.state.memory.MemoryStateBackend;
-import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
-import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness;
-import org.joda.time.Instant;
-import org.junit.Assert;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-/**
- * Tests for serialization and deserialization of {@link PipelineOptions} in {@link DoFnOperator}.
- */
-public class PipelineOptionsTest {
-
- /**
- * Pipeline options.
- */
- public interface MyOptions extends FlinkPipelineOptions {
- @Description("Bla bla bla")
- @Default.String("Hello")
- String getTestOption();
- void setTestOption(String value);
- }
-
- private static MyOptions options;
- private static SerializedPipelineOptions serializedOptions;
-
- private static final String[] args = new String[]{"--testOption=nothing"};
-
- @BeforeClass
- public static void beforeTest() {
- options = PipelineOptionsFactory.fromArgs(args).as(MyOptions.class);
- serializedOptions = new SerializedPipelineOptions(options);
- }
-
- @Test
- public void testDeserialization() {
- MyOptions deserializedOptions = serializedOptions.getPipelineOptions().as(MyOptions.class);
- assertEquals("nothing", deserializedOptions.getTestOption());
- }
-
- @Test
- public void testIgnoredFieldSerialization() {
- FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
- options.setStateBackend(new MemoryStateBackend());
-
- FlinkPipelineOptions deserialized =
- new SerializedPipelineOptions(options).getPipelineOptions().as(FlinkPipelineOptions.class);
-
- assertNull(deserialized.getStateBackend());
- }
-
- @Test
- public void testCaching() {
- PipelineOptions deserializedOptions =
- serializedOptions.getPipelineOptions().as(PipelineOptions.class);
-
- assertNotNull(deserializedOptions);
- assertTrue(deserializedOptions == serializedOptions.getPipelineOptions());
- assertTrue(deserializedOptions == serializedOptions.getPipelineOptions());
- assertTrue(deserializedOptions == serializedOptions.getPipelineOptions());
- }
-
- @Test(expected = Exception.class)
- public void testNonNull() {
- new SerializedPipelineOptions(null);
- }
-
- @Test(expected = Exception.class)
- public void parDoBaseClassPipelineOptionsNullTest() {
- DoFnOperator<String, String, String> doFnOperator = new DoFnOperator<>(
- new TestDoFn(),
- WindowedValue.getValueOnlyCoder(StringUtf8Coder.of()),
- new TupleTag<String>("main-output"),
- Collections.<TupleTag<?>>emptyList(),
- new DoFnOperator.DefaultOutputManagerFactory<String>(),
- WindowingStrategy.globalDefault(),
- new HashMap<Integer, PCollectionView<?>>(),
- Collections.<PCollectionView<?>>emptyList(),
- null,
- null);
-
- }
-
- /**
- * Tests that PipelineOptions are present after serialization.
- */
- @Test
- public void parDoBaseClassPipelineOptionsSerializationTest() throws Exception {
-
- DoFnOperator<String, String, String> doFnOperator = new DoFnOperator<>(
- new TestDoFn(),
- WindowedValue.getValueOnlyCoder(StringUtf8Coder.of()),
- new TupleTag<String>("main-output"),
- Collections.<TupleTag<?>>emptyList(),
- new DoFnOperator.DefaultOutputManagerFactory<String>(),
- WindowingStrategy.globalDefault(),
- new HashMap<Integer, PCollectionView<?>>(),
- Collections.<PCollectionView<?>>emptyList(),
- options,
- null);
-
- final byte[] serialized = SerializationUtils.serialize(doFnOperator);
-
- @SuppressWarnings("unchecked")
- DoFnOperator<Object, Object, Object> deserialized =
- (DoFnOperator<Object, Object, Object>) SerializationUtils.deserialize(serialized);
-
- TypeInformation<WindowedValue<Object>> typeInformation = TypeInformation.of(
- new TypeHint<WindowedValue<Object>>() {});
-
- OneInputStreamOperatorTestHarness<WindowedValue<Object>, Object> testHarness =
- new OneInputStreamOperatorTestHarness<>(deserialized,
- typeInformation.createSerializer(new ExecutionConfig()));
-
- testHarness.open();
-
- // execute once to access options
- testHarness.processElement(new StreamRecord<>(
- WindowedValue.of(
- new Object(),
- Instant.now(),
- GlobalWindow.INSTANCE,
- PaneInfo.NO_FIRING)));
-
- testHarness.close();
-
- }
-
-
- private static class TestDoFn extends DoFn<String, String> {
-
- @ProcessElement
- public void processElement(ProcessContext c) throws Exception {
- Assert.assertNotNull(c.getPipelineOptions());
- Assert.assertEquals(
- options.getTestOption(),
- c.getPipelineOptions().as(MyOptions.class).getTestOption());
- }
- }
-}
[22/50] [abbrv] beam git commit: Pin default commons-compress version
to beam-parent pom
Posted by dh...@apache.org.
Pin default commons-compress version to beam-parent pom
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/28b692d5
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/28b692d5
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/28b692d5
Branch: refs/heads/DSL_SQL
Commit: 28b692d596638926964d3099045067d1d4e7de38
Parents: 470808c
Author: Ismaël Mejía <ie...@apache.org>
Authored: Wed Apr 19 12:05:41 2017 +0200
Committer: Ismaël Mejía <ie...@apache.org>
Committed: Wed Apr 19 12:05:41 2017 +0200
----------------------------------------------------------------------
pom.xml | 7 +++++++
runners/flink/runner/pom.xml | 1 -
runners/spark/pom.xml | 1 -
sdks/java/core/pom.xml | 1 -
4 files changed, 7 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/28b692d5/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 306978d..09659db 100644
--- a/pom.xml
+++ b/pom.xml
@@ -102,6 +102,7 @@
<!-- If updating dependencies, please update any relevant javadoc offlineLinks -->
<apache.commons.lang.version>3.5</apache.commons.lang.version>
+ <apache.commons.compress.version>1.9</apache.commons.compress.version>
<apex.kryo.version>2.24.0</apex.kryo.version>
<avro.version>1.8.1</avro.version>
<bigquery.version>v2-rev295-1.22.0</bigquery.version>
@@ -514,6 +515,12 @@
<dependency>
<groupId>org.apache.commons</groupId>
+ <artifactId>commons-compress</artifactId>
+ <version>${apache.commons.compress.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>${apache.commons.lang.version}</version>
</dependency>
http://git-wip-us.apache.org/repos/asf/beam/blob/28b692d5/runners/flink/runner/pom.xml
----------------------------------------------------------------------
diff --git a/runners/flink/runner/pom.xml b/runners/flink/runner/pom.xml
index 1e6452d..18343ef 100644
--- a/runners/flink/runner/pom.xml
+++ b/runners/flink/runner/pom.xml
@@ -238,7 +238,6 @@
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
- <version>[1.9,)</version>
<scope>runtime</scope>
</dependency>
http://git-wip-us.apache.org/repos/asf/beam/blob/28b692d5/runners/spark/pom.xml
----------------------------------------------------------------------
diff --git a/runners/spark/pom.xml b/runners/spark/pom.xml
index 514cb43..7493485 100644
--- a/runners/spark/pom.xml
+++ b/runners/spark/pom.xml
@@ -196,7 +196,6 @@
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
- <version>1.9</version>
<scope>provided</scope>
</dependency>
<dependency>
http://git-wip-us.apache.org/repos/asf/beam/blob/28b692d5/sdks/java/core/pom.xml
----------------------------------------------------------------------
diff --git a/sdks/java/core/pom.xml b/sdks/java/core/pom.xml
index 2b12481..dc80a2c 100644
--- a/sdks/java/core/pom.xml
+++ b/sdks/java/core/pom.xml
@@ -247,7 +247,6 @@
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
- <version>1.9</version>
</dependency>
<dependency>
[50/50] [abbrv] beam git commit: This closes #2584
Posted by dh...@apache.org.
This closes #2584
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/aa07a1d4
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/aa07a1d4
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/aa07a1d4
Branch: refs/heads/DSL_SQL
Commit: aa07a1d41b71c810c4968bd35e5ca69dc56b8c30
Parents: ca87603 19ae877
Author: Dan Halperin <dh...@google.com>
Authored: Wed Apr 19 12:14:05 2017 -0700
Committer: Dan Halperin <dh...@google.com>
Committed: Wed Apr 19 12:14:05 2017 -0700
----------------------------------------------------------------------
.jenkins/common_job_properties.groovy | 261 ----
.../job_beam_PerformanceTests_Dataflow.groovy | 43 -
.jenkins/job_beam_PerformanceTests_JDBC.groovy | 60 -
.jenkins/job_beam_PerformanceTests_Spark.groovy | 44 -
...job_beam_PostCommit_Java_MavenInstall.groovy | 42 -
..._PostCommit_Java_ValidatesRunner_Apex.groovy | 48 -
...tCommit_Java_ValidatesRunner_Dataflow.groovy | 45 -
...PostCommit_Java_ValidatesRunner_Flink.groovy | 43 -
...tCommit_Java_ValidatesRunner_Gearpump.groovy | 49 -
...PostCommit_Java_ValidatesRunner_Spark.groovy | 44 -
.../job_beam_PostCommit_Python_Verify.groovy | 55 -
.../job_beam_PreCommit_Java_MavenInstall.groovy | 42 -
.../job_beam_PreCommit_Website_Stage.groovy | 80 -
.jenkins/job_beam_PreCommit_Website_Test.groovy | 65 -
.../job_beam_Release_NightlySnapshot.groovy | 45 -
.jenkins/job_seed.groovy | 53 -
.../jenkins/common_job_properties.groovy | 261 ++++
.../job_beam_PerformanceTests_Dataflow.groovy | 43 +
.../job_beam_PerformanceTests_JDBC.groovy | 60 +
.../job_beam_PerformanceTests_Spark.groovy | 44 +
...job_beam_PostCommit_Java_MavenInstall.groovy | 42 +
..._PostCommit_Java_ValidatesRunner_Apex.groovy | 48 +
...tCommit_Java_ValidatesRunner_Dataflow.groovy | 45 +
...PostCommit_Java_ValidatesRunner_Flink.groovy | 43 +
...tCommit_Java_ValidatesRunner_Gearpump.groovy | 49 +
...PostCommit_Java_ValidatesRunner_Spark.groovy | 44 +
.../job_beam_PostCommit_Python_Verify.groovy | 55 +
.../job_beam_PreCommit_Java_MavenInstall.groovy | 42 +
.../job_beam_PreCommit_Website_Stage.groovy | 80 +
.../job_beam_PreCommit_Website_Test.groovy | 65 +
.../job_beam_Release_NightlySnapshot.groovy | 45 +
.test-infra/jenkins/job_seed.groovy | 53 +
.../cassandra-service-for-local-dev.yaml | 28 +
.../cassandra-svc-statefulset.yaml | 114 ++
.../LargeITCluster/cassandra-svc-temp.yaml | 74 +
.../cassandra/LargeITCluster/data-load.sh | 122 ++
.../cassandra/LargeITCluster/show_health.sh | 47 +
.../cassandra/LargeITCluster/start-up.sh | 22 +
.../cassandra/LargeITCluster/teardown.sh | 25 +
.../cassandra-service-for-local-dev.yaml | 30 +
.../SmallITCluster/cassandra-svc-rc.yaml | 74 +
.../cassandra/SmallITCluster/data-load.sh | 86 ++
.../cassandra/SmallITCluster/show_health.sh | 47 +
.../cassandra/SmallITCluster/start-up.sh | 23 +
.../cassandra/SmallITCluster/teardown.sh | 22 +
.../kubernetes/cassandra/data-load-setup.sh | 29 +
.../elasticsearch-service-for-local-dev.yaml | 33 +
.../es-services-deployments.yaml | 258 ++++
.../LargeProductionCluster/start-up.sh | 22 +
.../LargeProductionCluster/teardown.sh | 21 +
.../elasticsearch-service-for-local-dev.yaml | 34 +
.../SmallITCluster/elasticsearch-svc-rc.yaml | 96 ++
.../elasticsearch/SmallITCluster/start-up.sh | 23 +
.../elasticsearch/SmallITCluster/teardown.sh | 21 +
.../kubernetes/elasticsearch/data-load-setup.sh | 26 +
.../kubernetes/elasticsearch/data-load.sh | 33 +
.../kubernetes/elasticsearch/es_test_data.py | 299 ++++
.../kubernetes/elasticsearch/show-health.sh | 33 +
.../postgres/postgres-service-for-local-dev.yml | 28 +
.test-infra/kubernetes/postgres/postgres.yml | 56 +
.test-infra/travis/README.md | 23 +
.test-infra/travis/settings.xml | 33 +
.test-infra/travis/test_wordcount.sh | 125 ++
.travis.yml | 4 +-
.travis/README.md | 23 -
.travis/settings.xml | 33 -
.travis/test_wordcount.sh | 125 --
examples/java/pom.xml | 5 +
examples/java8/pom.xml | 39 +
pom.xml | 13 +
runners/apex/pom.xml | 1 +
.../apache/beam/runners/apex/ApexRunner.java | 32 +-
.../beam/runners/apex/ApexYarnLauncher.java | 2 -
.../apex/translation/ParDoTranslator.java | 6 +-
.../operators/ApexGroupByKeyOperator.java | 226 +--
.../operators/ApexParDoOperator.java | 51 +-
.../apex/translation/utils/NoOpStepContext.java | 2 +-
.../utils/SerializablePipelineOptions.java | 13 +-
.../apex/translation/utils/ValuesSource.java | 2 +-
.../apex/examples/UnboundedTextSource.java | 2 +-
.../translation/GroupByKeyTranslatorTest.java | 2 +-
.../apex/translation/ParDoTranslatorTest.java | 18 +-
.../translation/utils/CollectionSource.java | 2 +-
runners/core-construction-java/pom.xml | 2 +-
.../beam/runners/core/construction/Coders.java | 26 +-
.../DeduplicatedFlattenFactory.java | 63 +-
.../EmptyFlattenAsCreateFactory.java | 20 +-
.../runners/core/construction/PCollections.java | 97 ++
.../core/construction/PTransformMatchers.java | 2 -
.../construction/PTransformReplacements.java | 69 +
.../runners/core/construction/PTransforms.java | 107 ++
.../core/construction/PrimitiveCreate.java | 13 +-
.../core/construction/SdkComponents.java | 49 +-
.../SingleInputOutputOverrideFactory.java | 9 +-
.../runners/core/construction/Triggers.java | 336 +++++
.../UnboundedReadFromBoundedSource.java | 7 +-
.../core/construction/UnconsumedReads.java | 72 +
.../UnsupportedOverrideFactory.java | 14 +-
.../core/construction/WindowingStrategies.java | 1 -
.../runners/core/construction/CodersTest.java | 144 +-
.../DeduplicatedFlattenFactoryTest.java | 18 +-
.../EmptyFlattenAsCreateFactoryTest.java | 36 +-
.../core/construction/PCollectionsTest.java | 188 +++
.../PTransformReplacementsTest.java | 131 ++
.../core/construction/PTransformsTest.java | 189 +++
.../core/construction/SdkComponentsTest.java | 85 +-
.../SingleInputOutputOverrideFactoryTest.java | 31 +-
.../runners/core/construction/TriggersTest.java | 111 ++
.../core/construction/UnconsumedReadsTest.java | 105 ++
.../UnsupportedOverrideFactoryTest.java | 11 +-
runners/core-java/pom.xml | 5 +
.../beam/runners/core/BaseExecutionContext.java | 13 +-
.../apache/beam/runners/core/DoFnAdapters.java | 16 +-
.../apache/beam/runners/core/DoFnRunners.java | 40 +-
.../beam/runners/core/ExecutionContext.java | 13 +-
.../GroupAlsoByWindowViaOutputBufferDoFn.java | 2 +-
.../core/GroupAlsoByWindowViaWindowSetDoFn.java | 2 +-
.../GroupAlsoByWindowViaWindowSetNewDoFn.java | 8 +-
.../org/apache/beam/runners/core/OldDoFn.java | 41 +-
...eBoundedSplittableProcessElementInvoker.java | 8 +-
.../beam/runners/core/OutputWindowedValue.java | 10 +-
.../beam/runners/core/ProcessFnRunner.java | 127 ++
.../core/PushbackSideInputDoFnRunner.java | 106 +-
.../beam/runners/core/SimpleDoFnRunner.java | 54 +-
.../beam/runners/core/SimpleOldDoFnRunner.java | 63 +-
.../core/SimplePushbackSideInputDoFnRunner.java | 115 ++
.../beam/runners/core/SplittableParDo.java | 131 +-
.../beam/runners/core/WindowingInternals.java | 8 +-
.../core/WindowingInternalsAdapters.java | 8 +-
.../core/GroupAlsoByWindowsProperties.java | 10 +-
.../apache/beam/runners/core/NoOpOldDoFn.java | 4 +-
.../apache/beam/runners/core/OldDoFnTest.java | 4 +-
...ndedSplittableProcessElementInvokerTest.java | 6 +-
.../core/PushbackSideInputDoFnRunnerTest.java | 282 ----
.../beam/runners/core/ReduceFnTester.java | 10 +-
.../runners/core/SimpleOldDoFnRunnerTest.java | 4 +-
.../SimplePushbackSideInputDoFnRunnerTest.java | 282 ++++
.../beam/runners/core/SplittableParDoTest.java | 98 +-
runners/direct-java/pom.xml | 1 +
.../direct/BoundedReadEvaluatorFactory.java | 2 +-
...ectGBKIntoKeyedWorkItemsOverrideFactory.java | 16 +-
.../direct/DirectGroupByKeyOverrideFactory.java | 14 +-
...ecycleManagerRemovingTransformEvaluator.java | 6 +-
.../GroupAlsoByWindowEvaluatorFactory.java | 10 +-
.../beam/runners/direct/ModelEnforcement.java | 13 +-
.../beam/runners/direct/ParDoEvaluator.java | 129 +-
.../runners/direct/ParDoEvaluatorFactory.java | 23 +-
.../direct/ParDoMultiOverrideFactory.java | 24 +-
...littableProcessElementsEvaluatorFactory.java | 108 +-
.../direct/StatefulParDoEvaluatorFactory.java | 6 +-
.../direct/TestStreamEvaluatorFactory.java | 14 +-
.../direct/TransformEvaluatorFactory.java | 10 +-
.../direct/TransformEvaluatorRegistry.java | 4 +-
.../direct/UnboundedReadEvaluatorFactory.java | 2 +-
.../runners/direct/ViewOverrideFactory.java | 18 +-
.../direct/WriteWithShardingFactory.java | 16 +-
.../direct/BoundedReadEvaluatorFactoryTest.java | 4 +-
.../DirectGroupByKeyOverrideFactoryTest.java | 12 +-
.../beam/runners/direct/DirectRunnerTest.java | 4 +-
...leManagerRemovingTransformEvaluatorTest.java | 8 +-
.../beam/runners/direct/ParDoEvaluatorTest.java | 13 +-
.../direct/ParDoMultiOverrideFactoryTest.java | 45 -
.../direct/TestStreamEvaluatorFactoryTest.java | 12 -
.../UnboundedReadEvaluatorFactoryTest.java | 2 +-
.../runners/direct/ViewOverrideFactoryTest.java | 42 +-
.../direct/WriteWithShardingFactoryTest.java | 23 +-
runners/flink/examples/pom.xml | 126 --
.../beam/runners/flink/examples/TFIDF.java | 455 ------
.../beam/runners/flink/examples/WordCount.java | 129 --
.../runners/flink/examples/package-info.java | 22 -
.../flink/examples/streaming/AutoComplete.java | 400 -----
.../flink/examples/streaming/JoinExamples.java | 154 --
.../examples/streaming/KafkaIOExamples.java | 338 -----
.../KafkaWindowedWordCountExample.java | 164 --
.../examples/streaming/WindowedWordCount.java | 141 --
.../flink/examples/streaming/package-info.java | 22 -
runners/flink/pom.xml | 320 +++-
runners/flink/runner/pom.xml | 311 ----
.../flink/DefaultParallelismFactory.java | 39 -
.../flink/FlinkBatchPipelineTranslator.java | 139 --
.../flink/FlinkBatchTransformTranslators.java | 723 ---------
.../flink/FlinkBatchTranslationContext.java | 153 --
.../flink/FlinkDetachedRunnerResult.java | 76 -
.../FlinkPipelineExecutionEnvironment.java | 241 ---
.../runners/flink/FlinkPipelineOptions.java | 101 --
.../runners/flink/FlinkPipelineTranslator.java | 53 -
.../apache/beam/runners/flink/FlinkRunner.java | 232 ---
.../runners/flink/FlinkRunnerRegistrar.java | 62 -
.../beam/runners/flink/FlinkRunnerResult.java | 98 --
.../flink/FlinkStreamingPipelineTranslator.java | 272 ----
.../FlinkStreamingTransformTranslators.java | 1107 --------------
.../flink/FlinkStreamingTranslationContext.java | 130 --
.../flink/FlinkStreamingViewOverrides.java | 372 -----
.../flink/PipelineTranslationOptimizer.java | 72 -
.../beam/runners/flink/TestFlinkRunner.java | 84 --
.../beam/runners/flink/TranslationMode.java | 31 -
.../apache/beam/runners/flink/package-info.java | 22 -
.../functions/FlinkAggregatorFactory.java | 53 -
.../functions/FlinkAssignContext.java | 63 -
.../functions/FlinkAssignWindows.java | 49 -
.../functions/FlinkDoFnFunction.java | 161 --
.../FlinkMergingNonShuffleReduceFunction.java | 228 ---
.../FlinkMergingPartialReduceFunction.java | 201 ---
.../functions/FlinkMergingReduceFunction.java | 199 ---
.../FlinkMultiOutputPruningFunction.java | 50 -
.../functions/FlinkNoOpStepContext.java | 73 -
.../functions/FlinkPartialReduceFunction.java | 172 ---
.../functions/FlinkReduceFunction.java | 173 ---
.../functions/FlinkSideInputReader.java | 80 -
.../functions/FlinkStatefulDoFnFunction.java | 198 ---
.../functions/SideInputInitializer.java | 73 -
.../translation/functions/package-info.java | 22 -
.../runners/flink/translation/package-info.java | 22 -
.../translation/types/CoderTypeInformation.java | 120 --
.../translation/types/CoderTypeSerializer.java | 132 --
.../types/EncodedValueComparator.java | 195 ---
.../types/EncodedValueSerializer.java | 113 --
.../types/EncodedValueTypeInformation.java | 107 --
.../flink/translation/types/FlinkCoder.java | 63 -
.../types/InspectableByteArrayOutputStream.java | 34 -
.../flink/translation/types/KvKeySelector.java | 50 -
.../flink/translation/types/package-info.java | 22 -
.../utils/SerializedPipelineOptions.java | 65 -
.../flink/translation/utils/package-info.java | 22 -
.../wrappers/DataInputViewWrapper.java | 58 -
.../wrappers/DataOutputViewWrapper.java | 51 -
.../SerializableFnAggregatorWrapper.java | 98 --
.../translation/wrappers/SourceInputFormat.java | 149 --
.../translation/wrappers/SourceInputSplit.java | 52 -
.../translation/wrappers/package-info.java | 22 -
.../wrappers/streaming/DoFnOperator.java | 772 ----------
.../streaming/KvToByteBufferKeySelector.java | 56 -
.../streaming/SingletonKeyedWorkItem.java | 58 -
.../streaming/SingletonKeyedWorkItemCoder.java | 128 --
.../streaming/SplittableDoFnOperator.java | 150 --
.../wrappers/streaming/WindowDoFnOperator.java | 120 --
.../wrappers/streaming/WorkItemKeySelector.java | 56 -
.../streaming/io/BoundedSourceWrapper.java | 218 ---
.../streaming/io/UnboundedFlinkSink.java | 200 ---
.../streaming/io/UnboundedFlinkSource.java | 120 --
.../streaming/io/UnboundedSocketSource.java | 249 ---
.../streaming/io/UnboundedSourceWrapper.java | 476 ------
.../wrappers/streaming/io/package-info.java | 22 -
.../wrappers/streaming/package-info.java | 22 -
.../state/FlinkBroadcastStateInternals.java | 865 -----------
.../state/FlinkKeyGroupStateInternals.java | 487 ------
.../state/FlinkSplitStateInternals.java | 260 ----
.../streaming/state/FlinkStateInternals.java | 1053 -------------
.../state/KeyGroupCheckpointedOperator.java | 35 -
.../state/KeyGroupRestoringOperator.java | 32 -
.../wrappers/streaming/state/package-info.java | 22 -
.../runner/src/main/resources/log4j.properties | 23 -
.../flink/EncodedValueComparatorTest.java | 70 -
.../runners/flink/FlinkRunnerRegistrarTest.java | 48 -
.../beam/runners/flink/FlinkTestPipeline.java | 72 -
.../beam/runners/flink/PipelineOptionsTest.java | 184 ---
.../beam/runners/flink/ReadSourceITCase.java | 85 --
.../flink/ReadSourceStreamingITCase.java | 74 -
.../beam/runners/flink/WriteSinkITCase.java | 192 ---
.../flink/streaming/DoFnOperatorTest.java | 600 --------
.../FlinkBroadcastStateInternalsTest.java | 245 ---
.../FlinkKeyGroupStateInternalsTest.java | 262 ----
.../streaming/FlinkSplitStateInternalsTest.java | 101 --
.../streaming/FlinkStateInternalsTest.java | 395 -----
.../flink/streaming/GroupByNullKeyTest.java | 124 --
.../flink/streaming/TestCountingSource.java | 254 ----
.../streaming/TopWikipediaSessionsITCase.java | 133 --
.../streaming/UnboundedSourceWrapperTest.java | 464 ------
.../runners/flink/streaming/package-info.java | 22 -
.../src/test/resources/log4j-test.properties | 27 -
.../flink/DefaultParallelismFactory.java | 39 +
.../flink/FlinkBatchPipelineTranslator.java | 139 ++
.../flink/FlinkBatchTransformTranslators.java | 723 +++++++++
.../flink/FlinkBatchTranslationContext.java | 153 ++
.../flink/FlinkDetachedRunnerResult.java | 75 +
.../FlinkPipelineExecutionEnvironment.java | 241 +++
.../runners/flink/FlinkPipelineOptions.java | 101 ++
.../runners/flink/FlinkPipelineTranslator.java | 53 +
.../apache/beam/runners/flink/FlinkRunner.java | 232 +++
.../runners/flink/FlinkRunnerRegistrar.java | 62 +
.../beam/runners/flink/FlinkRunnerResult.java | 98 ++
.../flink/FlinkStreamingPipelineTranslator.java | 276 ++++
.../FlinkStreamingTransformTranslators.java | 1044 +++++++++++++
.../flink/FlinkStreamingTranslationContext.java | 130 ++
.../flink/FlinkStreamingViewOverrides.java | 372 +++++
.../flink/PipelineTranslationOptimizer.java | 72 +
.../beam/runners/flink/TestFlinkRunner.java | 84 ++
.../beam/runners/flink/TranslationMode.java | 31 +
.../apache/beam/runners/flink/package-info.java | 22 +
.../functions/FlinkAggregatorFactory.java | 53 +
.../functions/FlinkAssignContext.java | 63 +
.../functions/FlinkAssignWindows.java | 49 +
.../functions/FlinkDoFnFunction.java | 161 ++
.../FlinkMergingNonShuffleReduceFunction.java | 228 +++
.../FlinkMergingPartialReduceFunction.java | 201 +++
.../functions/FlinkMergingReduceFunction.java | 199 +++
.../FlinkMultiOutputPruningFunction.java | 50 +
.../functions/FlinkNoOpStepContext.java | 73 +
.../functions/FlinkPartialReduceFunction.java | 172 +++
.../functions/FlinkReduceFunction.java | 173 +++
.../functions/FlinkSideInputReader.java | 80 +
.../functions/FlinkStatefulDoFnFunction.java | 198 +++
.../functions/SideInputInitializer.java | 73 +
.../translation/functions/package-info.java | 22 +
.../runners/flink/translation/package-info.java | 22 +
.../translation/types/CoderTypeInformation.java | 120 ++
.../translation/types/CoderTypeSerializer.java | 132 ++
.../types/EncodedValueComparator.java | 195 +++
.../types/EncodedValueSerializer.java | 113 ++
.../types/EncodedValueTypeInformation.java | 98 ++
.../types/InspectableByteArrayOutputStream.java | 34 +
.../flink/translation/types/KvKeySelector.java | 50 +
.../flink/translation/types/package-info.java | 22 +
.../utils/SerializedPipelineOptions.java | 67 +
.../flink/translation/utils/package-info.java | 22 +
.../wrappers/DataInputViewWrapper.java | 58 +
.../wrappers/DataOutputViewWrapper.java | 51 +
.../SerializableFnAggregatorWrapper.java | 98 ++
.../translation/wrappers/SourceInputFormat.java | 150 ++
.../translation/wrappers/SourceInputSplit.java | 52 +
.../translation/wrappers/package-info.java | 22 +
.../wrappers/streaming/DoFnOperator.java | 774 ++++++++++
.../streaming/KvToByteBufferKeySelector.java | 56 +
.../streaming/SingletonKeyedWorkItem.java | 56 +
.../streaming/SingletonKeyedWorkItemCoder.java | 126 ++
.../streaming/SplittableDoFnOperator.java | 150 ++
.../wrappers/streaming/WindowDoFnOperator.java | 117 ++
.../wrappers/streaming/WorkItemKeySelector.java | 56 +
.../streaming/io/BoundedSourceWrapper.java | 218 +++
.../streaming/io/UnboundedSocketSource.java | 249 +++
.../streaming/io/UnboundedSourceWrapper.java | 476 ++++++
.../wrappers/streaming/io/package-info.java | 22 +
.../wrappers/streaming/package-info.java | 22 +
.../state/FlinkBroadcastStateInternals.java | 865 +++++++++++
.../state/FlinkKeyGroupStateInternals.java | 487 ++++++
.../state/FlinkSplitStateInternals.java | 260 ++++
.../streaming/state/FlinkStateInternals.java | 1053 +++++++++++++
.../state/KeyGroupCheckpointedOperator.java | 35 +
.../state/KeyGroupRestoringOperator.java | 32 +
.../wrappers/streaming/state/package-info.java | 22 +
.../flink/src/main/resources/log4j.properties | 23 +
.../flink/EncodedValueComparatorTest.java | 70 +
.../runners/flink/FlinkRunnerRegistrarTest.java | 48 +
.../beam/runners/flink/FlinkTestPipeline.java | 72 +
.../beam/runners/flink/PipelineOptionsTest.java | 184 +++
.../beam/runners/flink/ReadSourceITCase.java | 85 ++
.../flink/ReadSourceStreamingITCase.java | 74 +
.../beam/runners/flink/WriteSinkITCase.java | 192 +++
.../flink/streaming/DoFnOperatorTest.java | 600 ++++++++
.../FlinkBroadcastStateInternalsTest.java | 245 +++
.../FlinkKeyGroupStateInternalsTest.java | 262 ++++
.../streaming/FlinkSplitStateInternalsTest.java | 101 ++
.../streaming/FlinkStateInternalsTest.java | 395 +++++
.../flink/streaming/GroupByNullKeyTest.java | 124 ++
.../flink/streaming/TestCountingSource.java | 254 ++++
.../streaming/TopWikipediaSessionsITCase.java | 133 ++
.../streaming/UnboundedSourceWrapperTest.java | 464 ++++++
.../runners/flink/streaming/package-info.java | 22 +
.../src/test/resources/log4j-test.properties | 27 +
runners/google-cloud-dataflow-java/pom.xml | 62 +-
.../dataflow/BatchStatefulParDoOverrides.java | 44 +-
.../runners/dataflow/BatchViewOverrides.java | 23 +-
.../beam/runners/dataflow/DataflowRunner.java | 96 +-
.../dataflow/PrimitiveParDoSingleFactory.java | 15 +-
.../dataflow/ReshuffleOverrideFactory.java | 12 +-
.../dataflow/StreamingViewOverrides.java | 14 +-
.../dataflow/internal/CustomSources.java | 2 +-
.../dataflow/BatchViewOverridesTest.java | 4 +-
.../dataflow/DataflowPipelineJobTest.java | 2 +-
.../runners/dataflow/DataflowRunnerTest.java | 24 +
.../PrimitiveParDoSingleFactoryTest.java | 59 +-
runners/pom.xml | 40 -
runners/spark/pom.xml | 2 +-
.../beam/runners/spark/TestSparkRunner.java | 14 +-
.../beam/runners/spark/io/MicrobatchSource.java | 7 +-
.../beam/runners/spark/io/SourceDStream.java | 2 +-
.../apache/beam/runners/spark/io/SourceRDD.java | 4 +-
.../SparkGroupAlsoByWindowViaWindowSet.java | 13 +-
.../spark/translation/MultiDoFnFunction.java | 4 +-
...SparkGroupAlsoByWindowViaOutputBufferFn.java | 10 +-
.../spark/translation/SparkProcessContext.java | 2 +-
.../spark/translation/SparkRuntimeContext.java | 2 +
.../streaming/StreamingTransformTranslator.java | 2 +-
.../spark/util/SparkSideInputReader.java | 3 +-
sdks/common/fn-api/pom.xml | 5 -
.../fn-api/src/main/proto/beam_fn_api.proto | 174 +--
.../src/main/proto/beam_runner_api.proto | 4 +-
.../src/main/resources/beam/findbugs-filter.xml | 7 -
sdks/java/core/pom.xml | 49 +-
.../main/java/org/apache/beam/sdk/Pipeline.java | 58 +-
.../java/org/apache/beam/sdk/io/AvroIO.java | 13 +-
.../sdk/io/BoundedReadFromUnboundedSource.java | 4 +-
.../org/apache/beam/sdk/io/BoundedSource.java | 13 +-
.../apache/beam/sdk/io/CompressedSource.java | 4 +-
.../org/apache/beam/sdk/io/CountingSource.java | 2 +-
.../org/apache/beam/sdk/io/FileBasedSource.java | 9 +-
.../apache/beam/sdk/io/OffsetBasedSource.java | 4 +-
.../main/java/org/apache/beam/sdk/io/Sink.java | 2 +-
.../java/org/apache/beam/sdk/io/TFRecordIO.java | 639 +++-----
.../org/apache/beam/sdk/io/UnboundedSource.java | 15 +-
.../main/java/org/apache/beam/sdk/io/XmlIO.java | 477 ++++++
.../java/org/apache/beam/sdk/io/XmlSink.java | 226 +--
.../java/org/apache/beam/sdk/io/XmlSource.java | 191 +--
.../beam/sdk/options/BigQueryOptions.java | 32 -
.../options/CloudResourceManagerOptions.java | 40 -
.../DefaultPipelineOptionsRegistrar.java | 5 -
.../org/apache/beam/sdk/options/GcpOptions.java | 227 ---
.../org/apache/beam/sdk/options/GcsOptions.java | 158 --
.../beam/sdk/options/GoogleApiDebugOptions.java | 87 --
.../beam/sdk/options/PipelineOptions.java | 16 +-
.../apache/beam/sdk/options/PubsubOptions.java | 36 -
.../sdk/runners/PTransformOverrideFactory.java | 33 +-
.../apache/beam/sdk/runners/PipelineRunner.java | 7 +-
.../beam/sdk/testing/BigqueryMatcher.java | 256 ----
.../beam/sdk/testing/SourceTestUtils.java | 6 +-
.../apache/beam/sdk/testing/TestPipeline.java | 5 +-
...esSplittableParDoWithWindowedSideInputs.java | 26 +
.../beam/sdk/transforms/AppliedPTransform.java | 5 +
.../org/apache/beam/sdk/transforms/Combine.java | 8 +-
.../org/apache/beam/sdk/transforms/Create.java | 7 +-
.../org/apache/beam/sdk/transforms/DoFn.java | 14 +-
.../apache/beam/sdk/transforms/DoFnTester.java | 41 +-
.../beam/sdk/transforms/FlatMapElements.java | 4 +-
.../apache/beam/sdk/transforms/MapElements.java | 4 +-
.../org/apache/beam/sdk/transforms/ParDo.java | 86 +-
.../apache/beam/sdk/transforms/Partition.java | 2 +-
.../sdk/transforms/display/DisplayData.java | 6 +
.../beam/sdk/transforms/windowing/Triggers.java | 322 ----
.../beam/sdk/transforms/windowing/Window.java | 1 +
.../beam/sdk/transforms/windowing/WindowFn.java | 13 -
.../beam/sdk/util/AppEngineEnvironment.java | 62 -
...AttemptAndTimeBoundedExponentialBackOff.java | 170 ---
.../util/AttemptBoundedExponentialBackOff.java | 86 --
.../apache/beam/sdk/util/CredentialFactory.java | 29 -
.../org/apache/beam/sdk/util/DefaultBucket.java | 105 --
.../beam/sdk/util/GcpCredentialFactory.java | 67 -
.../apache/beam/sdk/util/GcpProjectUtil.java | 106 --
.../beam/sdk/util/GcsIOChannelFactory.java | 111 --
.../sdk/util/GcsIOChannelFactoryRegistrar.java | 38 -
.../apache/beam/sdk/util/GcsPathValidator.java | 95 --
.../java/org/apache/beam/sdk/util/GcsUtil.java | 798 ----------
.../apache/beam/sdk/util/IOChannelUtils.java | 22 +-
.../util/IntervalBoundedExponentialBackOff.java | 89 --
.../beam/sdk/util/NoopCredentialFactory.java | 68 -
.../sdk/util/NullCredentialInitializer.java | 62 -
.../org/apache/beam/sdk/util/Transport.java | 178 ---
.../beam/sdk/values/PCollectionTuple.java | 3 +-
.../org/apache/beam/sdk/values/TupleTag.java | 26 +-
.../apache/beam/sdk/values/TupleTagList.java | 2 +-
.../apache/beam/sdk/values/TypeDescriptors.java | 25 +-
.../org/apache/beam/sdk/values/TypedPValue.java | 4 +-
.../org/apache/beam/SdkCoreApiSurfaceTest.java | 2 -
.../java/org/apache/beam/sdk/PipelineTest.java | 92 +-
.../org/apache/beam/sdk/io/AvroSourceTest.java | 10 +-
.../apache/beam/sdk/io/CountingSourceTest.java | 6 +-
.../apache/beam/sdk/io/FileBasedSourceTest.java | 8 +-
.../beam/sdk/io/OffsetBasedSourceTest.java | 8 +-
.../java/org/apache/beam/sdk/io/ReadTest.java | 4 +-
.../org/apache/beam/sdk/io/TFRecordIOTest.java | 20 +-
.../java/org/apache/beam/sdk/io/TextIOTest.java | 113 +-
.../org/apache/beam/sdk/io/XmlSinkTest.java | 89 +-
.../org/apache/beam/sdk/io/XmlSourceTest.java | 250 +--
.../apache/beam/sdk/metrics/MetricsTest.java | 2 +-
.../apache/beam/sdk/options/GcpOptionsTest.java | 171 ---
.../sdk/options/GoogleApiDebugOptionsTest.java | 145 --
.../sdk/options/PipelineOptionsFactoryTest.java | 4 +-
.../beam/sdk/runners/PipelineRunnerTest.java | 46 +-
.../runners/dataflow/TestCountingSource.java | 2 +-
.../beam/sdk/testing/BigqueryMatcherTest.java | 176 ---
.../beam/sdk/testing/SourceTestUtilsTest.java | 2 +-
.../beam/sdk/testing/StaticWindowsTest.java | 10 +-
.../beam/sdk/testing/TestPipelineTest.java | 6 +-
.../apache/beam/sdk/transforms/CreateTest.java | 20 +-
.../apache/beam/sdk/transforms/ParDoTest.java | 293 ++--
.../beam/sdk/transforms/SplittableDoFnTest.java | 107 +-
.../sdk/transforms/display/DisplayDataTest.java | 17 +
.../sdk/transforms/windowing/TriggersTest.java | 100 --
...mptAndTimeBoundedExponentialBackOffTest.java | 213 ---
.../AttemptBoundedExponentialBackOffTest.java | 85 --
.../apache/beam/sdk/util/DefaultBucketTest.java | 112 --
.../beam/sdk/util/GcpProjectUtilTest.java | 76 -
.../util/GcsIOChannelFactoryRegistrarTest.java | 44 -
.../beam/sdk/util/GcsIOChannelFactoryTest.java | 43 -
.../beam/sdk/util/GcsPathValidatorTest.java | 87 --
.../org/apache/beam/sdk/util/GcsUtilTest.java | 798 ----------
.../IntervalBoundedExponentialBackOffTest.java | 100 --
.../util/RetryHttpRequestInitializerTest.java | 290 ----
.../beam/sdk/values/PCollectionTupleTest.java | 8 +-
.../apache/beam/sdk/values/TypedPValueTest.java | 46 +-
sdks/java/extensions/gcp-core/pom.xml | 217 +++
.../beam/sdk/options/BigQueryOptions.java | 32 +
.../options/CloudResourceManagerOptions.java | 40 +
.../org/apache/beam/sdk/options/GcpOptions.java | 227 +++
.../options/GcpPipelineOptionsRegistrar.java | 39 +
.../org/apache/beam/sdk/options/GcsOptions.java | 154 ++
.../beam/sdk/options/GoogleApiDebugOptions.java | 87 ++
.../apache/beam/sdk/options/PubsubOptions.java | 36 +
.../apache/beam/sdk/options/package-info.java | 22 +
.../beam/sdk/testing/BigqueryMatcher.java | 256 ++++
.../apache/beam/sdk/testing/package-info.java | 21 +
.../apache/beam/sdk/util/CredentialFactory.java | 29 +
.../org/apache/beam/sdk/util/DefaultBucket.java | 105 ++
.../beam/sdk/util/GcpCredentialFactory.java | 67 +
.../apache/beam/sdk/util/GcpProjectUtil.java | 106 ++
.../beam/sdk/util/GcsIOChannelFactory.java | 111 ++
.../sdk/util/GcsIOChannelFactoryRegistrar.java | 38 +
.../apache/beam/sdk/util/GcsPathValidator.java | 95 ++
.../java/org/apache/beam/sdk/util/GcsUtil.java | 798 ++++++++++
.../beam/sdk/util/NoopCredentialFactory.java | 68 +
.../sdk/util/NullCredentialInitializer.java | 62 +
.../org/apache/beam/sdk/util/Transport.java | 178 +++
.../org/apache/beam/sdk/util/package-info.java | 20 +
.../org/apache/beam/GcpCoreApiSurfaceTest.java | 62 +
.../apache/beam/sdk/options/GcpOptionsTest.java | 171 +++
.../sdk/options/GoogleApiDebugOptionsTest.java | 145 ++
.../beam/sdk/testing/BigqueryMatcherTest.java | 176 +++
.../apache/beam/sdk/util/DefaultBucketTest.java | 112 ++
.../beam/sdk/util/GcpProjectUtilTest.java | 76 +
.../util/GcsIOChannelFactoryRegistrarTest.java | 44 +
.../beam/sdk/util/GcsIOChannelFactoryTest.java | 43 +
.../beam/sdk/util/GcsPathValidatorTest.java | 87 ++
.../org/apache/beam/sdk/util/GcsUtilTest.java | 798 ++++++++++
.../util/RetryHttpRequestInitializerTest.java | 290 ++++
sdks/java/extensions/pom.xml | 1 +
sdks/java/harness/pom.xml | 5 +
.../beam/fn/harness/fake/FakeStepContext.java | 2 +-
.../control/ProcessBundleHandlerTest.java | 30 +-
.../sdk/io/elasticsearch/ElasticsearchIO.java | 2 +-
.../sdk/io/elasticsearch/ElasticsearchIOIT.java | 2 +-
.../io/elasticsearch/ElasticsearchIOTest.java | 4 +-
sdks/java/io/google-cloud-platform/pom.xml | 20 +-
.../sdk/io/gcp/bigquery/BatchLoadBigQuery.java | 180 ---
.../beam/sdk/io/gcp/bigquery/BatchLoads.java | 225 +++
.../sdk/io/gcp/bigquery/BigQueryHelpers.java | 13 +
.../beam/sdk/io/gcp/bigquery/BigQueryIO.java | 172 +--
.../sdk/io/gcp/bigquery/BigQuerySourceBase.java | 33 +-
.../io/gcp/bigquery/BigQueryTableSource.java | 34 +-
.../beam/sdk/io/gcp/bigquery/CreateTables.java | 127 ++
.../io/gcp/bigquery/GenerateShardedTable.java | 47 +
.../beam/sdk/io/gcp/bigquery/PrepareWrite.java | 81 +
.../beam/sdk/io/gcp/bigquery/ShardedKey.java | 25 +-
.../sdk/io/gcp/bigquery/StreamWithDeDup.java | 90 --
.../sdk/io/gcp/bigquery/StreamingInserts.java | 79 +
.../sdk/io/gcp/bigquery/StreamingWriteFn.java | 81 +-
.../io/gcp/bigquery/StreamingWriteTables.java | 86 ++
.../sdk/io/gcp/bigquery/TableDestination.java | 76 +
.../io/gcp/bigquery/TableDestinationCoder.java | 60 +
.../sdk/io/gcp/bigquery/TableRowWriter.java | 19 +-
.../sdk/io/gcp/bigquery/TagWithUniqueIds.java | 62 +
.../gcp/bigquery/TagWithUniqueIdsAndTable.java | 135 --
.../sdk/io/gcp/bigquery/TransformingSource.java | 4 +-
.../beam/sdk/io/gcp/bigquery/WriteBundles.java | 82 -
.../io/gcp/bigquery/WriteBundlesToFiles.java | 157 ++
.../sdk/io/gcp/bigquery/WritePartition.java | 163 +-
.../beam/sdk/io/gcp/bigquery/WriteRename.java | 71 +-
.../beam/sdk/io/gcp/bigquery/WriteTables.java | 58 +-
.../beam/sdk/io/gcp/bigtable/BigtableIO.java | 6 +-
.../io/gcp/pubsub/PubsubUnboundedSource.java | 6 +-
.../sdk/io/gcp/bigquery/BigQueryIOTest.java | 1419 +++++++-----------
.../io/gcp/bigquery/FakeBigQueryServices.java | 166 ++
.../sdk/io/gcp/bigquery/FakeDatasetService.java | 208 +++
.../sdk/io/gcp/bigquery/FakeJobService.java | 404 +++++
.../sdk/io/gcp/bigquery/TableContainer.java | 61 +
.../sdk/io/gcp/bigtable/BigtableIOTest.java | 8 +-
.../gcp/pubsub/PubsubUnboundedSourceTest.java | 4 +-
.../hadoop/inputformat/HadoopInputFormatIO.java | 4 +-
.../inputformat/HadoopInputFormatIOTest.java | 19 +-
.../integration/tests/HIFIOCassandraIT.java | 6 +-
.../integration/tests/HIFIOElasticIT.java | 4 +-
.../SmallITCluster/cassandra-svc-rc.yaml | 88 --
.../cassandra/SmallITCluster/start-up.sh | 21 -
.../cassandra/SmallITCluster/teardown.sh | 21 -
.../kubernetes/cassandra/data-load-setup.sh | 29 -
.../resources/kubernetes/cassandra/data-load.sh | 67 -
.../LargeProductionCluster/es-services.yaml | 277 ----
.../LargeProductionCluster/start-up.sh | 21 -
.../LargeProductionCluster/teardown.sh | 20 -
.../SmallITCluster/elasticsearch-svc-rc.yaml | 84 --
.../elasticsearch/SmallITCluster/start-up.sh | 22 -
.../elasticsearch/SmallITCluster/teardown.sh | 20 -
.../kubernetes/elasticsearch/data-load-setup.sh | 26 -
.../kubernetes/elasticsearch/data-load.sh | 33 -
.../kubernetes/elasticsearch/es_test_data.py | 299 ----
.../kubernetes/elasticsearch/show-health.sh | 25 -
sdks/java/io/hadoop/pom.xml | 15 +-
.../org/apache/beam/sdk/io/hbase/HBaseIO.java | 7 +-
.../apache/beam/sdk/io/hbase/HBaseIOTest.java | 2 +-
.../apache/beam/sdk/io/hdfs/HDFSFileSource.java | 6 +-
.../beam/sdk/io/hdfs/HDFSFileSourceTest.java | 4 +-
.../org/apache/beam/sdk/io/jdbc/JdbcIO.java | 42 +-
.../org/apache/beam/sdk/io/jdbc/JdbcIOTest.java | 10 +-
.../postgres-service-for-local-dev.yml | 28 -
.../src/test/resources/kubernetes/postgres.yml | 56 -
.../jdbc/src/test/resources/kubernetes/setup.sh | 19 -
.../src/test/resources/kubernetes/teardown.sh | 19 -
.../java/org/apache/beam/sdk/io/jms/JmsIO.java | 2 +-
.../org/apache/beam/sdk/io/jms/JmsIOTest.java | 4 +-
.../org/apache/beam/sdk/io/kafka/KafkaIO.java | 12 +-
.../apache/beam/sdk/io/kafka/KafkaIOTest.java | 8 +-
.../beam/sdk/io/kinesis/KinesisSource.java | 2 +-
.../beam/sdk/io/mongodb/MongoDbGridFSIO.java | 11 +-
.../apache/beam/sdk/io/mongodb/MongoDbIO.java | 2 +-
.../sdk/io/mongodb/MongoDBGridFSIOTest.java | 2 +-
.../org/apache/beam/sdk/io/mqtt/MqttIO.java | 2 +-
sdks/python/.pylintrc | 6 +-
sdks/python/apache_beam/coders/coder_impl.py | 50 +-
sdks/python/apache_beam/coders/coders.py | 6 +-
sdks/python/apache_beam/error.py | 4 +
.../examples/complete/game/user_score.py | 8 +-
.../examples/complete/top_wikipedia_sessions.py | 8 -
.../examples/cookbook/datastore_wordcount.py | 15 +-
.../examples/cookbook/group_with_coder.py | 6 +-
.../examples/cookbook/multiple_output_pardo.py | 47 +-
.../examples/snippets/snippets_test.py | 33 +-
sdks/python/apache_beam/examples/wordcount.py | 32 +-
.../apache_beam/examples/wordcount_debugging.py | 4 -
sdks/python/apache_beam/internal/gcp/auth.py | 80 +-
.../apache_beam/internal/gcp/auth_test.py | 44 -
.../apache_beam/internal/gcp/json_value.py | 6 +
sdks/python/apache_beam/internal/pickler.py | 8 +-
sdks/python/apache_beam/io/avroio_test.py | 14 +-
sdks/python/apache_beam/io/concat_source.py | 74 +-
.../python/apache_beam/io/concat_source_test.py | 12 +-
sdks/python/apache_beam/io/filebasedsource.py | 53 +-
.../apache_beam/io/filebasedsource_test.py | 25 +-
sdks/python/apache_beam/io/fileio.py | 158 +-
sdks/python/apache_beam/io/fileio_test.py | 45 +-
sdks/python/apache_beam/io/filesystem.py | 1 +
sdks/python/apache_beam/io/filesystems_util.py | 10 +-
sdks/python/apache_beam/io/gcp/bigquery.py | 32 +-
.../io/gcp/datastore/v1/datastoreio.py | 24 +-
.../io/gcp/datastore/v1/datastoreio_test.py | 4 +-
.../apache_beam/io/gcp/datastore/v1/helper.py | 16 +-
.../io/gcp/datastore/v1/query_splitter.py | 2 +-
sdks/python/apache_beam/io/gcp/gcsfilesystem.py | 7 +-
.../io/gcp/tests/bigquery_matcher.py | 3 +-
sdks/python/apache_beam/io/iobase.py | 7 +-
sdks/python/apache_beam/io/localfilesystem.py | 3 +-
sdks/python/apache_beam/io/range_trackers.py | 19 +-
sdks/python/apache_beam/io/source_test_utils.py | 79 +-
.../apache_beam/io/source_test_utils_test.py | 20 +-
sdks/python/apache_beam/io/textio.py | 15 +-
sdks/python/apache_beam/io/textio_test.py | 18 +-
sdks/python/apache_beam/metrics/cells.py | 28 +-
sdks/python/apache_beam/metrics/execution.py | 3 +-
sdks/python/apache_beam/metrics/metric.py | 9 +-
sdks/python/apache_beam/pvalue.py | 26 +-
sdks/python/apache_beam/runners/common.pxd | 2 +-
sdks/python/apache_beam/runners/common.py | 15 +-
.../runners/dataflow/dataflow_metrics_test.py | 3 +-
.../runners/dataflow/dataflow_runner.py | 29 +-
.../runners/dataflow/internal/apiclient.py | 9 +-
.../runners/dataflow/internal/dependency.py | 6 +-
.../runners/dataflow/test_dataflow_runner.py | 4 -
.../runners/direct/bundle_factory.py | 14 +-
sdks/python/apache_beam/runners/direct/clock.py | 9 +-
.../consumer_tracking_pipeline_visitor_test.py | 2 +-
.../apache_beam/runners/direct/direct_runner.py | 6 +
.../runners/direct/evaluation_context.py | 21 +-
.../apache_beam/runners/direct/executor.py | 21 +-
.../runners/direct/transform_evaluator.py | 19 +-
.../runners/direct/transform_result.py | 45 +-
.../runners/direct/watermark_manager.py | 4 +-
sdks/python/apache_beam/runners/runner.py | 17 +-
.../apache_beam/tests/pipeline_verifiers.py | 7 +-
sdks/python/apache_beam/transforms/combiners.py | 62 +-
.../apache_beam/transforms/combiners_test.py | 4 +-
sdks/python/apache_beam/transforms/core.py | 112 +-
.../apache_beam/transforms/create_test.py | 18 +-
sdks/python/apache_beam/transforms/display.py | 4 +-
.../apache_beam/transforms/display_test.py | 36 +
.../python/apache_beam/transforms/ptransform.py | 38 +-
.../apache_beam/transforms/ptransform_test.py | 19 +-
.../python/apache_beam/transforms/sideinputs.py | 11 +-
sdks/python/apache_beam/transforms/trigger.py | 26 +-
.../apache_beam/transforms/trigger_test.py | 6 +-
sdks/python/apache_beam/typehints/decorators.py | 26 +-
.../apache_beam/typehints/trivial_inference.py | 26 +-
.../typehints/trivial_inference_test.py | 3 +-
sdks/python/apache_beam/typehints/typecheck.py | 11 +-
sdks/python/apache_beam/typehints/typehints.py | 66 +-
.../apache_beam/typehints/typehints_test.py | 7 +-
.../apache_beam/utils/annotations_test.py | 2 +-
sdks/python/apache_beam/utils/path.py | 3 +-
.../apache_beam/utils/pipeline_options.py | 95 +-
.../apache_beam/utils/pipeline_options_test.py | 52 +-
sdks/python/apache_beam/utils/proto_utils.py | 15 +-
sdks/python/apache_beam/utils/retry.py | 14 +-
sdks/python/apache_beam/utils/timestamp.py | 6 +-
sdks/python/apache_beam/utils/value_provider.py | 103 ++
.../apache_beam/utils/value_provider_test.py | 145 ++
sdks/python/apache_beam/utils/windowed_value.py | 17 +-
sdks/python/run_pylint.sh | 2 +-
sdks/python/tox.ini | 18 +-
694 files changed, 32745 insertions(+), 32281 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/aa07a1d4/pom.xml
----------------------------------------------------------------------
[07/50] [abbrv] beam git commit: Explodes windows before GBKIKWI
Posted by dh...@apache.org.
Explodes windows before GBKIKWI
Also:
* Adds a test for windowed side inputs that requires this behavior.
* Adds a test category for SDF with windowed side inputs.
Runners should gradually implement support for this category; for now,
only the direct runner does.
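The mechanism relied on here: declaring a BoundedWindow parameter in a @ProcessElement method
marks the DoFn as window-observing, so the runner must invoke it once per window, and each output
then belongs to exactly one window. A minimal sketch of such an identity DoFn, mirroring the
ExplodeWindowsFn added in the diff below (the class name here is only illustrative):

    import org.apache.beam.sdk.transforms.DoFn;
    import org.apache.beam.sdk.transforms.windowing.BoundedWindow;

    /** Identity DoFn; observing the window forces one invocation per window. */
    class ExplodeWindowsSketch<T> extends DoFn<T, T> {
      @ProcessElement
      public void process(ProcessContext c, BoundedWindow window) {
        // The window parameter is not used; its presence alone tells the runner this
        // DoFn observes windows, so multi-window values are exploded and the output
        // is emitted into the single window of each invocation.
        c.output(c.element());
      }
    }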
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/6ac3ac50
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/6ac3ac50
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/6ac3ac50
Branch: refs/heads/DSL_SQL
Commit: 6ac3ac50fec2eb02927c0a07ca928967cfef5652
Parents: b93de58
Author: Eugene Kirpichov <ki...@google.com>
Authored: Mon Apr 17 11:28:24 2017 -0700
Committer: Eugene Kirpichov <ki...@google.com>
Committed: Tue Apr 18 18:02:07 2017 -0700
----------------------------------------------------------------------
.../beam/runners/core/SplittableParDo.java | 75 +++++++++---------
.../beam/runners/core/SplittableParDoTest.java | 82 +++++++-------------
runners/flink/runner/pom.xml | 3 +-
...esSplittableParDoWithWindowedSideInputs.java | 26 +++++++
.../beam/sdk/transforms/SplittableDoFnTest.java | 41 ++++++++++
5 files changed, 137 insertions(+), 90 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/6ac3ac50/runners/core-java/src/main/java/org/apache/beam/runners/core/SplittableParDo.java
----------------------------------------------------------------------
diff --git a/runners/core-java/src/main/java/org/apache/beam/runners/core/SplittableParDo.java b/runners/core-java/src/main/java/org/apache/beam/runners/core/SplittableParDo.java
index 44db1f7..31d89ee 100644
--- a/runners/core-java/src/main/java/org/apache/beam/runners/core/SplittableParDo.java
+++ b/runners/core-java/src/main/java/org/apache/beam/runners/core/SplittableParDo.java
@@ -19,10 +19,8 @@ package org.apache.beam.runners.core;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
-import static com.google.common.base.Preconditions.checkState;
import com.google.common.annotations.VisibleForTesting;
-import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import java.util.List;
import java.util.UUID;
@@ -138,6 +136,12 @@ public class SplittableParDo<InputT, OutputT, RestrictionT>
.setCoder(splitCoder)
.apply("Split restriction", ParDo.of(new SplitRestrictionFn<InputT, RestrictionT>(fn)))
.setCoder(splitCoder)
+ // ProcessFn requires all input elements to be in a single window and have a single
+ // element per work item. This must precede the unique keying so each key has a single
+ // associated element.
+ .apply(
+ "Explode windows",
+ ParDo.of(new ExplodeWindowsFn<ElementAndRestriction<InputT, RestrictionT>>()))
.apply(
"Assign unique key",
WithKeys.of(new RandomUniqueKeyFn<ElementAndRestriction<InputT, RestrictionT>>()))
@@ -158,6 +162,18 @@ public class SplittableParDo<InputT, OutputT, RestrictionT>
}
/**
+ * A {@link DoFn} that forces each of its outputs to be in a single window, by indicating to the
+ * runner that it observes the window of its input element, so the runner is forced to apply it to
+ * each input in a single window and thus its output is also in a single window.
+ */
+ private static class ExplodeWindowsFn<InputT> extends DoFn<InputT, InputT> {
+ @ProcessElement
+ public void process(ProcessContext c, BoundedWindow window) {
+ c.output(c.element());
+ }
+ }
+
+ /**
* Runner-specific primitive {@link GroupByKey GroupByKey-like} {@link PTransform} that produces
* {@link KeyedWorkItem KeyedWorkItems} so that downstream transforms can access state and timers.
*
@@ -317,6 +333,13 @@ public class SplittableParDo<InputT, OutputT, RestrictionT>
* The heart of splittable {@link DoFn} execution: processes a single (element, restriction) pair
* by creating a tracker for the restriction and checkpointing/resuming processing later if
* necessary.
+ *
+ * <p>Takes {@link KeyedWorkItem} and assumes that the KeyedWorkItem contains a single element
+ * (or a single timer set by {@link ProcessFn} itself) in a single window. This is necessary
+ * because {@link ProcessFn} sets timers, and timers are namespaced to a single window and it
+ * should be the window of the input element.
+ *
+ * <p>See also: https://issues.apache.org/jira/browse/BEAM-1983
*/
@VisibleForTesting
public static class ProcessFn<
@@ -441,7 +464,18 @@ public class SplittableParDo<InputT, OutputT, RestrictionT>
// Subsequent calls are timer firings and the element has to be retrieved from the state.
TimerInternals.TimerData timer = Iterables.getOnlyElement(c.element().timersIterable(), null);
boolean isSeedCall = (timer == null);
- StateNamespace stateNamespace = isSeedCall ? StateNamespaces.global() : timer.getNamespace();
+ StateNamespace stateNamespace;
+ if (isSeedCall) {
+ WindowedValue<ElementAndRestriction<InputT, RestrictionT>> windowedValue =
+ Iterables.getOnlyElement(c.element().elementsIterable());
+ BoundedWindow window = Iterables.getOnlyElement(windowedValue.getWindows());
+ stateNamespace =
+ StateNamespaces.window(
+ (Coder<BoundedWindow>) inputWindowingStrategy.getWindowFn().windowCoder(), window);
+ } else {
+ stateNamespace = timer.getNamespace();
+ }
+
ValueState<WindowedValue<InputT>> elementState =
stateInternals.state(stateNamespace, elementTag);
ValueState<RestrictionT> restrictionState =
@@ -451,15 +485,8 @@ public class SplittableParDo<InputT, OutputT, RestrictionT>
ElementAndRestriction<WindowedValue<InputT>, RestrictionT> elementAndRestriction;
if (isSeedCall) {
- // The element and restriction are available in c.element().
- // elementsIterable() will, by construction of SplittableParDo, contain the same value
- // potentially in several different windows. We implode this into a single WindowedValue
- // in order to simplify the rest of the code and avoid iterating over elementsIterable()
- // explicitly. The windows of this WindowedValue will be propagated to windows of the
- // output. This is correct because a splittable DoFn is not allowed to inspect the window
- // of its element.
WindowedValue<ElementAndRestriction<InputT, RestrictionT>> windowedValue =
- implodeWindows(c.element().elementsIterable());
+ Iterables.getOnlyElement(c.element().elementsIterable());
WindowedValue<InputT> element = windowedValue.withValue(windowedValue.getValue().element());
elementState.write(element);
elementAndRestriction =
@@ -498,32 +525,6 @@ public class SplittableParDo<InputT, OutputT, RestrictionT>
stateNamespace, timerInternals.currentProcessingTime(), TimeDomain.PROCESSING_TIME));
}
- /**
- * Does the opposite of {@link WindowedValue#explodeWindows()} - creates a single {@link
- * WindowedValue} from a collection of {@link WindowedValue}'s that is known to contain copies
- * of the same value with the same timestamp, but different window sets.
- *
- * <p>This is only legal to do because we know that {@link RandomUniqueKeyFn} created unique
- * keys for every {@link ElementAndRestriction}, so if there's multiple {@link WindowedValue}'s
- * for the same key, that means only that the windows of that {@link ElementAndRestriction} are
- * being delivered separately rather than all at once. It is also legal to do because splittable
- * {@link DoFn} is not allowed to access the window of its element, so we can propagate the full
- * set of windows of its input to its output.
- */
- private static <InputT, RestrictionT>
- WindowedValue<ElementAndRestriction<InputT, RestrictionT>> implodeWindows(
- Iterable<WindowedValue<ElementAndRestriction<InputT, RestrictionT>>> values) {
- WindowedValue<ElementAndRestriction<InputT, RestrictionT>> first =
- Iterables.getFirst(values, null);
- checkState(first != null, "Got a KeyedWorkItem with no elements and no timers");
- ImmutableList.Builder<BoundedWindow> windows = ImmutableList.builder();
- for (WindowedValue<ElementAndRestriction<InputT, RestrictionT>> value : values) {
- windows.addAll(value.getWindows());
- }
- return WindowedValue.of(
- first.getValue(), first.getTimestamp(), windows.build(), first.getPane());
- }
-
private DoFn<InputT, OutputT>.Context wrapContext(final Context baseContext) {
return fn.new Context() {
@Override
http://git-wip-us.apache.org/repos/asf/beam/blob/6ac3ac50/runners/core-java/src/test/java/org/apache/beam/runners/core/SplittableParDoTest.java
----------------------------------------------------------------------
diff --git a/runners/core-java/src/test/java/org/apache/beam/runners/core/SplittableParDoTest.java b/runners/core-java/src/test/java/org/apache/beam/runners/core/SplittableParDoTest.java
index 5629635..1a44453 100644
--- a/runners/core-java/src/test/java/org/apache/beam/runners/core/SplittableParDoTest.java
+++ b/runners/core-java/src/test/java/org/apache/beam/runners/core/SplittableParDoTest.java
@@ -30,6 +30,7 @@ import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
+import java.util.Collections;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.concurrent.Executors;
@@ -194,11 +195,6 @@ public class SplittableParDoTest {
// ------------------------------- Tests for ProcessFn ---------------------------------
- enum WindowExplosion {
- EXPLODE_WINDOWS,
- DO_NOT_EXPLODE_WINDOWS
- }
-
/**
* A helper for testing {@link SplittableParDo.ProcessFn} on 1 element (but possibly over multiple
* {@link DoFn.ProcessElement} calls).
@@ -293,24 +289,13 @@ public class SplittableParDoTest {
ElementAndRestriction.of(element, restriction),
currentProcessingTime,
GlobalWindow.INSTANCE,
- PaneInfo.ON_TIME_AND_ONLY_FIRING),
- WindowExplosion.DO_NOT_EXPLODE_WINDOWS);
+ PaneInfo.ON_TIME_AND_ONLY_FIRING));
}
- void startElement(
- WindowedValue<ElementAndRestriction<InputT, RestrictionT>> windowedValue,
- WindowExplosion explosion)
+ void startElement(WindowedValue<ElementAndRestriction<InputT, RestrictionT>> windowedValue)
throws Exception {
- switch (explosion) {
- case EXPLODE_WINDOWS:
- tester.processElement(
- KeyedWorkItems.elementsWorkItem("key", windowedValue.explodeWindows()));
- break;
- case DO_NOT_EXPLODE_WINDOWS:
- tester.processElement(
- KeyedWorkItems.elementsWorkItem("key", Arrays.asList(windowedValue)));
- break;
- }
+ tester.processElement(
+ KeyedWorkItems.elementsWorkItem("key", Collections.singletonList(windowedValue)));
}
/**
@@ -394,46 +379,39 @@ public class SplittableParDoTest {
}
@Test
- public void testTrivialProcessFnPropagatesOutputsWindowsAndTimestamp() throws Exception {
- // Tests that ProcessFn correctly propagates windows and timestamp of the element
+ public void testTrivialProcessFnPropagatesOutputWindowAndTimestamp() throws Exception {
+ // Tests that ProcessFn correctly propagates the window and timestamp of the element
// inside the KeyedWorkItem.
// The underlying DoFn is actually monolithic, so this doesn't test splitting.
DoFn<Integer, String> fn = new ToStringFn();
Instant base = Instant.now();
- IntervalWindow w1 =
+ IntervalWindow w =
new IntervalWindow(
base.minus(Duration.standardMinutes(1)), base.plus(Duration.standardMinutes(1)));
- IntervalWindow w2 =
- new IntervalWindow(
- base.minus(Duration.standardMinutes(2)), base.plus(Duration.standardMinutes(2)));
- IntervalWindow w3 =
- new IntervalWindow(
- base.minus(Duration.standardMinutes(3)), base.plus(Duration.standardMinutes(3)));
-
- for (WindowExplosion explosion : WindowExplosion.values()) {
- ProcessFnTester<Integer, String, SomeRestriction, SomeRestrictionTracker> tester =
- new ProcessFnTester<>(
- base, fn, BigEndianIntegerCoder.of(), SerializableCoder.of(SomeRestriction.class),
- MAX_OUTPUTS_PER_BUNDLE, MAX_BUNDLE_DURATION);
- tester.startElement(
- WindowedValue.of(
- ElementAndRestriction.of(42, new SomeRestriction()),
- base,
- Arrays.asList(w1, w2, w3),
- PaneInfo.ON_TIME_AND_ONLY_FIRING),
- explosion);
-
- for (IntervalWindow w : new IntervalWindow[] {w1, w2, w3}) {
- assertEquals(
- Arrays.asList(
- TimestampedValue.of("42a", base),
- TimestampedValue.of("42b", base),
- TimestampedValue.of("42c", base)),
- tester.peekOutputElementsInWindow(w));
- }
- }
+
+ ProcessFnTester<Integer, String, SomeRestriction, SomeRestrictionTracker> tester =
+ new ProcessFnTester<>(
+ base,
+ fn,
+ BigEndianIntegerCoder.of(),
+ SerializableCoder.of(SomeRestriction.class),
+ MAX_OUTPUTS_PER_BUNDLE,
+ MAX_BUNDLE_DURATION);
+ tester.startElement(
+ WindowedValue.of(
+ ElementAndRestriction.of(42, new SomeRestriction()),
+ base,
+ Collections.singletonList(w),
+ PaneInfo.ON_TIME_AND_ONLY_FIRING));
+
+ assertEquals(
+ Arrays.asList(
+ TimestampedValue.of("42a", base),
+ TimestampedValue.of("42b", base),
+ TimestampedValue.of("42c", base)),
+ tester.peekOutputElementsInWindow(w));
}
private static class WatermarkUpdateFn extends DoFn<Instant, String> {
http://git-wip-us.apache.org/repos/asf/beam/blob/6ac3ac50/runners/flink/runner/pom.xml
----------------------------------------------------------------------
diff --git a/runners/flink/runner/pom.xml b/runners/flink/runner/pom.xml
index 95880f4..1e6452d 100644
--- a/runners/flink/runner/pom.xml
+++ b/runners/flink/runner/pom.xml
@@ -91,7 +91,8 @@
org.apache.beam.sdk.testing.UsesMapState,
org.apache.beam.sdk.testing.UsesAttemptedMetrics,
org.apache.beam.sdk.testing.UsesCommittedMetrics,
- org.apache.beam.sdk.testing.UsesTestStream
+ org.apache.beam.sdk.testing.UsesTestStream,
+ org.apache.beam.sdk.testing.UsesSplittableParDoWithWindowedSideInputs
</excludedGroups>
<parallel>none</parallel>
<failIfNoTests>true</failIfNoTests>
http://git-wip-us.apache.org/repos/asf/beam/blob/6ac3ac50/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/UsesSplittableParDoWithWindowedSideInputs.java
----------------------------------------------------------------------
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/UsesSplittableParDoWithWindowedSideInputs.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/UsesSplittableParDoWithWindowedSideInputs.java
new file mode 100644
index 0000000..2b1d673
--- /dev/null
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/UsesSplittableParDoWithWindowedSideInputs.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.testing;
+
+import org.apache.beam.sdk.transforms.ParDo;
+
+/**
+ * Category tag for validation tests which utilize splittable {@link ParDo} and use
+ * windowed side inputs.
+ */
+public interface UsesSplittableParDoWithWindowedSideInputs {}
http://git-wip-us.apache.org/repos/asf/beam/blob/6ac3ac50/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/SplittableDoFnTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/SplittableDoFnTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/SplittableDoFnTest.java
index 30329f4..a0f1fd3 100644
--- a/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/SplittableDoFnTest.java
+++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/SplittableDoFnTest.java
@@ -33,6 +33,7 @@ import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.testing.TestStream;
import org.apache.beam.sdk.testing.UsesSplittableParDo;
+import org.apache.beam.sdk.testing.UsesSplittableParDoWithWindowedSideInputs;
import org.apache.beam.sdk.testing.UsesTestStream;
import org.apache.beam.sdk.testing.ValidatesRunner;
import org.apache.beam.sdk.transforms.DoFn.BoundedPerElement;
@@ -252,6 +253,46 @@ public class SplittableDoFnTest implements Serializable {
p.run();
}
+ @Test
+ @Category({
+ ValidatesRunner.class,
+ UsesSplittableParDo.class,
+ UsesSplittableParDoWithWindowedSideInputs.class
+ })
+ public void testWindowedSideInput() throws Exception {
+ PCollection<Integer> mainInput =
+ p.apply("main",
+ Create.timestamped(
+ TimestampedValue.of(0, new Instant(0)),
+ TimestampedValue.of(1, new Instant(1)),
+ TimestampedValue.of(2, new Instant(2)),
+ TimestampedValue.of(3, new Instant(3)),
+ TimestampedValue.of(4, new Instant(4)),
+ TimestampedValue.of(5, new Instant(5)),
+ TimestampedValue.of(6, new Instant(6)),
+ TimestampedValue.of(7, new Instant(7))))
+ .apply("window 2", Window.<Integer>into(FixedWindows.of(Duration.millis(2))));
+
+ PCollectionView<String> sideInput =
+ p.apply("side",
+ Create.timestamped(
+ TimestampedValue.of("a", new Instant(0)),
+ TimestampedValue.of("b", new Instant(4))))
+ .apply("window 4", Window.<String>into(FixedWindows.of(Duration.millis(4))))
+ .apply("singleton", View.<String>asSingleton());
+
+ PCollection<String> res =
+ mainInput.apply(ParDo.of(new SDFWithSideInput(sideInput)).withSideInputs(sideInput));
+
+ PAssert.that(res).containsInAnyOrder("a:0", "a:1", "a:2", "a:3", "b:4", "b:5", "b:6", "b:7");
+
+ p.run();
+
+ // TODO: also add test coverage when the SDF checkpoints - the resumed call should also
+ // properly access side inputs.
+ // TODO: also add test coverage for when some of the windows of the side input are not ready.
+ }
+
private static class SDFWithAdditionalOutput extends DoFn<Integer, String> {
private final TupleTag<String> additionalOutput;
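The test above applies SDFWithSideInput, which is defined elsewhere in SplittableDoFnTest and does not appear in this excerpt. A rough sketch of the shape such a splittable DoFn takes (the restriction handling below is an assumption for illustration, not a copy of the real class; it assumes Guava's checkState and an OffsetRange/OffsetRangeTracker restriction type from the SDK):

private static class SDFWithSideInput extends DoFn<Integer, String> {
  private final PCollectionView<String> sideInput;

  private SDFWithSideInput(PCollectionView<String> sideInput) {
    this.sideInput = sideInput;
  }

  @ProcessElement
  public void process(ProcessContext c, OffsetRangeTracker tracker) {
    // Claim the single offset in the trivial restriction, then read the side input for the
    // element's window and emit "<side>:<element>", matching the assertions in the test.
    checkState(tracker.tryClaim(tracker.currentRestriction().getFrom()));
    String side = c.sideInput(sideInput);
    c.output(side + ":" + c.element());
  }

  @GetInitialRestriction
  public OffsetRange getInitialRestriction(Integer element) {
    return new OffsetRange(0, 1);
  }
}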
[12/50] [abbrv] beam git commit: Refactor batch load job path,
and add support for data-dependent tables.
Posted by dh...@apache.org.
Refactor batch load job path, and add support for data-dependent tables.
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/8581caf3
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/8581caf3
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/8581caf3
Branch: refs/heads/DSL_SQL
Commit: 8581caf388ad688a0e79cfa154262d1e701dee10
Parents: 58ed5c7
Author: Reuven Lax <re...@google.com>
Authored: Wed Mar 29 07:34:10 2017 -0700
Committer: Eugene Kirpichov <ki...@google.com>
Committed: Tue Apr 18 21:12:50 2017 -0700
----------------------------------------------------------------------
.../sdk/io/gcp/bigquery/BatchLoadBigQuery.java | 180 ----------------
.../beam/sdk/io/gcp/bigquery/BatchLoads.java | 203 +++++++++++++++++++
.../beam/sdk/io/gcp/bigquery/BigQueryIO.java | 3 +-
.../sdk/io/gcp/bigquery/TableDestination.java | 17 +-
.../sdk/io/gcp/bigquery/TableRowWriter.java | 12 +-
.../beam/sdk/io/gcp/bigquery/WriteBundles.java | 82 --------
.../io/gcp/bigquery/WriteBundlesToFiles.java | 102 ++++++++++
.../sdk/io/gcp/bigquery/WritePartition.java | 95 ++++++---
.../beam/sdk/io/gcp/bigquery/WriteRename.java | 63 +++---
.../beam/sdk/io/gcp/bigquery/WriteTables.java | 47 ++---
.../sdk/io/gcp/bigquery/BigQueryIOTest.java | 27 +--
11 files changed, 469 insertions(+), 362 deletions(-)
----------------------------------------------------------------------
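At the core of the refactor, the bounded load path now consumes rows already paired with their destination (PCollection<KV<TableDestination, TableRow>>), which is what allows the destination table to depend on the data. An illustrative sketch of producing such a keyed collection, where rows stands for a PCollection<TableRow> (the routing rule and the two-argument TableDestination constructor are assumptions, not part of this commit):

PCollection<KV<TableDestination, TableRow>> keyedRows =
    rows.apply("KeyByDestination",
        MapElements.via(new SimpleFunction<TableRow, KV<TableDestination, TableRow>>() {
          @Override
          public KV<TableDestination, TableRow> apply(TableRow row) {
            // Hypothetical rule: route each row to a table derived from one of its fields.
            String tableSpec = "my-project:dataset.events_" + row.get("event_type");
            return KV.of(new TableDestination(tableSpec, null), row);
          }
        }));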
http://git-wip-us.apache.org/repos/asf/beam/blob/8581caf3/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoadBigQuery.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoadBigQuery.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoadBigQuery.java
deleted file mode 100644
index 160b231..0000000
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoadBigQuery.java
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.beam.sdk.io.gcp.bigquery;
-
-import com.google.api.services.bigquery.model.TableReference;
-import com.google.api.services.bigquery.model.TableRow;
-import java.io.IOException;
-import java.util.List;
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.TableRefToJson;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
-import org.apache.beam.sdk.options.BigQueryOptions;
-import org.apache.beam.sdk.options.ValueProvider;
-import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
-import org.apache.beam.sdk.transforms.Create;
-import org.apache.beam.sdk.transforms.GroupByKey;
-import org.apache.beam.sdk.transforms.MapElements;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.transforms.SimpleFunction;
-import org.apache.beam.sdk.transforms.View;
-import org.apache.beam.sdk.transforms.windowing.DefaultTrigger;
-import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
-import org.apache.beam.sdk.transforms.windowing.Window;
-import org.apache.beam.sdk.util.IOChannelFactory;
-import org.apache.beam.sdk.util.IOChannelUtils;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.beam.sdk.values.PCollectionTuple;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.beam.sdk.values.TupleTag;
-import org.apache.beam.sdk.values.TupleTagList;
-import org.apache.beam.sdk.values.TypeDescriptor;
-
-/**
- * PTransform that uses BigQuery batch-load jobs to write a PCollection to BigQuery.
- */
-class BatchLoadBigQuery<T> extends PTransform<PCollection<T>, WriteResult> {
- BigQueryIO.Write<T> write;
-
- BatchLoadBigQuery(BigQueryIO.Write<T> write) {
- this.write = write;
- }
-
- @Override
- public WriteResult expand(PCollection<T> input) {
- Pipeline p = input.getPipeline();
- BigQueryOptions options = p.getOptions().as(BigQueryOptions.class);
- ValueProvider<TableReference> table = write.getTableWithDefaultProject(options);
-
- final String stepUuid = BigQueryHelpers.randomUUIDString();
-
- String tempLocation = options.getTempLocation();
- String tempFilePrefix;
- try {
- IOChannelFactory factory = IOChannelUtils.getFactory(tempLocation);
- tempFilePrefix = factory.resolve(
- factory.resolve(tempLocation, "BigQueryWriteTemp"),
- stepUuid);
- } catch (IOException e) {
- throw new RuntimeException(
- String.format("Failed to resolve BigQuery temp location in %s", tempLocation),
- e);
- }
-
- // Create a singleton job ID token at execution time.
- PCollection<String> singleton = p.apply("Create", Create.of(tempFilePrefix));
- PCollectionView<String> jobIdTokenView = p
- .apply("TriggerIdCreation", Create.of("ignored"))
- .apply("CreateJobId", MapElements.via(
- new SimpleFunction<String, String>() {
- @Override
- public String apply(String input) {
- return stepUuid;
- }
- }))
- .apply(View.<String>asSingleton());
-
- PCollection<T> typedInputInGlobalWindow =
- input.apply(
- Window.<T>into(new GlobalWindows())
- .triggering(DefaultTrigger.of())
- .discardingFiredPanes());
- // Avoid applying the formatFunction if it is the identity formatter.
- PCollection<TableRow> inputInGlobalWindow;
- if (write.getFormatFunction() == BigQueryIO.IDENTITY_FORMATTER) {
- inputInGlobalWindow = (PCollection<TableRow>) typedInputInGlobalWindow;
- } else {
- inputInGlobalWindow =
- typedInputInGlobalWindow.apply(
- MapElements.into(new TypeDescriptor<TableRow>() {}).via(write.getFormatFunction()));
- }
-
- // PCollection of filename, file byte size.
- PCollection<KV<String, Long>> results = inputInGlobalWindow
- .apply("WriteBundles",
- ParDo.of(new WriteBundles(tempFilePrefix)));
-
- TupleTag<KV<Long, List<String>>> multiPartitionsTag =
- new TupleTag<KV<Long, List<String>>>("multiPartitionsTag") {};
- TupleTag<KV<Long, List<String>>> singlePartitionTag =
- new TupleTag<KV<Long, List<String>>>("singlePartitionTag") {};
-
- // Turn the list of files and record counts in a PCollectionView that can be used as a
- // side input.
- PCollectionView<Iterable<KV<String, Long>>> resultsView = results
- .apply("ResultsView", View.<KV<String, Long>>asIterable());
- PCollectionTuple partitions = singleton.apply(ParDo
- .of(new WritePartition(
- resultsView,
- multiPartitionsTag,
- singlePartitionTag))
- .withSideInputs(resultsView)
- .withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));
-
- // If WriteBundles produced more than MAX_NUM_FILES files or MAX_SIZE_BYTES bytes, then
- // the import needs to be split into multiple partitions, and those partitions will be
- // specified in multiPartitionsTag.
- PCollection<String> tempTables = partitions.get(multiPartitionsTag)
- .apply("MultiPartitionsGroupByKey", GroupByKey.<Long, List<String>>create())
- .apply("MultiPartitionsWriteTables", ParDo.of(new WriteTables(
- false,
- write.getBigQueryServices(),
- jobIdTokenView,
- tempFilePrefix,
- NestedValueProvider.of(table, new TableRefToJson()),
- write.getJsonSchema(),
- WriteDisposition.WRITE_EMPTY,
- CreateDisposition.CREATE_IF_NEEDED,
- write.getTableDescription()))
- .withSideInputs(jobIdTokenView));
-
- PCollectionView<Iterable<String>> tempTablesView = tempTables
- .apply("TempTablesView", View.<String>asIterable());
- singleton.apply(ParDo
- .of(new WriteRename(
- write.getBigQueryServices(),
- jobIdTokenView,
- NestedValueProvider.of(table, new TableRefToJson()),
- write.getWriteDisposition(),
- write.getCreateDisposition(),
- tempTablesView,
- write.getTableDescription()))
- .withSideInputs(tempTablesView, jobIdTokenView));
-
- // Write single partition to final table
- partitions.get(singlePartitionTag)
- .apply("SinglePartitionGroupByKey", GroupByKey.<Long, List<String>>create())
- .apply("SinglePartitionWriteTables", ParDo.of(new WriteTables(
- true,
- write.getBigQueryServices(),
- jobIdTokenView,
- tempFilePrefix,
- NestedValueProvider.of(table, new TableRefToJson()),
- write.getJsonSchema(),
- write.getWriteDisposition(),
- write.getCreateDisposition(),
- write.getTableDescription()))
- .withSideInputs(jobIdTokenView));
-
- return WriteResult.in(input.getPipeline());
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/8581caf3/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java
new file mode 100644
index 0000000..8594211
--- /dev/null
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.beam.sdk.io.gcp.bigquery;
+
+import com.google.api.services.bigquery.model.TableReference;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.TableSchema;
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import javax.annotation.Nullable;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
+import org.apache.beam.sdk.options.BigQueryOptions;
+import org.apache.beam.sdk.options.ValueProvider;
+import org.apache.beam.sdk.transforms.Create;
+import org.apache.beam.sdk.transforms.GroupByKey;
+import org.apache.beam.sdk.transforms.MapElements;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.SerializableFunction;
+import org.apache.beam.sdk.transforms.SimpleFunction;
+import org.apache.beam.sdk.transforms.View;
+import org.apache.beam.sdk.transforms.windowing.DefaultTrigger;
+import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
+import org.apache.beam.sdk.transforms.windowing.Window;
+import org.apache.beam.sdk.util.IOChannelFactory;
+import org.apache.beam.sdk.util.IOChannelUtils;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PCollectionTuple;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.beam.sdk.values.TupleTag;
+import org.apache.beam.sdk.values.TupleTagList;
+
+
+/**
+ * PTransform that uses BigQuery batch-load jobs to write a PCollection to BigQuery.
+ */
+class BatchLoads<T> extends
+ PTransform<PCollection<KV<TableDestination, TableRow>>, WriteResult> {
+ BigQueryIO.Write<T> write;
+
+ private static class ConstantSchemaFunction implements
+ SerializableFunction<TableDestination, TableSchema> {
+ @Nullable
+ private final String jsonSchema;
+
+ ConstantSchemaFunction(TableSchema schema) {
+ this.jsonSchema = BigQueryHelpers.toJsonString(schema);
+ }
+
+ @Override
+ @Nullable
+ public TableSchema apply(TableDestination table) {
+ return BigQueryHelpers.fromJsonString(jsonSchema, TableSchema.class);
+ }
+ }
+
+ BatchLoads(BigQueryIO.Write<T> write) {
+ this.write = write;
+ }
+
+ @Override
+ public WriteResult expand(PCollection<KV<TableDestination, TableRow>> input) {
+ Pipeline p = input.getPipeline();
+ BigQueryOptions options = p.getOptions().as(BigQueryOptions.class);
+ ValueProvider<TableReference> table = write.getTableWithDefaultProject(options);
+
+ final String stepUuid = BigQueryHelpers.randomUUIDString();
+
+ String tempLocation = options.getTempLocation();
+ String tempFilePrefix;
+ try {
+ IOChannelFactory factory = IOChannelUtils.getFactory(tempLocation);
+ tempFilePrefix = factory.resolve(
+ factory.resolve(tempLocation, "BigQueryWriteTemp"),
+ stepUuid);
+ } catch (IOException e) {
+ throw new RuntimeException(
+ String.format("Failed to resolve BigQuery temp location in %s", tempLocation),
+ e);
+ }
+
+ // Create a singleton job ID token at execution time. This will be used as the base for all
+ // load jobs issued from this instance of the transform.
+ PCollection<String> singleton = p.apply("Create", Create.of(tempFilePrefix));
+ PCollectionView<String> jobIdTokenView = p
+ .apply("TriggerIdCreation", Create.of("ignored"))
+ .apply("CreateJobId", MapElements.via(
+ new SimpleFunction<String, String>() {
+ @Override
+ public String apply(String input) {
+ return stepUuid;
+ }
+ }))
+ .apply(View.<String>asSingleton());
+
+ PCollection<KV<TableDestination, TableRow>> inputInGlobalWindow =
+ input.apply(
+ Window.<KV<TableDestination, TableRow>>into(new GlobalWindows())
+ .triggering(DefaultTrigger.of())
+ .discardingFiredPanes());
+
+ // PCollection of filename, file byte size, and table destination.
+ PCollection<WriteBundlesToFiles.Result> results = inputInGlobalWindow
+ .apply("WriteBundlesToFiles",
+ ParDo.of(new WriteBundlesToFiles(tempFilePrefix)));
+
+ TupleTag<KV<KV<TableDestination, Integer>, List<String>>> multiPartitionsTag =
+ new TupleTag<KV<KV<TableDestination, Integer>, List<String>>>("multiPartitionsTag") {};
+ TupleTag<KV<KV<TableDestination, Integer>, List<String>>> singlePartitionTag =
+ new TupleTag<KV<KV<TableDestination, Integer>, List<String>>>("singlePartitionTag") {};
+
+ // Turn the list of files and record counts into a PCollectionView that can be used as a
+ // side input.
+ PCollectionView<Iterable<WriteBundlesToFiles.Result>> resultsView = results
+ .apply("ResultsView", View.<WriteBundlesToFiles.Result>asIterable());
+ // This transform will look at the set of files written for each table, and if any table has
+ // too many files or bytes, will partition that table's files into multiple partitions for
+ // loading.
+ PCollectionTuple partitions = singleton.apply(ParDo
+ .of(new WritePartition(
+ write.getTable(),
+ write.getTableDescription(),
+ resultsView,
+ multiPartitionsTag,
+ singlePartitionTag))
+ .withSideInputs(resultsView)
+ .withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));
+
+ // Since BigQueryIO.java does not yet have support for per-table schemas, inject a constant
+ // schema function here. If no schema is specified, this function will return null.
+ SerializableFunction<TableDestination, TableSchema> schemaFunction =
+ new ConstantSchemaFunction(write.getSchema());
+
+ // If WriteBundlesToFiles produced more than MAX_NUM_FILES files or MAX_SIZE_BYTES bytes, then
+ // the import needs to be split into multiple partitions, and those partitions will be
+ // specified in multiPartitionsTag.
+ PCollection<KV<TableDestination, String>> tempTables = partitions.get(multiPartitionsTag)
+ // What's this GroupByKey for? Is it so that we get deterministic temp tables? If so, maybe
+ // Reshuffle is better here.
+ .apply("MultiPartitionsGroupByKey",
+ GroupByKey.<KV<TableDestination, Integer>, List<String>>create())
+ .apply("MultiPartitionsWriteTables", ParDo.of(new WriteTables(
+ false,
+ write.getBigQueryServices(),
+ jobIdTokenView,
+ tempFilePrefix,
+ WriteDisposition.WRITE_EMPTY,
+ CreateDisposition.CREATE_IF_NEEDED,
+ schemaFunction))
+ .withSideInputs(jobIdTokenView));
+
+ // This view maps each final table destination to the set of temporary partitioned tables
+ // the PCollection was loaded into.
+ PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView = tempTables
+ .apply("TempTablesView", View.<TableDestination, String>asMultimap());
+
+ singleton.apply(ParDo
+ .of(new WriteRename(
+ write.getBigQueryServices(),
+ jobIdTokenView,
+ write.getWriteDisposition(),
+ write.getCreateDisposition(),
+ tempTablesView,
+ write.getTableDescription()))
+ .withSideInputs(tempTablesView, jobIdTokenView));
+
+ // Write single partition to final table
+ partitions.get(singlePartitionTag)
+ .apply("SinglePartitionGroupByKey",
+ GroupByKey.<KV<TableDestination, Integer>, List<String>>create())
+ .apply("SinglePartitionWriteTables", ParDo.of(new WriteTables(
+ true,
+ write.getBigQueryServices(),
+ jobIdTokenView,
+ tempFilePrefix,
+ write.getWriteDisposition(),
+ write.getCreateDisposition(),
+ schemaFunction))
+ .withSideInputs(jobIdTokenView));
+
+ return WriteResult.in(input.getPipeline());
+ }
+}
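ConstantSchemaFunction above is a stopgap until BigQueryIO supports per-table schemas; the schemaFunction hook is what WriteTables applies to each destination it loads. A hypothetical per-table variant, purely to show the intended shape (the routing condition and schema are made up and not part of this commit):

SerializableFunction<TableDestination, TableSchema> perTableSchemas =
    new SerializableFunction<TableDestination, TableSchema>() {
      @Override
      public TableSchema apply(TableDestination destination) {
        // Hypothetical: give tables whose spec ends in "_raw" a single STRING payload column,
        // and return null (keep the existing table's schema) for everything else.
        if (destination.getTableSpec().endsWith("_raw")) {
          return new TableSchema().setFields(ImmutableList.of(
              new TableFieldSchema().setName("payload").setType("STRING")));
        }
        return null;
      }
    };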
http://git-wip-us.apache.org/repos/asf/beam/blob/8581caf3/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
index af19b83..f1baaf7 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
@@ -984,7 +984,8 @@ public class BigQueryIO {
if (input.isBounded() == IsBounded.UNBOUNDED) {
return rowsWithDestination.apply(new StreamingInserts(this));
} else {
- return input.apply(new BatchLoadBigQuery<T>(this));
+
+ return rowsWithDestination.apply(new BatchLoads<T>(this));
}
}
http://git-wip-us.apache.org/repos/asf/beam/blob/8581caf3/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
index 631afeb..1c2b256 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
@@ -20,6 +20,8 @@ package org.apache.beam.sdk.io.gcp.bigquery;
import com.google.api.services.bigquery.model.TableReference;
+import java.util.Objects;
+
/**
* Encapsulates a BigQuery table destination.
*/
@@ -42,7 +44,6 @@ public class TableDestination {
return tableSpec;
}
-
public TableReference getTableReference() {
return BigQueryHelpers.parseTableSpec(tableSpec);
}
@@ -50,4 +51,18 @@ public class TableDestination {
public String getTableDescription() {
return tableDescription;
}
+
+ @Override
+ public boolean equals(Object o) {
+ if (!(o instanceof TableDestination)) {
+ return false;
+ }
+ TableDestination other = (TableDestination) o;
+ return Objects.equals(tableSpec, other.tableSpec) && Objects.equals(tableDescription, other.tableDescription);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(tableSpec, tableDescription);
+ }
}
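The new equals()/hashCode() matter because TableDestination is now used as a map key: WriteBundlesToFiles keeps one writer per destination in a HashMap, and the temp-table view built in BatchLoads is a multimap keyed by destination, so two independently constructed but logically identical destinations must compare equal. A small illustration (assuming a tableSpec/description constructor like the one used elsewhere in this module):

Map<TableDestination, TableRowWriter> writers = new HashMap<>();
TableDestination first = new TableDestination("my-project:dataset.table", "test table");
TableDestination second = new TableDestination("my-project:dataset.table", "test table");
writers.put(first, new TableRowWriter("/tmp/prefix"));
// Holds only because equality is defined over the field values rather than object identity.
boolean found = writers.containsKey(second);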
http://git-wip-us.apache.org/repos/asf/beam/blob/8581caf3/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowWriter.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowWriter.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowWriter.java
index 014c498..a1f6153 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowWriter.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowWriter.java
@@ -48,6 +48,14 @@ class TableRowWriter {
protected String mimeType = MimeTypes.TEXT;
private CountingOutputStream out;
+ public class Result {
+ String filename;
+ long byteSize;
+ public Result(String filename, long byteSize) {
+ this.filename = filename;
+ this.byteSize = byteSize;
+ }
+ }
TableRowWriter(String basename) {
this.tempFilePrefix = basename;
}
@@ -77,8 +85,8 @@ class TableRowWriter {
out.write(NEWLINE);
}
- public final KV<String, Long> close() throws IOException {
+ public final Result close() throws IOException {
channel.close();
- return KV.of(fileName, out.getCount());
+ return new Result(fileName, out.getCount());
}
}
http://git-wip-us.apache.org/repos/asf/beam/blob/8581caf3/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundles.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundles.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundles.java
deleted file mode 100644
index 6219226..0000000
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundles.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.beam.sdk.io.gcp.bigquery;
-
-import com.google.api.services.bigquery.model.TableRow;
-import java.util.UUID;
-
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.display.DisplayData;
-import org.apache.beam.sdk.values.KV;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Writes each bundle of {@link TableRow} elements out to a separate file using
- * {@link TableRowWriter}.
- */
-class WriteBundles extends DoFn<TableRow, KV<String, Long>> {
- private static final Logger LOG = LoggerFactory.getLogger(WriteBundles.class);
-
- private transient TableRowWriter writer = null;
- private final String tempFilePrefix;
-
- WriteBundles(String tempFilePrefix) {
- this.tempFilePrefix = tempFilePrefix;
- }
-
- @ProcessElement
- public void processElement(ProcessContext c) throws Exception {
- if (writer == null) {
- writer = new TableRowWriter(tempFilePrefix);
- writer.open(UUID.randomUUID().toString());
- LOG.debug("Done opening writer {}", writer);
- }
- try {
- writer.write(c.element());
- } catch (Exception e) {
- // Discard write result and close the write.
- try {
- writer.close();
- // The writer does not need to be reset, as this DoFn cannot be reused.
- } catch (Exception closeException) {
- // Do not mask the exception that caused the write to fail.
- e.addSuppressed(closeException);
- }
- throw e;
- }
- }
-
- @FinishBundle
- public void finishBundle(Context c) throws Exception {
- if (writer != null) {
- c.output(writer.close());
- writer = null;
- }
- }
-
- @Override
- public void populateDisplayData(DisplayData.Builder builder) {
- super.populateDisplayData(builder);
-
- builder
- .addIfNotNull(DisplayData.item("tempFilePrefix", tempFilePrefix)
- .withLabel("Temporary File Prefix"));
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/8581caf3/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java
new file mode 100644
index 0000000..4e6167b
--- /dev/null
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.beam.sdk.io.gcp.bigquery;
+
+import com.google.api.services.bigquery.model.TableRow;
+
+import java.util.Map;
+import java.util.UUID;
+
+import com.google.common.collect.Maps;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.display.DisplayData;
+import org.apache.beam.sdk.values.KV;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Writes each bundle of {@link TableRow} elements out to a separate file using
+ * {@link TableRowWriter}.
+ */
+class WriteBundlesToFiles extends DoFn<KV<TableDestination, TableRow>, WriteBundlesToFiles.Result> {
+ private static final Logger LOG = LoggerFactory.getLogger(WriteBundlesToFiles.class);
+
+ // Map from tablespec to a writer for that table.
+ private transient Map<TableDestination, TableRowWriter> writers;
+ private final String tempFilePrefix;
+
+ public static class Result {
+ public String filename;
+ public Long fileByteSize;
+ public TableDestination tableDestination;
+
+ public Result(String filename, Long fileByteSize, TableDestination tableDestination) {
+ this.filename = filename;
+ this.fileByteSize = fileByteSize;
+ this.tableDestination = tableDestination;
+ }
+ }
+ WriteBundlesToFiles(String tempFilePrefix) {
+ this.tempFilePrefix = tempFilePrefix;
+ this.writers = Maps.newHashMap();
+ }
+
+ @ProcessElement
+ public void processElement(ProcessContext c) throws Exception {
+ // Map.get() returns null for an absent key, so no Java 8 getOrDefault is needed here.
+ TableRowWriter writer = writers.get(c.element().getKey());
+ if (writer == null) {
+ writer = new TableRowWriter(tempFilePrefix);
+ writer.open(UUID.randomUUID().toString());
+ writers.put(c.element().getKey(), writer);
+ LOG.debug("Done opening writer {}", writer);
+ }
+ try {
+ writer.write(c.element().getValue());
+ } catch (Exception e) {
+ // Discard write result and close the write.
+ try {
+ writer.close();
+ // The writer does not need to be reset, as this DoFn cannot be reused.
+ } catch (Exception closeException) {
+ // Do not mask the exception that caused the write to fail.
+ e.addSuppressed(closeException);
+ }
+ throw e;
+ }
+ }
+
+ @FinishBundle
+ public void finishBundle(Context c) throws Exception {
+ for (Map.Entry<TableDestination, TableRowWriter> entry : writers.entrySet()) {
+ TableRowWriter.Result result = entry.getValue().close();
+ c.output(new Result(result.filename, result.byteSize, entry.getKey()));
+ }
+ writers.clear();
+ }
+
+ @Override
+ public void populateDisplayData(DisplayData.Builder builder) {
+ super.populateDisplayData(builder);
+
+ builder
+ .addIfNotNull(DisplayData.item("tempFilePrefix", tempFilePrefix)
+ .withLabel("Temporary File Prefix"));
+ }
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/8581caf3/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java
index 1b6492e..8e1b16d 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java
@@ -18,27 +18,40 @@
package org.apache.beam.sdk.io.gcp.bigquery;
+import com.google.api.services.bigquery.model.TableReference;
import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
import java.util.List;
+import java.util.Map;
import java.util.UUID;
+
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write;
+import org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result;
+import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.TupleTag;
/**
- * Partitions temporary files based on number of files and file sizes.
+ * Partitions temporary files based on number of files and file sizes. The output is keyed by a
+ * pair of table destination and partition number; the value is the list of files in that partition.
*/
-class WritePartition extends DoFn<String, KV<Long, List<String>>> {
- private final PCollectionView<Iterable<KV<String, Long>>> resultsView;
- private TupleTag<KV<Long, List<String>>> multiPartitionsTag;
- private TupleTag<KV<Long, List<String>>> singlePartitionTag;
+class WritePartition extends DoFn<String, KV<KV<TableDestination, Integer>, List<String>>> {
+ private final ValueProvider<TableReference> singletonOutputTable;
+ private final String singletonOutputTableDescription;
+ private final PCollectionView<Iterable<WriteBundlesToFiles.Result>> resultsView;
+ private TupleTag<KV<KV<TableDestination, Integer>, List<String>>> multiPartitionsTag;
+ private TupleTag<KV<KV<TableDestination, Integer>, List<String>>> singlePartitionTag;
public WritePartition(
- PCollectionView<Iterable<KV<String, Long>>> resultsView,
- TupleTag<KV<Long, List<String>>> multiPartitionsTag,
- TupleTag<KV<Long, List<String>>> singlePartitionTag) {
+ ValueProvider<TableReference> singletonOutputTable,
+ String singletonOutputTableDescription,
+ PCollectionView<Iterable<WriteBundlesToFiles.Result>> resultsView,
+ TupleTag<KV<KV<TableDestination, Integer>, List<String>>> multiPartitionsTag,
+ TupleTag<KV<KV<TableDestination, Integer>, List<String>>> singlePartitionTag) {
+ this.singletonOutputTable = singletonOutputTable;
+ this.singletonOutputTableDescription = singletonOutputTableDescription;
this.resultsView = resultsView;
this.multiPartitionsTag = multiPartitionsTag;
this.singlePartitionTag = singlePartitionTag;
@@ -46,34 +59,62 @@ class WritePartition extends DoFn<String, KV<Long, List<String>>> {
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
- List<KV<String, Long>> results = Lists.newArrayList(c.sideInput(resultsView));
- if (results.isEmpty()) {
- TableRowWriter writer = new TableRowWriter(c.element());
- writer.open(UUID.randomUUID().toString());
- results.add(writer.close());
+ List<WriteBundlesToFiles.Result> results = Lists.newArrayList(c.sideInput(resultsView));
+
+ // If there are no elements to write _and_ the user specified a constant output table, then
+ // generate an empty table of that name.
+ if (results.isEmpty() && singletonOutputTable != null) {
+ TableReference singletonTable = singletonOutputTable.get();
+ if (singletonTable != null) {
+ TableRowWriter writer = new TableRowWriter(c.element());
+ writer.open(UUID.randomUUID().toString());
+ TableRowWriter.Result writerResult = writer.close();
+ results.add(new Result(writerResult.filename, writerResult.byteSize,
+ new TableDestination(singletonTable, singletonOutputTableDescription)));
+ }
}
+
long partitionId = 0;
- int currNumFiles = 0;
- long currSizeBytes = 0;
- List<String> currResults = Lists.newArrayList();
+ Map<TableDestination, Integer> currNumFilesMap = Maps.newHashMap();
+ Map<TableDestination, Long> currSizeBytesMap = Maps.newHashMap();
+ Map<TableDestination, List<List<String>>> currResultsMap = Maps.newHashMap();
for (int i = 0; i < results.size(); ++i) {
- KV<String, Long> fileResult = results.get(i);
+ WriteBundlesToFiles.Result fileResult = results.get(i);
+ TableDestination tableDestination = fileResult.tableDestination;
+ // Map.get() returns null for an absent key.
+ List<List<String>> partitions = currResultsMap.get(tableDestination);
+ if (partitions == null) {
+ partitions = Lists.newArrayList();
+ partitions.add(Lists.<String>newArrayList());
+ currResultsMap.put(tableDestination, partitions);
+ }
+ int currNumFiles = currNumFilesMap.getOrDefault(tableDestination, 0);
+ long currSizeBytes = currSizeBytesMap.getOrDefault(tableDestination, 0L);
if (currNumFiles + 1 > Write.MAX_NUM_FILES
- || currSizeBytes + fileResult.getValue() > Write.MAX_SIZE_BYTES) {
- c.output(multiPartitionsTag, KV.of(++partitionId, currResults));
- currResults = Lists.newArrayList();
+ || currSizeBytes + fileResult.fileByteSize > Write.MAX_SIZE_BYTES) {
+ // Add a new partition for this table.
+ partitions.add(Lists.<String>newArrayList());
+ // c.sideOutput(multiPartitionsTag, KV.of(++partitionId, currResults));
currNumFiles = 0;
currSizeBytes = 0;
+ currNumFilesMap.remove(tableDestination);
+ currSizeBytesMap.remove(tableDestination);
}
- ++currNumFiles;
- currSizeBytes += fileResult.getValue();
- currResults.add(fileResult.getKey());
+ currNumFilesMap.put(tableDestination, currNumFiles + 1);
+ currSizeBytesMap.put(tableDestination, currSizeBytes + fileResult.fileByteSize);
+ // Always add to the most recent partition for this table.
+ partitions.get(partitions.size() - 1).add(fileResult.filename);
}
- if (partitionId == 0) {
- c.output(singlePartitionTag, KV.of(++partitionId, currResults));
- } else {
- c.output(multiPartitionsTag, KV.of(++partitionId, currResults));
+
+ for (Map.Entry<TableDestination, List<List<String>>> entry : currResultsMap.entrySet()) {
+ TableDestination tableDestination = entry.getKey();
+ List<List<String>> partitions = entry.getValue();
+ TupleTag<KV<KV<TableDestination, Integer>, List<String>>> outputTag =
+ (partitions.size() == 1) ? singlePartitionTag : multiPartitionsTag;
+ for (int i = 0; i < partitions.size(); ++i) {
+ c.output(outputTag, KV.of(KV.of(tableDestination, i + 1), partitions.get(i)));
+ }
}
}
}
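Stripped of the per-table bookkeeping, the partitioning above is a greedy packing: files are appended to the current partition until adding one more would exceed Write.MAX_NUM_FILES or Write.MAX_SIZE_BYTES, at which point a new partition is started. A self-contained sketch of just that rule (illustrative only, not code from this commit):

import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.values.KV;

static List<List<String>> greedilyPartition(
    List<KV<String, Long>> filesWithSizes, int maxFilesPerPartition, long maxBytesPerPartition) {
  List<List<String>> partitions = new ArrayList<>();
  List<String> current = new ArrayList<>();
  long currentBytes = 0;
  for (KV<String, Long> file : filesWithSizes) {
    // Start a new partition if this file would push the current one over either limit.
    if (!current.isEmpty()
        && (current.size() + 1 > maxFilesPerPartition
            || currentBytes + file.getValue() > maxBytesPerPartition)) {
      partitions.add(current);
      current = new ArrayList<>();
      currentBytes = 0;
    }
    current.add(file.getKey());
    currentBytes += file.getValue();
  }
  if (!current.isEmpty()) {
    partitions.add(current);
  }
  return partitions;
}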
http://git-wip-us.apache.org/repos/asf/beam/blob/8581caf3/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteRename.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteRename.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteRename.java
index 8cb9439..fbfb290 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteRename.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteRename.java
@@ -18,6 +18,7 @@
package org.apache.beam.sdk.io.gcp.bigquery;
+import com.google.common.collect.Maps;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfigurationTableCopy;
import com.google.api.services.bigquery.model.JobReference;
@@ -25,6 +26,7 @@ import com.google.api.services.bigquery.model.TableReference;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.util.List;
+import java.util.Map;
import javax.annotation.Nullable;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.Status;
@@ -49,24 +51,21 @@ class WriteRename extends DoFn<String, Void> {
private final BigQueryServices bqServices;
private final PCollectionView<String> jobIdToken;
- private final ValueProvider<String> jsonTableRef;
private final WriteDisposition writeDisposition;
private final CreateDisposition createDisposition;
- private final PCollectionView<Iterable<String>> tempTablesView;
+ private final PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView;
@Nullable
private final String tableDescription;
public WriteRename(
BigQueryServices bqServices,
PCollectionView<String> jobIdToken,
- ValueProvider<String> jsonTableRef,
WriteDisposition writeDisposition,
CreateDisposition createDisposition,
- PCollectionView<Iterable<String>> tempTablesView,
+ PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView,
@Nullable String tableDescription) {
this.bqServices = bqServices;
this.jobIdToken = jobIdToken;
- this.jsonTableRef = jsonTableRef;
this.writeDisposition = writeDisposition;
this.createDisposition = createDisposition;
this.tempTablesView = tempTablesView;
@@ -75,30 +74,40 @@ class WriteRename extends DoFn<String, Void> {
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
- List<String> tempTablesJson = Lists.newArrayList(c.sideInput(tempTablesView));
+ Map<TableDestination, Iterable<String>> tempTablesMap =
+ Maps.newHashMap(c.sideInput(tempTablesView));
- // Do not copy if no temp tables are provided
- if (tempTablesJson.size() == 0) {
- return;
- }
+ // Process each destination table.
+ for (Map.Entry<TableDestination, Iterable<String>> entry : tempTablesMap.entrySet()) {
+ TableDestination finalTableDestination = entry.getKey();
+ List<String> tempTablesJson = Lists.newArrayList(entry.getValue());
+ // Skip this destination if no temp tables are provided.
+ if (tempTablesJson.size() == 0) {
+ continue;
+ }
+
+ List<TableReference> tempTables = Lists.newArrayList();
+ for (String table : tempTablesJson) {
+ tempTables.add(BigQueryHelpers.fromJsonString(table, TableReference.class));
+ }
+
+ // Make sure each destination table gets a unique job id.
+ String jobIdPrefix = String.format(
+ c.sideInput(jobIdToken) + "0x%08x", finalTableDestination.hashCode());
+ copy(
+ bqServices.getJobService(c.getPipelineOptions().as(BigQueryOptions.class)),
+ bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class)),
+ jobIdPrefix,
+ finalTableDestination.getTableReference(),
+ tempTables,
+ writeDisposition,
+ createDisposition,
+ tableDescription);
- List<TableReference> tempTables = Lists.newArrayList();
- for (String table : tempTablesJson) {
- tempTables.add(BigQueryHelpers.fromJsonString(table, TableReference.class));
+ DatasetService tableService =
+ bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class));
+ removeTemporaryTables(tableService, tempTables);
}
- copy(
- bqServices.getJobService(c.getPipelineOptions().as(BigQueryOptions.class)),
- bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class)),
- c.sideInput(jobIdToken),
- BigQueryHelpers.fromJsonString(jsonTableRef.get(), TableReference.class),
- tempTables,
- writeDisposition,
- createDisposition,
- tableDescription);
-
- DatasetService tableService =
- bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class));
- removeTemporaryTables(tableService, tempTables);
}
private void copy(
@@ -170,8 +179,6 @@ class WriteRename extends DoFn<String, Void> {
super.populateDisplayData(builder);
builder
- .addIfNotNull(DisplayData.item("jsonTableRef", jsonTableRef)
- .withLabel("Table Reference"))
.add(DisplayData.item("writeDisposition", writeDisposition.toString())
.withLabel("Write Disposition"))
.add(DisplayData.item("createDisposition", createDisposition.toString())
http://git-wip-us.apache.org/repos/asf/beam/blob/8581caf3/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteTables.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteTables.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteTables.java
index 29680ad..5051c95 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteTables.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteTables.java
@@ -41,6 +41,7 @@ import org.apache.beam.sdk.options.BigQueryOptions;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.util.FileIOChannelFactory;
import org.apache.beam.sdk.util.GcsIOChannelFactory;
@@ -57,48 +58,45 @@ import org.slf4j.LoggerFactory;
/**
* Writes partitions to BigQuery tables.
*/
-class WriteTables extends DoFn<KV<Long, Iterable<List<String>>>, String> {
+class WriteTables extends DoFn<KV<KV<TableDestination, Integer>, Iterable<List<String>>>,
+ KV<TableDestination, String>> {
private static final Logger LOG = LoggerFactory.getLogger(WriteTables.class);
private final boolean singlePartition;
private final BigQueryServices bqServices;
private final PCollectionView<String> jobIdToken;
private final String tempFilePrefix;
- private final ValueProvider<String> jsonTableRef;
- private final ValueProvider<String> jsonSchema;
private final WriteDisposition writeDisposition;
private final CreateDisposition createDisposition;
- @Nullable
- private final String tableDescription;
+ private final SerializableFunction<TableDestination, TableSchema> schemaFunction;
public WriteTables(
boolean singlePartition,
BigQueryServices bqServices,
PCollectionView<String> jobIdToken,
String tempFilePrefix,
- ValueProvider<String> jsonTableRef,
- ValueProvider<String> jsonSchema,
WriteDisposition writeDisposition,
CreateDisposition createDisposition,
- @Nullable String tableDescription) {
+ SerializableFunction<TableDestination, TableSchema> schemaFunction) {
this.singlePartition = singlePartition;
this.bqServices = bqServices;
this.jobIdToken = jobIdToken;
this.tempFilePrefix = tempFilePrefix;
- this.jsonTableRef = jsonTableRef;
- this.jsonSchema = jsonSchema;
this.writeDisposition = writeDisposition;
this.createDisposition = createDisposition;
- this.tableDescription = tableDescription;
+ this.schemaFunction = schemaFunction;
}
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
- List<String> partition = Lists.newArrayList(c.element().getValue()).get(0);
+ TableDestination tableDestination = c.element().getKey().getKey();
+ Integer partition = c.element().getKey().getValue();
+ List<String> partitionFiles = Lists.newArrayList(c.element().getValue()).get(0);
+ // Job ID must be different for each partition of each table.
String jobIdPrefix = String.format(
- c.sideInput(jobIdToken) + "_%05d", c.element().getKey());
- TableReference ref = BigQueryHelpers.fromJsonString(jsonTableRef.get(),
- TableReference.class);
+ c.sideInput(jobIdToken) + "0x%08x_%05d", tableDestination.hashCode(), partition);
+
+ TableReference ref = tableDestination.getTableReference();
if (!singlePartition) {
ref.setTableId(jobIdPrefix);
}
@@ -108,15 +106,14 @@ class WriteTables extends DoFn<KV<Long, Iterable<List<String>>>, String> {
bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class)),
jobIdPrefix,
ref,
- BigQueryHelpers.fromJsonString(
- jsonSchema == null ? null : jsonSchema.get(), TableSchema.class),
- partition,
+ schemaFunction.apply(tableDestination),
+ partitionFiles,
writeDisposition,
createDisposition,
- tableDescription);
- c.output(BigQueryHelpers.toJsonString(ref));
+ tableDestination.getTableDescription());
+ c.output(KV.of(tableDestination, BigQueryHelpers.toJsonString(ref)));
- removeTemporaryFiles(c.getPipelineOptions(), tempFilePrefix, partition);
+ removeTemporaryFiles(c.getPipelineOptions(), tempFilePrefix, partitionFiles);
}
private void load(
@@ -202,12 +199,6 @@ class WriteTables extends DoFn<KV<Long, Iterable<List<String>>>, String> {
builder
.addIfNotNull(DisplayData.item("tempFilePrefix", tempFilePrefix)
- .withLabel("Temporary File Prefix"))
- .addIfNotNull(DisplayData.item("jsonTableRef", jsonTableRef)
- .withLabel("Table Reference"))
- .addIfNotNull(DisplayData.item("jsonSchema", jsonSchema)
- .withLabel("Table Schema"))
- .addIfNotNull(DisplayData.item("tableDescription", tableDescription)
- .withLabel("Table Description"));
+ .withLabel("Temporary File Prefix"));
}
}
http://git-wip-us.apache.org/repos/asf/beam/blob/8581caf3/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
index d953edd..af39483 100644
--- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
@@ -2078,26 +2078,27 @@ public class BigQueryIOTest implements Serializable {
files.add(KV.of(fileName, fileSize));
}
- TupleTag<KV<Long, List<String>>> multiPartitionsTag =
- new TupleTag<KV<Long, List<String>>>("multiPartitionsTag") {};
- TupleTag<KV<Long, List<String>>> singlePartitionTag =
- new TupleTag<KV<Long, List<String>>>("singlePartitionTag") {};
-
- PCollection<KV<String, Long>> filesPCollection =
- p.apply(Create.of(files).withType(new TypeDescriptor<KV<String, Long>>() {}));
- PCollectionView<Iterable<KV<String, Long>>> filesView = PCollectionViews.iterableView(
- filesPCollection,
+ TupleTag<KV<KV<TableDestination, Integer>, List<String>>> multiPartitionsTag =
+ new TupleTag<KV<KV<TableDestination, Integer>, List<String>>>("multiPartitionsTag") {};
+ TupleTag<KV<KV<TableDestination, Integer>, List<String>>> singlePartitionTag =
+ new TupleTag<KV<KV<TableDestination, Integer>, List<String>>>("singlePartitionTag") {};
+
+ PCollectionView<Iterable<WriteBundlesToFiles.Result>> resultsView =
+ PCollectionViews.iterableView(
+ p,
WindowingStrategy.globalDefault(),
KvCoder.of(StringUtf8Coder.of(), VarLongCoder.of()));
WritePartition writePartition =
- new WritePartition(filesView, multiPartitionsTag, singlePartitionTag);
+ new WritePartition(null, null, resultsView,
+ multiPartitionsTag, singlePartitionTag);
- DoFnTester<String, KV<Long, List<String>>> tester = DoFnTester.of(writePartition);
- tester.setSideInput(filesView, GlobalWindow.INSTANCE, files);
+ DoFnTester<String, KV<KV<TableDestination, Integer>, List<String>>> tester =
+ DoFnTester.of(writePartition);
+ tester.setSideInput(resultsView, GlobalWindow.INSTANCE, files);
tester.processElement(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
- List<KV<Long, List<String>>> partitions;
+ List<KV<KV<TableDestination, Integer>, List<String>>> partitions;
if (expectedNumPartitions > 1) {
partitions = tester.takeOutputElements(multiPartitionsTag);
} else {
[46/50] [abbrv] beam git commit: Cache result of
BigQuerySourceBase.split
Posted by dh...@apache.org.
Cache result of BigQuerySourceBase.split
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/1533e2b9
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/1533e2b9
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/1533e2b9
Branch: refs/heads/DSL_SQL
Commit: 1533e2b9bc49971929277b804587d93d8d2cae4c
Parents: 29e054a
Author: Eugene Kirpichov <ki...@google.com>
Authored: Wed Apr 19 10:09:42 2017 -0700
Committer: Eugene Kirpichov <ki...@google.com>
Committed: Wed Apr 19 11:39:21 2017 -0700
----------------------------------------------------------------------
.../sdk/io/gcp/bigquery/BigQuerySourceBase.java | 31 +++++++++++++-------
.../sdk/io/gcp/bigquery/BigQueryIOTest.java | 18 +++++-------
.../sdk/io/gcp/bigquery/FakeJobService.java | 9 ++++++
3 files changed, 37 insertions(+), 21 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/1533e2b9/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySourceBase.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySourceBase.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySourceBase.java
index 1b90dc3..4142da9 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySourceBase.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySourceBase.java
@@ -69,6 +69,8 @@ abstract class BigQuerySourceBase extends BoundedSource<TableRow> {
protected final BigQueryServices bqServices;
protected final ValueProvider<String> executingProject;
+ private List<BoundedSource<TableRow>> cachedSplitResult;
+
BigQuerySourceBase(
ValueProvider<String> jobIdToken,
String extractDestinationDir,
@@ -83,17 +85,24 @@ abstract class BigQuerySourceBase extends BoundedSource<TableRow> {
@Override
public List<BoundedSource<TableRow>> split(
long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
- BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
- TableReference tableToExtract = getTableToExtract(bqOptions);
- JobService jobService = bqServices.getJobService(bqOptions);
- String extractJobId = BigQueryIO.getExtractJobId(jobIdToken);
- List<String> tempFiles = executeExtract(extractJobId, tableToExtract, jobService);
-
- TableSchema tableSchema = bqServices.getDatasetService(bqOptions)
- .getTable(tableToExtract).getSchema();
-
- cleanupTempResource(bqOptions);
- return createSources(tempFiles, tableSchema);
+ // split() can be called multiple times, e.g. Dataflow runner may call it multiple times
+ // with different desiredBundleSizeBytes in case the split() call produces too many sources.
+ // We ignore desiredBundleSizeBytes anyway, however in any case, we should not initiate
+ // another BigQuery extract job for the repeated split() calls.
+ if (cachedSplitResult == null) {
+ BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
+ TableReference tableToExtract = getTableToExtract(bqOptions);
+ JobService jobService = bqServices.getJobService(bqOptions);
+ String extractJobId = BigQueryIO.getExtractJobId(jobIdToken);
+ List<String> tempFiles = executeExtract(extractJobId, tableToExtract, jobService);
+
+ TableSchema tableSchema = bqServices.getDatasetService(bqOptions)
+ .getTable(tableToExtract).getSchema();
+
+ cleanupTempResource(bqOptions);
+ cachedSplitResult = checkNotNull(createSources(tempFiles, tableSchema));
+ }
+ return cachedSplitResult;
}
protected abstract TableReference getTableToExtract(BigQueryOptions bqOptions) throws Exception;
http://git-wip-us.apache.org/repos/asf/beam/blob/1533e2b9/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
index d0004e4..62c5b5f 100644
--- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
@@ -28,7 +28,6 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
-
import com.google.api.client.util.Data;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobStatistics;
@@ -1230,17 +1229,10 @@ public class BigQueryIOTest implements Serializable {
@Test
public void testBigQueryTableSourceInitSplit() throws Exception {
- Job extractJob = new Job();
- JobStatistics jobStats = new JobStatistics();
- JobStatistics4 extractStats = new JobStatistics4();
- extractStats.setDestinationUriFileCounts(ImmutableList.of(1L));
- jobStats.setExtract(extractStats);
- extractJob.setStatus(new JobStatus())
- .setStatistics(jobStats);
-
FakeDatasetService fakeDatasetService = new FakeDatasetService();
+ FakeJobService fakeJobService = new FakeJobService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
- .withJobService(new FakeJobService())
+ .withJobService(fakeJobService)
.withDatasetService(fakeDatasetService);
List<TableRow> expected = ImmutableList.of(
@@ -1280,8 +1272,14 @@ public class BigQueryIOTest implements Serializable {
List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
assertEquals(2, sources.size());
+ // Simulate a repeated call to split(), like a Dataflow worker will sometimes do.
+ sources = bqSource.split(200, options);
+ assertEquals(2, sources.size());
BoundedSource<TableRow> actual = sources.get(0);
assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
+
+ // A repeated call to split() should not have caused a duplicate extract job.
+ assertEquals(1, fakeJobService.getNumExtractJobCalls());
}
@Test
http://git-wip-us.apache.org/repos/asf/beam/blob/1533e2b9/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeJobService.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeJobService.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeJobService.java
index a2454fb..cffd873 100644
--- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeJobService.java
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeJobService.java
@@ -95,6 +95,7 @@ class FakeJobService implements JobService, Serializable {
private static final com.google.common.collect.Table<String, String, JobInfo> allJobs =
HashBasedTable.create();
+ private static int numExtractJobCalls = 0;
private static final com.google.common.collect.Table<String, String, List<String>>
filesForLoadJobs = HashBasedTable.create();
@@ -136,6 +137,8 @@ class FakeJobService implements JobService, Serializable {
checkArgument(extractConfig.getDestinationFormat().equals("AVRO"),
"Only extract to AVRO is supported");
synchronized (allJobs) {
+ ++numExtractJobCalls;
+
Job job = new Job();
job.setJobReference(jobRef);
job.setConfiguration(new JobConfiguration().setExtract(extractConfig));
@@ -145,6 +148,12 @@ class FakeJobService implements JobService, Serializable {
}
}
+ public int getNumExtractJobCalls() {
+ synchronized (allJobs) {
+ return numExtractJobCalls;
+ }
+ }
+
@Override
public void startQueryJob(JobReference jobRef, JobConfigurationQuery query)
throws IOException, InterruptedException {
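The fake above keeps its call counter in static state, guarded by the same lock as the shared job table, so the count is visible across (de)serialized copies of the fake that workers may use. A stripped-down sketch of that pattern follows; CountingFakeService is a hypothetical name, not the actual FakeJobService.

import java.io.Serializable;

// Sketch: a serializable test fake whose call counter lives in static state,
// so copies produced by serialization still observe one shared count.
class CountingFakeService implements Serializable {
  private static final Object lock = new Object();
  private static int numStartJobCalls = 0;

  void startJob() {
    synchronized (lock) {
      ++numStartJobCalls;
      // ... record the job in shared static state ...
    }
  }

  int getNumStartJobCalls() {
    synchronized (lock) {
      return numStartJobCalls;
    }
  }
}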
[25/50] [abbrv] beam git commit: [BEAM-1994] Remove Flink examples package
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/DoFnOperatorTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/DoFnOperatorTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/DoFnOperatorTest.java
new file mode 100644
index 0000000..4c826d1
--- /dev/null
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/DoFnOperatorTest.java
@@ -0,0 +1,600 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.streaming;
+
+import static org.hamcrest.Matchers.emptyIterable;
+import static org.hamcrest.collection.IsIterableContainingInOrder.contains;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertThat;
+
+import com.google.common.base.Function;
+import com.google.common.base.Predicate;
+import com.google.common.collect.FluentIterable;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import java.util.Collections;
+import java.util.HashMap;
+import javax.annotation.Nullable;
+import org.apache.beam.runners.core.StatefulDoFnRunner;
+import org.apache.beam.runners.flink.FlinkPipelineOptions;
+import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.DoFnOperator;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.KvCoder;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.coders.VarIntCoder;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.testing.PCollectionViewTesting;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.join.RawUnionValue;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.FixedWindows;
+import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
+import org.apache.beam.sdk.transforms.windowing.PaneInfo;
+import org.apache.beam.sdk.util.TimeDomain;
+import org.apache.beam.sdk.util.Timer;
+import org.apache.beam.sdk.util.TimerSpec;
+import org.apache.beam.sdk.util.TimerSpecs;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
+import org.apache.beam.sdk.util.state.StateSpec;
+import org.apache.beam.sdk.util.state.StateSpecs;
+import org.apache.beam.sdk.util.state.ValueState;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.beam.sdk.values.TupleTag;
+import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
+import org.apache.flink.api.java.functions.KeySelector;
+import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
+import org.apache.flink.streaming.util.KeyedOneInputStreamOperatorTestHarness;
+import org.apache.flink.streaming.util.KeyedTwoInputStreamOperatorTestHarness;
+import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness;
+import org.apache.flink.streaming.util.TwoInputStreamOperatorTestHarness;
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/**
+ * Tests for {@link DoFnOperator}.
+ */
+@RunWith(JUnit4.class)
+public class DoFnOperatorTest {
+
+ // views and windows for testing side inputs
+ private static final long WINDOW_MSECS_1 = 100;
+ private static final long WINDOW_MSECS_2 = 500;
+
+ private WindowingStrategy<Object, IntervalWindow> windowingStrategy1 =
+ WindowingStrategy.of(FixedWindows.of(new Duration(WINDOW_MSECS_1)));
+
+ private PCollectionView<Iterable<String>> view1 =
+ PCollectionViewTesting.testingView(
+ new TupleTag<Iterable<WindowedValue<String>>>() {},
+ new PCollectionViewTesting.IdentityViewFn<String>(),
+ StringUtf8Coder.of(),
+ windowingStrategy1);
+
+ private WindowingStrategy<Object, IntervalWindow> windowingStrategy2 =
+ WindowingStrategy.of(FixedWindows.of(new Duration(WINDOW_MSECS_2)));
+
+ private PCollectionView<Iterable<String>> view2 =
+ PCollectionViewTesting.testingView(
+ new TupleTag<Iterable<WindowedValue<String>>>() {},
+ new PCollectionViewTesting.IdentityViewFn<String>(),
+ StringUtf8Coder.of(),
+ windowingStrategy2);
+
+ @Test
+ @SuppressWarnings("unchecked")
+ public void testSingleOutput() throws Exception {
+
+ WindowedValue.ValueOnlyWindowedValueCoder<String> windowedValueCoder =
+ WindowedValue.getValueOnlyCoder(StringUtf8Coder.of());
+
+ TupleTag<String> outputTag = new TupleTag<>("main-output");
+
+ DoFnOperator<String, String, String> doFnOperator = new DoFnOperator<>(
+ new IdentityDoFn<String>(),
+ windowedValueCoder,
+ outputTag,
+ Collections.<TupleTag<?>>emptyList(),
+ new DoFnOperator.DefaultOutputManagerFactory(),
+ WindowingStrategy.globalDefault(),
+ new HashMap<Integer, PCollectionView<?>>(), /* side-input mapping */
+ Collections.<PCollectionView<?>>emptyList(), /* side inputs */
+ PipelineOptionsFactory.as(FlinkPipelineOptions.class),
+ null);
+
+ OneInputStreamOperatorTestHarness<WindowedValue<String>, String> testHarness =
+ new OneInputStreamOperatorTestHarness<>(doFnOperator);
+
+ testHarness.open();
+
+ testHarness.processElement(new StreamRecord<>(WindowedValue.valueInGlobalWindow("Hello")));
+
+ assertThat(
+ this.<String>stripStreamRecordFromWindowedValue(testHarness.getOutput()),
+ contains(WindowedValue.valueInGlobalWindow("Hello")));
+
+ testHarness.close();
+ }
+
+ @Test
+ @SuppressWarnings("unchecked")
+ public void testMultiOutputOutput() throws Exception {
+
+ WindowedValue.ValueOnlyWindowedValueCoder<String> windowedValueCoder =
+ WindowedValue.getValueOnlyCoder(StringUtf8Coder.of());
+
+ TupleTag<String> mainOutput = new TupleTag<>("main-output");
+ TupleTag<String> additionalOutput1 = new TupleTag<>("output-1");
+ TupleTag<String> additionalOutput2 = new TupleTag<>("output-2");
+ ImmutableMap<TupleTag<?>, Integer> outputMapping = ImmutableMap.<TupleTag<?>, Integer>builder()
+ .put(mainOutput, 1)
+ .put(additionalOutput1, 2)
+ .put(additionalOutput2, 3)
+ .build();
+
+ DoFnOperator<String, String, RawUnionValue> doFnOperator = new DoFnOperator<>(
+ new MultiOutputDoFn(additionalOutput1, additionalOutput2),
+ windowedValueCoder,
+ mainOutput,
+ ImmutableList.<TupleTag<?>>of(additionalOutput1, additionalOutput2),
+ new DoFnOperator.MultiOutputOutputManagerFactory(outputMapping),
+ WindowingStrategy.globalDefault(),
+ new HashMap<Integer, PCollectionView<?>>(), /* side-input mapping */
+ Collections.<PCollectionView<?>>emptyList(), /* side inputs */
+ PipelineOptionsFactory.as(FlinkPipelineOptions.class),
+ null);
+
+ OneInputStreamOperatorTestHarness<WindowedValue<String>, RawUnionValue> testHarness =
+ new OneInputStreamOperatorTestHarness<>(doFnOperator);
+
+ testHarness.open();
+
+ testHarness.processElement(new StreamRecord<>(WindowedValue.valueInGlobalWindow("one")));
+ testHarness.processElement(new StreamRecord<>(WindowedValue.valueInGlobalWindow("two")));
+ testHarness.processElement(new StreamRecord<>(WindowedValue.valueInGlobalWindow("hello")));
+
+ assertThat(
+ this.stripStreamRecordFromRawUnion(testHarness.getOutput()),
+ contains(
+ new RawUnionValue(2, WindowedValue.valueInGlobalWindow("extra: one")),
+ new RawUnionValue(3, WindowedValue.valueInGlobalWindow("extra: two")),
+ new RawUnionValue(1, WindowedValue.valueInGlobalWindow("got: hello")),
+ new RawUnionValue(2, WindowedValue.valueInGlobalWindow("got: hello")),
+ new RawUnionValue(3, WindowedValue.valueInGlobalWindow("got: hello"))));
+
+ testHarness.close();
+ }
+
+ @Test
+ public void testLateDroppingForStatefulFn() throws Exception {
+
+ WindowingStrategy<Object, IntervalWindow> windowingStrategy =
+ WindowingStrategy.of(FixedWindows.of(new Duration(10)));
+
+ DoFn<Integer, String> fn = new DoFn<Integer, String>() {
+
+ @StateId("state")
+ private final StateSpec<Object, ValueState<String>> stateSpec =
+ StateSpecs.value(StringUtf8Coder.of());
+
+ @ProcessElement
+ public void processElement(ProcessContext context) {
+ context.output(context.element().toString());
+ }
+ };
+
+ WindowedValue.FullWindowedValueCoder<Integer> windowedValueCoder =
+ WindowedValue.getFullCoder(
+ VarIntCoder.of(),
+ windowingStrategy.getWindowFn().windowCoder());
+
+ TupleTag<String> outputTag = new TupleTag<>("main-output");
+
+ DoFnOperator<Integer, String, WindowedValue<String>> doFnOperator = new DoFnOperator<>(
+ fn,
+ windowedValueCoder,
+ outputTag,
+ Collections.<TupleTag<?>>emptyList(),
+ new DoFnOperator.DefaultOutputManagerFactory<WindowedValue<String>>(),
+ windowingStrategy,
+ new HashMap<Integer, PCollectionView<?>>(), /* side-input mapping */
+ Collections.<PCollectionView<?>>emptyList(), /* side inputs */
+ PipelineOptionsFactory.as(FlinkPipelineOptions.class),
+ VarIntCoder.of() /* key coder */);
+
+ OneInputStreamOperatorTestHarness<WindowedValue<Integer>, WindowedValue<String>> testHarness =
+ new KeyedOneInputStreamOperatorTestHarness<>(
+ doFnOperator,
+ new KeySelector<WindowedValue<Integer>, Integer>() {
+ @Override
+ public Integer getKey(WindowedValue<Integer> integerWindowedValue) throws Exception {
+ return integerWindowedValue.getValue();
+ }
+ },
+ new CoderTypeInformation<>(VarIntCoder.of()));
+
+ testHarness.open();
+
+ testHarness.processWatermark(0);
+
+ IntervalWindow window1 = new IntervalWindow(new Instant(0), Duration.millis(10));
+
+ // this should not be late
+ testHarness.processElement(
+ new StreamRecord<>(WindowedValue.of(13, new Instant(0), window1, PaneInfo.NO_FIRING)));
+
+ assertThat(
+ this.<String>stripStreamRecordFromWindowedValue(testHarness.getOutput()),
+ contains(WindowedValue.of("13", new Instant(0), window1, PaneInfo.NO_FIRING)));
+
+ testHarness.getOutput().clear();
+
+ testHarness.processWatermark(9);
+
+ // this should still not be considered late
+ testHarness.processElement(
+ new StreamRecord<>(WindowedValue.of(17, new Instant(0), window1, PaneInfo.NO_FIRING)));
+
+ assertThat(
+ this.<String>stripStreamRecordFromWindowedValue(testHarness.getOutput()),
+ contains(WindowedValue.of("17", new Instant(0), window1, PaneInfo.NO_FIRING)));
+
+ testHarness.getOutput().clear();
+
+ testHarness.processWatermark(10);
+
+ // this should now be considered late
+ testHarness.processElement(
+ new StreamRecord<>(WindowedValue.of(17, new Instant(0), window1, PaneInfo.NO_FIRING)));
+
+ assertThat(
+ this.<String>stripStreamRecordFromWindowedValue(testHarness.getOutput()),
+ emptyIterable());
+
+ testHarness.close();
+ }
+
+ @Test
+ public void testStateGCForStatefulFn() throws Exception {
+
+ WindowingStrategy<Object, IntervalWindow> windowingStrategy =
+ WindowingStrategy.of(FixedWindows.of(new Duration(10))).withAllowedLateness(Duration.ZERO);
+
+ final String timerId = "boo";
+ final String stateId = "dazzle";
+
+ final int offset = 5000;
+ final int timerOutput = 4093;
+
+ DoFn<KV<String, Integer>, KV<String, Integer>> fn =
+ new DoFn<KV<String, Integer>, KV<String, Integer>>() {
+
+ @TimerId(timerId)
+ private final TimerSpec spec = TimerSpecs.timer(TimeDomain.EVENT_TIME);
+
+ @StateId(stateId)
+ private final StateSpec<Object, ValueState<String>> stateSpec =
+ StateSpecs.value(StringUtf8Coder.of());
+
+ @ProcessElement
+ public void processElement(
+ ProcessContext context,
+ @TimerId(timerId) Timer timer,
+ @StateId(stateId) ValueState<String> state,
+ BoundedWindow window) {
+ timer.set(window.maxTimestamp());
+ state.write(context.element().getKey());
+ context.output(
+ KV.of(context.element().getKey(), context.element().getValue() + offset));
+ }
+
+ @OnTimer(timerId)
+ public void onTimer(OnTimerContext context, @StateId(stateId) ValueState<String> state) {
+ context.output(KV.of(state.read(), timerOutput));
+ }
+ };
+
+ WindowedValue.FullWindowedValueCoder<KV<String, Integer>> windowedValueCoder =
+ WindowedValue.getFullCoder(
+ KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()),
+ windowingStrategy.getWindowFn().windowCoder());
+
+ TupleTag<KV<String, Integer>> outputTag = new TupleTag<>("main-output");
+
+ DoFnOperator<
+ KV<String, Integer>, KV<String, Integer>, WindowedValue<KV<String, Integer>>> doFnOperator =
+ new DoFnOperator<>(
+ fn,
+ windowedValueCoder,
+ outputTag,
+ Collections.<TupleTag<?>>emptyList(),
+ new DoFnOperator.DefaultOutputManagerFactory<WindowedValue<KV<String, Integer>>>(),
+ windowingStrategy,
+ new HashMap<Integer, PCollectionView<?>>(), /* side-input mapping */
+ Collections.<PCollectionView<?>>emptyList(), /* side inputs */
+ PipelineOptionsFactory.as(FlinkPipelineOptions.class),
+ StringUtf8Coder.of() /* key coder */);
+
+ KeyedOneInputStreamOperatorTestHarness<
+ String,
+ WindowedValue<KV<String, Integer>>,
+ WindowedValue<KV<String, Integer>>> testHarness =
+ new KeyedOneInputStreamOperatorTestHarness<>(
+ doFnOperator,
+ new KeySelector<WindowedValue<KV<String, Integer>>, String>() {
+ @Override
+ public String getKey(
+ WindowedValue<KV<String, Integer>> kvWindowedValue) throws Exception {
+ return kvWindowedValue.getValue().getKey();
+ }
+ },
+ new CoderTypeInformation<>(StringUtf8Coder.of()));
+
+ testHarness.open();
+
+ testHarness.processWatermark(0);
+
+ assertEquals(0, testHarness.numKeyedStateEntries());
+
+ IntervalWindow window1 = new IntervalWindow(new Instant(0), Duration.millis(10));
+
+ testHarness.processElement(
+ new StreamRecord<>(
+ WindowedValue.of(KV.of("key1", 5), new Instant(1), window1, PaneInfo.NO_FIRING)));
+
+ testHarness.processElement(
+ new StreamRecord<>(
+ WindowedValue.of(KV.of("key2", 7), new Instant(3), window1, PaneInfo.NO_FIRING)));
+
+ assertThat(
+ this.<KV<String, Integer>>stripStreamRecordFromWindowedValue(testHarness.getOutput()),
+ contains(
+ WindowedValue.of(
+ KV.of("key1", 5 + offset), new Instant(1), window1, PaneInfo.NO_FIRING),
+ WindowedValue.of(
+ KV.of("key2", 7 + offset), new Instant(3), window1, PaneInfo.NO_FIRING)));
+
+ assertEquals(2, testHarness.numKeyedStateEntries());
+
+ testHarness.getOutput().clear();
+
+ // this should trigger both the window.maxTimestamp() timer and the GC timer
+ // this tests that the GC timer fires after the user timer
+ testHarness.processWatermark(
+ window1.maxTimestamp()
+ .plus(windowingStrategy.getAllowedLateness())
+ .plus(StatefulDoFnRunner.TimeInternalsCleanupTimer.GC_DELAY_MS)
+ .getMillis());
+
+ assertThat(
+ this.<KV<String, Integer>>stripStreamRecordFromWindowedValue(testHarness.getOutput()),
+ contains(
+ WindowedValue.of(
+ KV.of("key1", timerOutput), new Instant(9), window1, PaneInfo.NO_FIRING),
+ WindowedValue.of(
+ KV.of("key2", timerOutput), new Instant(9), window1, PaneInfo.NO_FIRING)));
+
+ // ensure the state was garbage collected
+ assertEquals(0, testHarness.numKeyedStateEntries());
+
+ testHarness.close();
+ }
+
+ public void testSideInputs(boolean keyed) throws Exception {
+
+ WindowedValue.ValueOnlyWindowedValueCoder<String> windowedValueCoder =
+ WindowedValue.getValueOnlyCoder(StringUtf8Coder.of());
+
+ TupleTag<String> outputTag = new TupleTag<>("main-output");
+
+ ImmutableMap<Integer, PCollectionView<?>> sideInputMapping =
+ ImmutableMap.<Integer, PCollectionView<?>>builder()
+ .put(1, view1)
+ .put(2, view2)
+ .build();
+
+ Coder<String> keyCoder = null;
+ if (keyed) {
+ keyCoder = StringUtf8Coder.of();
+ }
+
+ DoFnOperator<String, String, String> doFnOperator = new DoFnOperator<>(
+ new IdentityDoFn<String>(),
+ windowedValueCoder,
+ outputTag,
+ Collections.<TupleTag<?>>emptyList(),
+ new DoFnOperator.DefaultOutputManagerFactory<String>(),
+ WindowingStrategy.globalDefault(),
+ sideInputMapping, /* side-input mapping */
+ ImmutableList.<PCollectionView<?>>of(view1, view2), /* side inputs */
+ PipelineOptionsFactory.as(FlinkPipelineOptions.class),
+ keyCoder);
+
+ TwoInputStreamOperatorTestHarness<WindowedValue<String>, RawUnionValue, String> testHarness =
+ new TwoInputStreamOperatorTestHarness<>(doFnOperator);
+
+ if (keyed) {
+ // we use a dummy key for the second input since it is considered to be broadcast
+ testHarness = new KeyedTwoInputStreamOperatorTestHarness<>(
+ doFnOperator,
+ new StringKeySelector(),
+ new DummyKeySelector(),
+ BasicTypeInfo.STRING_TYPE_INFO);
+ }
+
+ testHarness.open();
+
+ IntervalWindow firstWindow = new IntervalWindow(new Instant(0), new Instant(100));
+ IntervalWindow secondWindow = new IntervalWindow(new Instant(0), new Instant(500));
+
+ // test that incoming side-input events are kept
+ testHarness.processElement2(
+ new StreamRecord<>(
+ new RawUnionValue(
+ 1,
+ valuesInWindow(ImmutableList.of("hello", "ciao"), new Instant(0), firstWindow))));
+ testHarness.processElement2(
+ new StreamRecord<>(
+ new RawUnionValue(
+ 2,
+ valuesInWindow(ImmutableList.of("foo", "bar"), new Instant(0), secondWindow))));
+
+ // push in some regular elements
+ WindowedValue<String> helloElement = valueInWindow("Hello", new Instant(0), firstWindow);
+ WindowedValue<String> worldElement = valueInWindow("World", new Instant(1000), firstWindow);
+ testHarness.processElement1(new StreamRecord<>(helloElement));
+ testHarness.processElement1(new StreamRecord<>(worldElement));
+
+ // test that pushed-back events are kept
+ testHarness.processElement2(
+ new StreamRecord<>(
+ new RawUnionValue(
+ 1,
+ valuesInWindow(ImmutableList.of("hello", "ciao"),
+ new Instant(1000), firstWindow))));
+ testHarness.processElement2(
+ new StreamRecord<>(
+ new RawUnionValue(
+ 2,
+ valuesInWindow(ImmutableList.of("foo", "bar"), new Instant(1000), secondWindow))));
+
+ assertThat(
+ this.<String>stripStreamRecordFromWindowedValue(testHarness.getOutput()),
+ contains(helloElement, worldElement));
+
+ testHarness.close();
+
+ }
+
+ /**
+ * {@link TwoInputStreamOperatorTestHarness} supports OperatorStateBackend
+ * but does not support KeyedStateBackend, so we only test side inputs of a normal ParDo here.
+ */
+ @Test
+ @SuppressWarnings("unchecked")
+ public void testNormalParDoSideInputs() throws Exception {
+ testSideInputs(false);
+ }
+
+ @Test
+ public void testKeyedSideInputs() throws Exception {
+ testSideInputs(true);
+ }
+
+ private <T> Iterable<WindowedValue<T>> stripStreamRecordFromWindowedValue(
+ Iterable<Object> input) {
+
+ return FluentIterable.from(input).filter(new Predicate<Object>() {
+ @Override
+ public boolean apply(@Nullable Object o) {
+ return o instanceof StreamRecord && ((StreamRecord) o).getValue() instanceof WindowedValue;
+ }
+ }).transform(new Function<Object, WindowedValue<T>>() {
+ @Nullable
+ @Override
+ @SuppressWarnings({"unchecked", "rawtypes"})
+ public WindowedValue<T> apply(@Nullable Object o) {
+ if (o instanceof StreamRecord && ((StreamRecord) o).getValue() instanceof WindowedValue) {
+ return (WindowedValue) ((StreamRecord) o).getValue();
+ }
+ throw new RuntimeException("unreachable");
+ }
+ });
+ }
+
+ private Iterable<RawUnionValue> stripStreamRecordFromRawUnion(Iterable<Object> input) {
+ return FluentIterable.from(input).filter(new Predicate<Object>() {
+ @Override
+ public boolean apply(@Nullable Object o) {
+ return o instanceof StreamRecord && ((StreamRecord) o).getValue() instanceof RawUnionValue;
+ }
+ }).transform(new Function<Object, RawUnionValue>() {
+ @Nullable
+ @Override
+ @SuppressWarnings({"unchecked", "rawtypes"})
+ public RawUnionValue apply(@Nullable Object o) {
+ if (o instanceof StreamRecord && ((StreamRecord) o).getValue() instanceof RawUnionValue) {
+ return (RawUnionValue) ((StreamRecord) o).getValue();
+ }
+ throw new RuntimeException("unreachable");
+ }
+ });
+ }
+
+ private static class MultiOutputDoFn extends DoFn<String, String> {
+ private TupleTag<String> additionalOutput1;
+ private TupleTag<String> additionalOutput2;
+
+ public MultiOutputDoFn(TupleTag<String> additionalOutput1, TupleTag<String> additionalOutput2) {
+ this.additionalOutput1 = additionalOutput1;
+ this.additionalOutput2 = additionalOutput2;
+ }
+
+ @ProcessElement
+ public void processElement(ProcessContext c) throws Exception {
+ if (c.element().equals("one")) {
+ c.output(additionalOutput1, "extra: one");
+ } else if (c.element().equals("two")) {
+ c.output(additionalOutput2, "extra: two");
+ } else {
+ c.output("got: " + c.element());
+ c.output(additionalOutput1, "got: " + c.element());
+ c.output(additionalOutput2, "got: " + c.element());
+ }
+ }
+ }
+
+ private static class IdentityDoFn<T> extends DoFn<T, T> {
+ @ProcessElement
+ public void processElement(ProcessContext c) throws Exception {
+ c.output(c.element());
+ }
+ }
+
+ @SuppressWarnings({"unchecked", "rawtypes"})
+ private WindowedValue<Iterable<?>> valuesInWindow(
+ Iterable<?> values, Instant timestamp, BoundedWindow window) {
+ return (WindowedValue) WindowedValue.of(values, timestamp, window, PaneInfo.NO_FIRING);
+ }
+
+ @SuppressWarnings({"unchecked", "rawtypes"})
+ private <T> WindowedValue<T> valueInWindow(
+ T value, Instant timestamp, BoundedWindow window) {
+ return WindowedValue.of(value, timestamp, window, PaneInfo.NO_FIRING);
+ }
+
+
+ private static class DummyKeySelector implements KeySelector<RawUnionValue, String> {
+ @Override
+ public String getKey(RawUnionValue stringWindowedValue) throws Exception {
+ return "dummy_key";
+ }
+ }
+
+ private static class StringKeySelector implements KeySelector<WindowedValue<String>, String> {
+ @Override
+ public String getKey(WindowedValue<String> stringWindowedValue) throws Exception {
+ return stringWindowedValue.getValue();
+ }
+ }
+}
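In testStateGCForStatefulFn above, the watermark passed to processWatermark(...) is the earliest point at which state for window1 may be garbage-collected: the window's maximum timestamp, plus the allowed lateness, plus the runner's cleanup delay. A small worked sketch of that computation, using the same windowing strategy as the test; the GC_DELAY_MS constant is the one referenced above, and its concrete value is not assumed here.

import org.apache.beam.runners.core.StatefulDoFnRunner;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
import org.apache.beam.sdk.util.WindowingStrategy;
import org.joda.time.Duration;
import org.joda.time.Instant;

// Sketch: compute the watermark at which stateful-DoFn state for a window
// becomes garbage-collectible, mirroring the processWatermark(...) call above.
class GcWatermarkSketch {
  static long gcWatermarkMillis(IntervalWindow window, WindowingStrategy<?, ?> strategy) {
    return window.maxTimestamp()                                          // end of window minus 1ms
        .plus(strategy.getAllowedLateness())                              // ZERO in the test
        .plus(StatefulDoFnRunner.TimeInternalsCleanupTimer.GC_DELAY_MS)   // runner cleanup delay
        .getMillis();
  }

  public static void main(String[] args) {
    WindowingStrategy<Object, IntervalWindow> strategy =
        WindowingStrategy.of(FixedWindows.of(new Duration(10))).withAllowedLateness(Duration.ZERO);
    IntervalWindow window = new IntervalWindow(new Instant(0), Duration.millis(10));
    System.out.println(gcWatermarkMillis(window, strategy));
  }
}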
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/FlinkBroadcastStateInternalsTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/FlinkBroadcastStateInternalsTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/FlinkBroadcastStateInternalsTest.java
new file mode 100644
index 0000000..7e7d1e1
--- /dev/null
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/FlinkBroadcastStateInternalsTest.java
@@ -0,0 +1,245 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.streaming;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertThat;
+
+import java.util.Arrays;
+import org.apache.beam.runners.core.StateMerging;
+import org.apache.beam.runners.core.StateNamespace;
+import org.apache.beam.runners.core.StateNamespaceForTest;
+import org.apache.beam.runners.core.StateTag;
+import org.apache.beam.runners.core.StateTags;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkBroadcastStateInternals;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.coders.VarIntCoder;
+import org.apache.beam.sdk.transforms.Sum;
+import org.apache.beam.sdk.util.state.BagState;
+import org.apache.beam.sdk.util.state.CombiningState;
+import org.apache.beam.sdk.util.state.GroupingState;
+import org.apache.beam.sdk.util.state.ReadableState;
+import org.apache.beam.sdk.util.state.ValueState;
+import org.apache.flink.runtime.operators.testutils.DummyEnvironment;
+import org.apache.flink.runtime.state.OperatorStateBackend;
+import org.apache.flink.runtime.state.memory.MemoryStateBackend;
+import org.hamcrest.Matchers;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/**
+ * Tests for {@link FlinkBroadcastStateInternals}. This is based on the tests for
+ * {@code InMemoryStateInternals}.
+ */
+@RunWith(JUnit4.class)
+public class FlinkBroadcastStateInternalsTest {
+ private static final StateNamespace NAMESPACE_1 = new StateNamespaceForTest("ns1");
+ private static final StateNamespace NAMESPACE_2 = new StateNamespaceForTest("ns2");
+ private static final StateNamespace NAMESPACE_3 = new StateNamespaceForTest("ns3");
+
+ private static final StateTag<Object, ValueState<String>> STRING_VALUE_ADDR =
+ StateTags.value("stringValue", StringUtf8Coder.of());
+ private static final StateTag<Object, CombiningState<Integer, int[], Integer>>
+ SUM_INTEGER_ADDR = StateTags.combiningValueFromInputInternal(
+ "sumInteger", VarIntCoder.of(), Sum.ofIntegers());
+ private static final StateTag<Object, BagState<String>> STRING_BAG_ADDR =
+ StateTags.bag("stringBag", StringUtf8Coder.of());
+
+ FlinkBroadcastStateInternals<String> underTest;
+
+ @Before
+ public void initStateInternals() {
+ MemoryStateBackend backend = new MemoryStateBackend();
+ try {
+ OperatorStateBackend operatorStateBackend =
+ backend.createOperatorStateBackend(new DummyEnvironment("test", 1, 0), "");
+ underTest = new FlinkBroadcastStateInternals<>(1, operatorStateBackend);
+
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Test
+ public void testValue() throws Exception {
+ ValueState<String> value = underTest.state(NAMESPACE_1, STRING_VALUE_ADDR);
+
+ assertEquals(underTest.state(NAMESPACE_1, STRING_VALUE_ADDR), value);
+ assertNotEquals(
+ underTest.state(NAMESPACE_2, STRING_VALUE_ADDR),
+ value);
+
+ assertThat(value.read(), Matchers.nullValue());
+ value.write("hello");
+ assertThat(value.read(), Matchers.equalTo("hello"));
+ value.write("world");
+ assertThat(value.read(), Matchers.equalTo("world"));
+
+ value.clear();
+ assertThat(value.read(), Matchers.nullValue());
+ assertEquals(underTest.state(NAMESPACE_1, STRING_VALUE_ADDR), value);
+
+ }
+
+ @Test
+ public void testBag() throws Exception {
+ BagState<String> value = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
+
+ assertEquals(value, underTest.state(NAMESPACE_1, STRING_BAG_ADDR));
+ assertFalse(value.equals(underTest.state(NAMESPACE_2, STRING_BAG_ADDR)));
+
+ assertThat(value.read(), Matchers.emptyIterable());
+ value.add("hello");
+ assertThat(value.read(), Matchers.containsInAnyOrder("hello"));
+
+ value.add("world");
+ assertThat(value.read(), Matchers.containsInAnyOrder("hello", "world"));
+
+ value.clear();
+ assertThat(value.read(), Matchers.emptyIterable());
+ assertEquals(underTest.state(NAMESPACE_1, STRING_BAG_ADDR), value);
+
+ }
+
+ @Test
+ public void testBagIsEmpty() throws Exception {
+ BagState<String> value = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
+
+ assertThat(value.isEmpty().read(), Matchers.is(true));
+ ReadableState<Boolean> readFuture = value.isEmpty();
+ value.add("hello");
+ assertThat(readFuture.read(), Matchers.is(false));
+
+ value.clear();
+ assertThat(readFuture.read(), Matchers.is(true));
+ }
+
+ @Test
+ public void testMergeBagIntoSource() throws Exception {
+ BagState<String> bag1 = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
+ BagState<String> bag2 = underTest.state(NAMESPACE_2, STRING_BAG_ADDR);
+
+ bag1.add("Hello");
+ bag2.add("World");
+ bag1.add("!");
+
+ StateMerging.mergeBags(Arrays.asList(bag1, bag2), bag1);
+
+ // Reading the merged bag returns the contents of both bags
+ assertThat(bag1.read(), Matchers.containsInAnyOrder("Hello", "World", "!"));
+ assertThat(bag2.read(), Matchers.emptyIterable());
+ }
+
+ @Test
+ public void testMergeBagIntoNewNamespace() throws Exception {
+ BagState<String> bag1 = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
+ BagState<String> bag2 = underTest.state(NAMESPACE_2, STRING_BAG_ADDR);
+ BagState<String> bag3 = underTest.state(NAMESPACE_3, STRING_BAG_ADDR);
+
+ bag1.add("Hello");
+ bag2.add("World");
+ bag1.add("!");
+
+ StateMerging.mergeBags(Arrays.asList(bag1, bag2, bag3), bag3);
+
+ // Reading the merged bag returns the contents of both bags
+ assertThat(bag3.read(), Matchers.containsInAnyOrder("Hello", "World", "!"));
+ assertThat(bag1.read(), Matchers.emptyIterable());
+ assertThat(bag2.read(), Matchers.emptyIterable());
+ }
+
+ @Test
+ public void testCombiningValue() throws Exception {
+ GroupingState<Integer, Integer> value = underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR);
+
+ // State instances are cached, but depend on the namespace.
+ assertEquals(value, underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR));
+ assertFalse(value.equals(underTest.state(NAMESPACE_2, SUM_INTEGER_ADDR)));
+
+ assertThat(value.read(), Matchers.equalTo(0));
+ value.add(2);
+ assertThat(value.read(), Matchers.equalTo(2));
+
+ value.add(3);
+ assertThat(value.read(), Matchers.equalTo(5));
+
+ value.clear();
+ assertThat(value.read(), Matchers.equalTo(0));
+ assertEquals(underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR), value);
+ }
+
+ @Test
+ public void testCombiningIsEmpty() throws Exception {
+ GroupingState<Integer, Integer> value = underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR);
+
+ assertThat(value.isEmpty().read(), Matchers.is(true));
+ ReadableState<Boolean> readFuture = value.isEmpty();
+ value.add(5);
+ assertThat(readFuture.read(), Matchers.is(false));
+
+ value.clear();
+ assertThat(readFuture.read(), Matchers.is(true));
+ }
+
+ @Test
+ public void testMergeCombiningValueIntoSource() throws Exception {
+ CombiningState<Integer, int[], Integer> value1 =
+ underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR);
+ CombiningState<Integer, int[], Integer> value2 =
+ underTest.state(NAMESPACE_2, SUM_INTEGER_ADDR);
+
+ value1.add(5);
+ value2.add(10);
+ value1.add(6);
+
+ assertThat(value1.read(), Matchers.equalTo(11));
+ assertThat(value2.read(), Matchers.equalTo(10));
+
+ // Merging clears the old values and updates the result value.
+ StateMerging.mergeCombiningValues(Arrays.asList(value1, value2), value1);
+
+ assertThat(value1.read(), Matchers.equalTo(21));
+ assertThat(value2.read(), Matchers.equalTo(0));
+ }
+
+ @Test
+ public void testMergeCombiningValueIntoNewNamespace() throws Exception {
+ CombiningState<Integer, int[], Integer> value1 =
+ underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR);
+ CombiningState<Integer, int[], Integer> value2 =
+ underTest.state(NAMESPACE_2, SUM_INTEGER_ADDR);
+ CombiningState<Integer, int[], Integer> value3 =
+ underTest.state(NAMESPACE_3, SUM_INTEGER_ADDR);
+
+ value1.add(5);
+ value2.add(10);
+ value1.add(6);
+
+ StateMerging.mergeCombiningValues(Arrays.asList(value1, value2), value3);
+
+ // Merging clears the old values and updates the result value.
+ assertThat(value1.read(), Matchers.equalTo(0));
+ assertThat(value2.read(), Matchers.equalTo(0));
+ assertThat(value3.read(), Matchers.equalTo(21));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/FlinkKeyGroupStateInternalsTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/FlinkKeyGroupStateInternalsTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/FlinkKeyGroupStateInternalsTest.java
new file mode 100644
index 0000000..5433d07
--- /dev/null
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/FlinkKeyGroupStateInternalsTest.java
@@ -0,0 +1,262 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.streaming;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertThat;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import org.apache.beam.runners.core.StateMerging;
+import org.apache.beam.runners.core.StateNamespace;
+import org.apache.beam.runners.core.StateNamespaceForTest;
+import org.apache.beam.runners.core.StateTag;
+import org.apache.beam.runners.core.StateTags;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkKeyGroupStateInternals;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.util.CoderUtils;
+import org.apache.beam.sdk.util.state.BagState;
+import org.apache.beam.sdk.util.state.ReadableState;
+import org.apache.flink.api.common.ExecutionConfig;
+import org.apache.flink.api.common.JobID;
+import org.apache.flink.api.java.typeutils.GenericTypeInfo;
+import org.apache.flink.runtime.jobgraph.JobVertexID;
+import org.apache.flink.runtime.operators.testutils.DummyEnvironment;
+import org.apache.flink.runtime.query.KvStateRegistry;
+import org.apache.flink.runtime.state.AbstractKeyedStateBackend;
+import org.apache.flink.runtime.state.KeyGroupRange;
+import org.apache.flink.runtime.state.KeyedStateBackend;
+import org.apache.flink.runtime.state.memory.MemoryStateBackend;
+import org.apache.flink.streaming.api.operators.KeyContext;
+import org.hamcrest.Matchers;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/**
+ * Tests for {@link FlinkKeyGroupStateInternals}. This is based on the tests for
+ * {@code InMemoryStateInternals}.
+ */
+@RunWith(JUnit4.class)
+public class FlinkKeyGroupStateInternalsTest {
+ private static final StateNamespace NAMESPACE_1 = new StateNamespaceForTest("ns1");
+ private static final StateNamespace NAMESPACE_2 = new StateNamespaceForTest("ns2");
+ private static final StateNamespace NAMESPACE_3 = new StateNamespaceForTest("ns3");
+
+ private static final StateTag<Object, BagState<String>> STRING_BAG_ADDR =
+ StateTags.bag("stringBag", StringUtf8Coder.of());
+
+ FlinkKeyGroupStateInternals<String> underTest;
+ private KeyedStateBackend keyedStateBackend;
+
+ @Before
+ public void initStateInternals() {
+ try {
+ keyedStateBackend = getKeyedStateBackend(2, new KeyGroupRange(0, 1));
+ underTest = new FlinkKeyGroupStateInternals<>(StringUtf8Coder.of(), keyedStateBackend);
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private KeyedStateBackend getKeyedStateBackend(int numberOfKeyGroups,
+ KeyGroupRange keyGroupRange) {
+ MemoryStateBackend backend = new MemoryStateBackend();
+ try {
+ AbstractKeyedStateBackend<ByteBuffer> keyedStateBackend = backend.createKeyedStateBackend(
+ new DummyEnvironment("test", 1, 0),
+ new JobID(),
+ "test_op",
+ new GenericTypeInfo<>(ByteBuffer.class).createSerializer(new ExecutionConfig()),
+ numberOfKeyGroups,
+ keyGroupRange,
+ new KvStateRegistry().createTaskRegistry(new JobID(), new JobVertexID()));
+ keyedStateBackend.setCurrentKey(ByteBuffer.wrap(
+ CoderUtils.encodeToByteArray(StringUtf8Coder.of(), "1")));
+ return keyedStateBackend;
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Test
+ public void testBag() throws Exception {
+ BagState<String> value = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
+
+ assertEquals(value, underTest.state(NAMESPACE_1, STRING_BAG_ADDR));
+ assertFalse(value.equals(underTest.state(NAMESPACE_2, STRING_BAG_ADDR)));
+
+ assertThat(value.read(), Matchers.emptyIterable());
+ value.add("hello");
+ assertThat(value.read(), Matchers.containsInAnyOrder("hello"));
+
+ value.add("world");
+ assertThat(value.read(), Matchers.containsInAnyOrder("hello", "world"));
+
+ value.clear();
+ assertThat(value.read(), Matchers.emptyIterable());
+ assertEquals(underTest.state(NAMESPACE_1, STRING_BAG_ADDR), value);
+
+ }
+
+ @Test
+ public void testBagIsEmpty() throws Exception {
+ BagState<String> value = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
+
+ assertThat(value.isEmpty().read(), Matchers.is(true));
+ ReadableState<Boolean> readFuture = value.isEmpty();
+ value.add("hello");
+ assertThat(readFuture.read(), Matchers.is(false));
+
+ value.clear();
+ assertThat(readFuture.read(), Matchers.is(true));
+ }
+
+ @Test
+ public void testMergeBagIntoSource() throws Exception {
+ BagState<String> bag1 = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
+ BagState<String> bag2 = underTest.state(NAMESPACE_2, STRING_BAG_ADDR);
+
+ bag1.add("Hello");
+ bag2.add("World");
+ bag1.add("!");
+
+ StateMerging.mergeBags(Arrays.asList(bag1, bag2), bag1);
+
+ // Reading the merged bag returns the contents of both bags
+ assertThat(bag1.read(), Matchers.containsInAnyOrder("Hello", "World", "!"));
+ assertThat(bag2.read(), Matchers.emptyIterable());
+ }
+
+ @Test
+ public void testMergeBagIntoNewNamespace() throws Exception {
+ BagState<String> bag1 = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
+ BagState<String> bag2 = underTest.state(NAMESPACE_2, STRING_BAG_ADDR);
+ BagState<String> bag3 = underTest.state(NAMESPACE_3, STRING_BAG_ADDR);
+
+ bag1.add("Hello");
+ bag2.add("World");
+ bag1.add("!");
+
+ StateMerging.mergeBags(Arrays.asList(bag1, bag2, bag3), bag3);
+
+ // Reading the merged bag returns the contents of both bags
+ assertThat(bag3.read(), Matchers.containsInAnyOrder("Hello", "World", "!"));
+ assertThat(bag1.read(), Matchers.emptyIterable());
+ assertThat(bag2.read(), Matchers.emptyIterable());
+ }
+
+ @Test
+ public void testKeyGroupAndCheckpoint() throws Exception {
+ // assign to keyGroup 0
+ ByteBuffer key0 = ByteBuffer.wrap(
+ CoderUtils.encodeToByteArray(StringUtf8Coder.of(), "11111111"));
+ // assign to keyGroup 1
+ ByteBuffer key1 = ByteBuffer.wrap(
+ CoderUtils.encodeToByteArray(StringUtf8Coder.of(), "22222222"));
+ FlinkKeyGroupStateInternals<String> allState;
+ {
+ KeyedStateBackend keyedStateBackend = getKeyedStateBackend(2, new KeyGroupRange(0, 1));
+ allState = new FlinkKeyGroupStateInternals<>(
+ StringUtf8Coder.of(), keyedStateBackend);
+ BagState<String> valueForNamespace0 = allState.state(NAMESPACE_1, STRING_BAG_ADDR);
+ BagState<String> valueForNamespace1 = allState.state(NAMESPACE_2, STRING_BAG_ADDR);
+ keyedStateBackend.setCurrentKey(key0);
+ valueForNamespace0.add("0");
+ valueForNamespace1.add("2");
+ keyedStateBackend.setCurrentKey(key1);
+ valueForNamespace0.add("1");
+ valueForNamespace1.add("3");
+ assertThat(valueForNamespace0.read(), Matchers.containsInAnyOrder("0", "1"));
+ assertThat(valueForNamespace1.read(), Matchers.containsInAnyOrder("2", "3"));
+ }
+
+ ClassLoader classLoader = FlinkKeyGroupStateInternalsTest.class.getClassLoader();
+
+ // 1. scale up
+ ByteArrayOutputStream out0 = new ByteArrayOutputStream();
+ allState.snapshotKeyGroupState(0, new DataOutputStream(out0));
+ DataInputStream in0 = new DataInputStream(
+ new ByteArrayInputStream(out0.toByteArray()));
+ {
+ KeyedStateBackend keyedStateBackend = getKeyedStateBackend(2, new KeyGroupRange(0, 0));
+ FlinkKeyGroupStateInternals<String> state0 =
+ new FlinkKeyGroupStateInternals<>(
+ StringUtf8Coder.of(), keyedStateBackend);
+ state0.restoreKeyGroupState(0, in0, classLoader);
+ BagState<String> valueForNamespace0 = state0.state(NAMESPACE_1, STRING_BAG_ADDR);
+ BagState<String> valueForNamespace1 = state0.state(NAMESPACE_2, STRING_BAG_ADDR);
+ assertThat(valueForNamespace0.read(), Matchers.containsInAnyOrder("0"));
+ assertThat(valueForNamespace1.read(), Matchers.containsInAnyOrder("2"));
+ }
+
+ ByteArrayOutputStream out1 = new ByteArrayOutputStream();
+ allState.snapshotKeyGroupState(1, new DataOutputStream(out1));
+ DataInputStream in1 = new DataInputStream(
+ new ByteArrayInputStream(out1.toByteArray()));
+ {
+ KeyedStateBackend keyedStateBackend = getKeyedStateBackend(2, new KeyGroupRange(1, 1));
+ FlinkKeyGroupStateInternals<String> state1 =
+ new FlinkKeyGroupStateInternals<>(
+ StringUtf8Coder.of(), keyedStateBackend);
+ state1.restoreKeyGroupState(1, in1, classLoader);
+ BagState<String> valueForNamespace0 = state1.state(NAMESPACE_1, STRING_BAG_ADDR);
+ BagState<String> valueForNamespace1 = state1.state(NAMESPACE_2, STRING_BAG_ADDR);
+ assertThat(valueForNamespace0.read(), Matchers.containsInAnyOrder("1"));
+ assertThat(valueForNamespace1.read(), Matchers.containsInAnyOrder("3"));
+ }
+
+ // 2. scale down
+ {
+ KeyedStateBackend keyedStateBackend = getKeyedStateBackend(2, new KeyGroupRange(0, 1));
+ FlinkKeyGroupStateInternals<String> newAllState = new FlinkKeyGroupStateInternals<>(
+ StringUtf8Coder.of(), keyedStateBackend);
+ in0.reset();
+ in1.reset();
+ newAllState.restoreKeyGroupState(0, in0, classLoader);
+ newAllState.restoreKeyGroupState(1, in1, classLoader);
+ BagState<String> valueForNamespace0 = newAllState.state(NAMESPACE_1, STRING_BAG_ADDR);
+ BagState<String> valueForNamespace1 = newAllState.state(NAMESPACE_2, STRING_BAG_ADDR);
+ assertThat(valueForNamespace0.read(), Matchers.containsInAnyOrder("0", "1"));
+ assertThat(valueForNamespace1.read(), Matchers.containsInAnyOrder("2", "3"));
+ }
+
+ }
+
+ private static class TestKeyContext implements KeyContext {
+
+ private Object key;
+
+ @Override
+ public void setCurrentKey(Object key) {
+ this.key = key;
+ }
+
+ @Override
+ public Object getCurrentKey() {
+ return key;
+ }
+ }
+
+}
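For the key-group assertions in testKeyGroupAndCheckpoint above, the test keys are chosen so that their encoded forms hash into key groups 0 and 1 when there are two key groups. A small sketch of how one can check that assignment with Flink's KeyGroupRangeAssignment; this is illustrative only and not part of the test.

import java.nio.ByteBuffer;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.flink.runtime.state.KeyGroupRangeAssignment;

// Sketch: print the key group each encoded test key is assigned to, which is
// how keys such as "11111111" and "22222222" can be picked to land in key
// groups 0 and 1 respectively when maxParallelism is 2.
class KeyGroupCheck {
  public static void main(String[] args) throws Exception {
    int maxParallelism = 2;
    for (String key : new String[] {"11111111", "22222222"}) {
      ByteBuffer encoded =
          ByteBuffer.wrap(CoderUtils.encodeToByteArray(StringUtf8Coder.of(), key));
      int keyGroup = KeyGroupRangeAssignment.assignToKeyGroup(encoded, maxParallelism);
      System.out.println(key + " -> key group " + keyGroup);
    }
  }
}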
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/FlinkSplitStateInternalsTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/FlinkSplitStateInternalsTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/FlinkSplitStateInternalsTest.java
new file mode 100644
index 0000000..08ae0c4
--- /dev/null
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/FlinkSplitStateInternalsTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.streaming;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertThat;
+
+import org.apache.beam.runners.core.StateNamespace;
+import org.apache.beam.runners.core.StateNamespaceForTest;
+import org.apache.beam.runners.core.StateTag;
+import org.apache.beam.runners.core.StateTags;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkSplitStateInternals;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.util.state.BagState;
+import org.apache.beam.sdk.util.state.ReadableState;
+import org.apache.flink.runtime.operators.testutils.DummyEnvironment;
+import org.apache.flink.runtime.state.OperatorStateBackend;
+import org.apache.flink.runtime.state.memory.MemoryStateBackend;
+import org.hamcrest.Matchers;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/**
+ * Tests for {@link FlinkSplitStateInternals}. This is based on the tests for
+ * {@code InMemoryStateInternals}.
+ */
+@RunWith(JUnit4.class)
+public class FlinkSplitStateInternalsTest {
+ private static final StateNamespace NAMESPACE_1 = new StateNamespaceForTest("ns1");
+ private static final StateNamespace NAMESPACE_2 = new StateNamespaceForTest("ns2");
+
+ private static final StateTag<Object, BagState<String>> STRING_BAG_ADDR =
+ StateTags.bag("stringBag", StringUtf8Coder.of());
+
+ FlinkSplitStateInternals<String> underTest;
+
+ @Before
+ public void initStateInternals() {
+ MemoryStateBackend backend = new MemoryStateBackend();
+ try {
+ OperatorStateBackend operatorStateBackend =
+ backend.createOperatorStateBackend(new DummyEnvironment("test", 1, 0), "");
+ underTest = new FlinkSplitStateInternals<>(operatorStateBackend);
+
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Test
+ public void testBag() throws Exception {
+ BagState<String> value = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
+
+ assertEquals(value, underTest.state(NAMESPACE_1, STRING_BAG_ADDR));
+ assertFalse(value.equals(underTest.state(NAMESPACE_2, STRING_BAG_ADDR)));
+
+ assertThat(value.read(), Matchers.emptyIterable());
+ value.add("hello");
+ assertThat(value.read(), Matchers.containsInAnyOrder("hello"));
+
+ value.add("world");
+ assertThat(value.read(), Matchers.containsInAnyOrder("hello", "world"));
+
+ value.clear();
+ assertThat(value.read(), Matchers.emptyIterable());
+ assertEquals(underTest.state(NAMESPACE_1, STRING_BAG_ADDR), value);
+
+ }
+
+ @Test
+ public void testBagIsEmpty() throws Exception {
+ BagState<String> value = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
+
+ assertThat(value.isEmpty().read(), Matchers.is(true));
+ ReadableState<Boolean> readFuture = value.isEmpty();
+ value.add("hello");
+ assertThat(readFuture.read(), Matchers.is(false));
+
+ value.clear();
+ assertThat(readFuture.read(), Matchers.is(true));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/FlinkStateInternalsTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/FlinkStateInternalsTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/FlinkStateInternalsTest.java
new file mode 100644
index 0000000..d140271
--- /dev/null
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/FlinkStateInternalsTest.java
@@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.streaming;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertThat;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import org.apache.beam.runners.core.StateMerging;
+import org.apache.beam.runners.core.StateNamespace;
+import org.apache.beam.runners.core.StateNamespaceForTest;
+import org.apache.beam.runners.core.StateTag;
+import org.apache.beam.runners.core.StateTags;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkStateInternals;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.coders.VarIntCoder;
+import org.apache.beam.sdk.transforms.Sum;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
+import org.apache.beam.sdk.transforms.windowing.OutputTimeFns;
+import org.apache.beam.sdk.util.CoderUtils;
+import org.apache.beam.sdk.util.state.BagState;
+import org.apache.beam.sdk.util.state.CombiningState;
+import org.apache.beam.sdk.util.state.GroupingState;
+import org.apache.beam.sdk.util.state.ReadableState;
+import org.apache.beam.sdk.util.state.ValueState;
+import org.apache.beam.sdk.util.state.WatermarkHoldState;
+import org.apache.flink.api.common.ExecutionConfig;
+import org.apache.flink.api.common.JobID;
+import org.apache.flink.api.java.typeutils.GenericTypeInfo;
+import org.apache.flink.runtime.jobgraph.JobVertexID;
+import org.apache.flink.runtime.operators.testutils.DummyEnvironment;
+import org.apache.flink.runtime.query.KvStateRegistry;
+import org.apache.flink.runtime.state.AbstractKeyedStateBackend;
+import org.apache.flink.runtime.state.KeyGroupRange;
+import org.apache.flink.runtime.state.memory.MemoryStateBackend;
+import org.hamcrest.Matchers;
+import org.joda.time.Instant;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/**
+ * Tests for {@link FlinkStateInternals}. This is based on the tests for
+ * {@code InMemoryStateInternals}.
+ */
+@RunWith(JUnit4.class)
+public class FlinkStateInternalsTest {
+ private static final BoundedWindow WINDOW_1 = new IntervalWindow(new Instant(0), new Instant(10));
+ private static final StateNamespace NAMESPACE_1 = new StateNamespaceForTest("ns1");
+ private static final StateNamespace NAMESPACE_2 = new StateNamespaceForTest("ns2");
+ private static final StateNamespace NAMESPACE_3 = new StateNamespaceForTest("ns3");
+
+ private static final StateTag<Object, ValueState<String>> STRING_VALUE_ADDR =
+ StateTags.value("stringValue", StringUtf8Coder.of());
+ private static final StateTag<Object, CombiningState<Integer, int[], Integer>>
+ SUM_INTEGER_ADDR = StateTags.combiningValueFromInputInternal(
+ "sumInteger", VarIntCoder.of(), Sum.ofIntegers());
+ private static final StateTag<Object, BagState<String>> STRING_BAG_ADDR =
+ StateTags.bag("stringBag", StringUtf8Coder.of());
+ private static final StateTag<Object, WatermarkHoldState<BoundedWindow>>
+ WATERMARK_EARLIEST_ADDR =
+ StateTags.watermarkStateInternal("watermark", OutputTimeFns.outputAtEarliestInputTimestamp());
+ private static final StateTag<Object, WatermarkHoldState<BoundedWindow>>
+ WATERMARK_LATEST_ADDR =
+ StateTags.watermarkStateInternal("watermark", OutputTimeFns.outputAtLatestInputTimestamp());
+ private static final StateTag<Object, WatermarkHoldState<BoundedWindow>> WATERMARK_EOW_ADDR =
+ StateTags.watermarkStateInternal("watermark", OutputTimeFns.outputAtEndOfWindow());
+
+ FlinkStateInternals<String> underTest;
+
+ @Before
+ public void initStateInternals() {
+ MemoryStateBackend backend = new MemoryStateBackend();
+ try {
+ AbstractKeyedStateBackend<ByteBuffer> keyedStateBackend = backend.createKeyedStateBackend(
+ new DummyEnvironment("test", 1, 0),
+ new JobID(),
+ "test_op",
+ new GenericTypeInfo<>(ByteBuffer.class).createSerializer(new ExecutionConfig()),
+ 1,
+ new KeyGroupRange(0, 0),
+ new KvStateRegistry().createTaskRegistry(new JobID(), new JobVertexID()));
+ underTest = new FlinkStateInternals<>(keyedStateBackend, StringUtf8Coder.of());
+
+ keyedStateBackend.setCurrentKey(
+ ByteBuffer.wrap(CoderUtils.encodeToByteArray(StringUtf8Coder.of(), "Hello")));
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Test
+ public void testValue() throws Exception {
+ ValueState<String> value = underTest.state(NAMESPACE_1, STRING_VALUE_ADDR);
+
+ assertEquals(underTest.state(NAMESPACE_1, STRING_VALUE_ADDR), value);
+ assertNotEquals(
+ underTest.state(NAMESPACE_2, STRING_VALUE_ADDR),
+ value);
+
+ assertThat(value.read(), Matchers.nullValue());
+ value.write("hello");
+ assertThat(value.read(), Matchers.equalTo("hello"));
+ value.write("world");
+ assertThat(value.read(), Matchers.equalTo("world"));
+
+ value.clear();
+ assertThat(value.read(), Matchers.nullValue());
+ assertEquals(underTest.state(NAMESPACE_1, STRING_VALUE_ADDR), value);
+
+ }
+
+ @Test
+ public void testBag() throws Exception {
+ BagState<String> value = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
+
+ assertEquals(value, underTest.state(NAMESPACE_1, STRING_BAG_ADDR));
+ assertFalse(value.equals(underTest.state(NAMESPACE_2, STRING_BAG_ADDR)));
+
+ assertThat(value.read(), Matchers.emptyIterable());
+ value.add("hello");
+ assertThat(value.read(), Matchers.containsInAnyOrder("hello"));
+
+ value.add("world");
+ assertThat(value.read(), Matchers.containsInAnyOrder("hello", "world"));
+
+ value.clear();
+ assertThat(value.read(), Matchers.emptyIterable());
+ assertEquals(underTest.state(NAMESPACE_1, STRING_BAG_ADDR), value);
+
+ }
+
+ @Test
+ public void testBagIsEmpty() throws Exception {
+ BagState<String> value = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
+
+ assertThat(value.isEmpty().read(), Matchers.is(true));
+ ReadableState<Boolean> readFuture = value.isEmpty();
+ value.add("hello");
+ assertThat(readFuture.read(), Matchers.is(false));
+
+ value.clear();
+ assertThat(readFuture.read(), Matchers.is(true));
+ }
+
+ @Test
+ public void testMergeBagIntoSource() throws Exception {
+ BagState<String> bag1 = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
+ BagState<String> bag2 = underTest.state(NAMESPACE_2, STRING_BAG_ADDR);
+
+ bag1.add("Hello");
+ bag2.add("World");
+ bag1.add("!");
+
+ StateMerging.mergeBags(Arrays.asList(bag1, bag2), bag1);
+
+ // Reading the merged bag returns the contents of both source bags.
+ assertThat(bag1.read(), Matchers.containsInAnyOrder("Hello", "World", "!"));
+ assertThat(bag2.read(), Matchers.emptyIterable());
+ }
+
+ @Test
+ public void testMergeBagIntoNewNamespace() throws Exception {
+ BagState<String> bag1 = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
+ BagState<String> bag2 = underTest.state(NAMESPACE_2, STRING_BAG_ADDR);
+ BagState<String> bag3 = underTest.state(NAMESPACE_3, STRING_BAG_ADDR);
+
+ bag1.add("Hello");
+ bag2.add("World");
+ bag1.add("!");
+
+ StateMerging.mergeBags(Arrays.asList(bag1, bag2, bag3), bag3);
+
+ // Reading the merged bag returns the combined contents of all source bags.
+ assertThat(bag3.read(), Matchers.containsInAnyOrder("Hello", "World", "!"));
+ assertThat(bag1.read(), Matchers.emptyIterable());
+ assertThat(bag2.read(), Matchers.emptyIterable());
+ }
+
+ @Test
+ public void testCombiningValue() throws Exception {
+ GroupingState<Integer, Integer> value = underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR);
+
+ // State instances are cached, but depend on the namespace.
+ assertEquals(value, underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR));
+ assertFalse(value.equals(underTest.state(NAMESPACE_2, SUM_INTEGER_ADDR)));
+
+ assertThat(value.read(), Matchers.equalTo(0));
+ value.add(2);
+ assertThat(value.read(), Matchers.equalTo(2));
+
+ value.add(3);
+ assertThat(value.read(), Matchers.equalTo(5));
+
+ value.clear();
+ assertThat(value.read(), Matchers.equalTo(0));
+ assertEquals(underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR), value);
+ }
+
+ @Test
+ public void testCombiningIsEmpty() throws Exception {
+ GroupingState<Integer, Integer> value = underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR);
+
+ assertThat(value.isEmpty().read(), Matchers.is(true));
+ ReadableState<Boolean> readFuture = value.isEmpty();
+ value.add(5);
+ assertThat(readFuture.read(), Matchers.is(false));
+
+ value.clear();
+ assertThat(readFuture.read(), Matchers.is(true));
+ }
+
+ @Test
+ public void testMergeCombiningValueIntoSource() throws Exception {
+ CombiningState<Integer, int[], Integer> value1 =
+ underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR);
+ CombiningState<Integer, int[], Integer> value2 =
+ underTest.state(NAMESPACE_2, SUM_INTEGER_ADDR);
+
+ value1.add(5);
+ value2.add(10);
+ value1.add(6);
+
+ assertThat(value1.read(), Matchers.equalTo(11));
+ assertThat(value2.read(), Matchers.equalTo(10));
+
+ // Merging clears the old values and updates the result value.
+ StateMerging.mergeCombiningValues(Arrays.asList(value1, value2), value1);
+
+ assertThat(value1.read(), Matchers.equalTo(21));
+ assertThat(value2.read(), Matchers.equalTo(0));
+ }
+
+ @Test
+ public void testMergeCombiningValueIntoNewNamespace() throws Exception {
+ CombiningState<Integer, int[], Integer> value1 =
+ underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR);
+ CombiningState<Integer, int[], Integer> value2 =
+ underTest.state(NAMESPACE_2, SUM_INTEGER_ADDR);
+ CombiningState<Integer, int[], Integer> value3 =
+ underTest.state(NAMESPACE_3, SUM_INTEGER_ADDR);
+
+ value1.add(5);
+ value2.add(10);
+ value1.add(6);
+
+ StateMerging.mergeCombiningValues(Arrays.asList(value1, value2), value3);
+
+ // Merging clears the old values and updates the result value.
+ assertThat(value1.read(), Matchers.equalTo(0));
+ assertThat(value2.read(), Matchers.equalTo(0));
+ assertThat(value3.read(), Matchers.equalTo(21));
+ }
+
+ @Test
+ public void testWatermarkEarliestState() throws Exception {
+ WatermarkHoldState<BoundedWindow> value =
+ underTest.state(NAMESPACE_1, WATERMARK_EARLIEST_ADDR);
+
+ // State instances are cached, but depend on the namespace.
+ assertEquals(value, underTest.state(NAMESPACE_1, WATERMARK_EARLIEST_ADDR));
+ assertFalse(value.equals(underTest.state(NAMESPACE_2, WATERMARK_EARLIEST_ADDR)));
+
+ assertThat(value.read(), Matchers.nullValue());
+ value.add(new Instant(2000));
+ assertThat(value.read(), Matchers.equalTo(new Instant(2000)));
+
+ value.add(new Instant(3000));
+ assertThat(value.read(), Matchers.equalTo(new Instant(2000)));
+
+ value.add(new Instant(1000));
+ assertThat(value.read(), Matchers.equalTo(new Instant(1000)));
+
+ value.clear();
+ assertThat(value.read(), Matchers.equalTo(null));
+ assertEquals(underTest.state(NAMESPACE_1, WATERMARK_EARLIEST_ADDR), value);
+ }
+
+ @Test
+ public void testWatermarkLatestState() throws Exception {
+ WatermarkHoldState<BoundedWindow> value =
+ underTest.state(NAMESPACE_1, WATERMARK_LATEST_ADDR);
+
+ // State instances are cached, but depend on the namespace.
+ assertEquals(value, underTest.state(NAMESPACE_1, WATERMARK_LATEST_ADDR));
+ assertFalse(value.equals(underTest.state(NAMESPACE_2, WATERMARK_LATEST_ADDR)));
+
+ assertThat(value.read(), Matchers.nullValue());
+ value.add(new Instant(2000));
+ assertThat(value.read(), Matchers.equalTo(new Instant(2000)));
+
+ value.add(new Instant(3000));
+ assertThat(value.read(), Matchers.equalTo(new Instant(3000)));
+
+ value.add(new Instant(1000));
+ assertThat(value.read(), Matchers.equalTo(new Instant(3000)));
+
+ value.clear();
+ assertThat(value.read(), Matchers.equalTo(null));
+ assertEquals(underTest.state(NAMESPACE_1, WATERMARK_LATEST_ADDR), value);
+ }
+
+ @Test
+ public void testWatermarkEndOfWindowState() throws Exception {
+ WatermarkHoldState<BoundedWindow> value = underTest.state(NAMESPACE_1, WATERMARK_EOW_ADDR);
+
+ // State instances are cached, but depend on the namespace.
+ assertEquals(value, underTest.state(NAMESPACE_1, WATERMARK_EOW_ADDR));
+ assertFalse(value.equals(underTest.state(NAMESPACE_2, WATERMARK_EOW_ADDR)));
+
+ assertThat(value.read(), Matchers.nullValue());
+ value.add(new Instant(2000));
+ assertThat(value.read(), Matchers.equalTo(new Instant(2000)));
+
+ value.clear();
+ assertThat(value.read(), Matchers.equalTo(null));
+ assertEquals(underTest.state(NAMESPACE_1, WATERMARK_EOW_ADDR), value);
+ }
+
+ @Test
+ public void testWatermarkStateIsEmpty() throws Exception {
+ WatermarkHoldState<BoundedWindow> value =
+ underTest.state(NAMESPACE_1, WATERMARK_EARLIEST_ADDR);
+
+ assertThat(value.isEmpty().read(), Matchers.is(true));
+ ReadableState<Boolean> readFuture = value.isEmpty();
+ value.add(new Instant(1000));
+ assertThat(readFuture.read(), Matchers.is(false));
+
+ value.clear();
+ assertThat(readFuture.read(), Matchers.is(true));
+ }
+
+ @Test
+ public void testMergeEarliestWatermarkIntoSource() throws Exception {
+ WatermarkHoldState<BoundedWindow> value1 =
+ underTest.state(NAMESPACE_1, WATERMARK_EARLIEST_ADDR);
+ WatermarkHoldState<BoundedWindow> value2 =
+ underTest.state(NAMESPACE_2, WATERMARK_EARLIEST_ADDR);
+
+ value1.add(new Instant(3000));
+ value2.add(new Instant(5000));
+ value1.add(new Instant(4000));
+ value2.add(new Instant(2000));
+
+ // Merging clears the old values and updates the merged value.
+ StateMerging.mergeWatermarks(Arrays.asList(value1, value2), value1, WINDOW_1);
+
+ assertThat(value1.read(), Matchers.equalTo(new Instant(2000)));
+ assertThat(value2.read(), Matchers.equalTo(null));
+ }
+
+ @Test
+ public void testMergeLatestWatermarkIntoSource() throws Exception {
+ WatermarkHoldState<BoundedWindow> value1 =
+ underTest.state(NAMESPACE_1, WATERMARK_LATEST_ADDR);
+ WatermarkHoldState<BoundedWindow> value2 =
+ underTest.state(NAMESPACE_2, WATERMARK_LATEST_ADDR);
+ WatermarkHoldState<BoundedWindow> value3 =
+ underTest.state(NAMESPACE_3, WATERMARK_LATEST_ADDR);
+
+ value1.add(new Instant(3000));
+ value2.add(new Instant(5000));
+ value1.add(new Instant(4000));
+ value2.add(new Instant(2000));
+
+ // Merging clears the old values and updates the result value.
+ StateMerging.mergeWatermarks(Arrays.asList(value1, value2), value3, WINDOW_1);
+
+ // The merged hold resolves to the latest timestamp; the source holds are cleared.
+ assertThat(value3.read(), Matchers.equalTo(new Instant(5000)));
+ assertThat(value1.read(), Matchers.equalTo(null));
+ assertThat(value2.read(), Matchers.equalTo(null));
+ }
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/GroupByNullKeyTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/GroupByNullKeyTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/GroupByNullKeyTest.java
new file mode 100644
index 0000000..663b910
--- /dev/null
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/GroupByNullKeyTest.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.streaming;
+
+import com.google.common.base.Joiner;
+import java.io.Serializable;
+import java.util.Arrays;
+import org.apache.beam.runners.flink.FlinkTestPipeline;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.io.TextIO;
+import org.apache.beam.sdk.transforms.Create;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.GroupByKey;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.windowing.AfterWatermark;
+import org.apache.beam.sdk.transforms.windowing.FixedWindows;
+import org.apache.beam.sdk.transforms.windowing.Window;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.flink.streaming.util.StreamingProgramTestBase;
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+
+/**
+ * Test for GroupByNullKey.
+ */
+public class GroupByNullKeyTest extends StreamingProgramTestBase implements Serializable {
+
+ protected String resultPath;
+
+ static final String[] EXPECTED_RESULT = new String[] {
+ "k: null v: user1 user1 user1 user2 user2 user2 user2 user3"
+ };
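+ // All input timestamps below (0 ms to 25000 ms) fall inside a single one-hour fixed
+ // window, so the pipeline emits exactly one group, keyed by null, containing all
+ // eight user names.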
+
+ public GroupByNullKeyTest() {
+ }
+
+ @Override
+ protected void preSubmit() throws Exception {
+ resultPath = getTempDirPath("result");
+ }
+
+ @Override
+ protected void postSubmit() throws Exception {
+ compareResultsByLinesInMemory(Joiner.on('\n').join(EXPECTED_RESULT), resultPath);
+ }
+
+ /**
+ * DoFn extracting user and timestamp.
+ */
+ private static class ExtractUserAndTimestamp extends DoFn<KV<Integer, String>, String> {
+ @ProcessElement
+ public void processElement(ProcessContext c) {
+ KV<Integer, String> record = c.element();
+ int timestamp = record.getKey();
+ String userName = record.getValue();
+ if (userName != null) {
+ // Sets the implicit timestamp field to be used in windowing.
+ c.outputWithTimestamp(userName, new Instant(timestamp));
+ }
+ }
+ }
+
+ @Override
+ protected void testProgram() throws Exception {
+
+ Pipeline p = FlinkTestPipeline.createForStreaming();
+
+ PCollection<String> output =
+ p.apply(Create.of(Arrays.asList(
+ KV.<Integer, String>of(0, "user1"),
+ KV.<Integer, String>of(1, "user1"),
+ KV.<Integer, String>of(2, "user1"),
+ KV.<Integer, String>of(10, "user2"),
+ KV.<Integer, String>of(1, "user2"),
+ KV.<Integer, String>of(15000, "user2"),
+ KV.<Integer, String>of(12000, "user2"),
+ KV.<Integer, String>of(25000, "user3"))))
+ .apply(ParDo.of(new ExtractUserAndTimestamp()))
+ .apply(Window.<String>into(FixedWindows.of(Duration.standardHours(1)))
+ .triggering(AfterWatermark.pastEndOfWindow())
+ .withAllowedLateness(Duration.ZERO)
+ .discardingFiredPanes())
+
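+ // Re-key every element to a single null key so that GroupByKey collects the whole
+ // window's contents into one group (as described in the comment above EXPECTED_RESULT).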
+ .apply(ParDo.of(new DoFn<String, KV<Void, String>>() {
+ @ProcessElement
+ public void processElement(ProcessContext c) throws Exception {
+ String elem = c.element();
+ c.output(KV.<Void, String>of(null, elem));
+ }
+ }))
+ .apply(GroupByKey.<Void, String>create())
+ .apply(ParDo.of(new DoFn<KV<Void, Iterable<String>>, String>() {
+ @ProcessElement
+ public void processElement(ProcessContext c) throws Exception {
+ KV<Void, Iterable<String>> elem = c.element();
+ StringBuilder str = new StringBuilder();
+ str.append("k: " + elem.getKey() + " v:");
+ for (String v : elem.getValue()) {
+ str.append(" " + v);
+ }
+ c.output(str.toString());
+ }
+ }));
+ output.apply(TextIO.Write.to(resultPath));
+ p.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/TestCountingSource.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/TestCountingSource.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/TestCountingSource.java
new file mode 100644
index 0000000..3a08088
--- /dev/null
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/TestCountingSource.java
@@ -0,0 +1,254 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.streaming;
+
+import static org.apache.beam.sdk.util.CoderUtils.encodeToByteArray;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ThreadLocalRandom;
+import javax.annotation.Nullable;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.DelegateCoder;
+import org.apache.beam.sdk.coders.KvCoder;
+import org.apache.beam.sdk.coders.VarIntCoder;
+import org.apache.beam.sdk.io.UnboundedSource;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.values.KV;
+import org.joda.time.Instant;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An unbounded source for testing the unbounded sources framework code.
+ *
+ * <p>Each split of this source produces records of the form KV(split_id, i),
+ * where i counts up from 0. Each record has a timestamp of i, and the watermark
+ * accurately tracks these timestamps. The reader will occasionally return false
+ * from {@code advance}, in order to simulate a source where not all the data is
+ * available immediately.
+ */
+public class TestCountingSource
+ extends UnboundedSource<KV<Integer, Integer>, TestCountingSource.CounterMark> {
+ private static final Logger LOG = LoggerFactory.getLogger(TestCountingSource.class);
+
+ private static List<Integer> finalizeTracker;
+ private final int numMessagesPerShard;
+ private final int shardNumber;
+ private final boolean dedup;
+ private final boolean throwOnFirstSnapshot;
+ private final boolean allowSplitting;
+
+ /**
+ * We only allow an exception to be thrown from getCheckpointMark
+ * at most once. This must be static since the entire TestCountingSource
+ * instance may be re-serialized when the pipeline recovers and retries.
+ */
+ private static boolean thrown = false;
+
+ public static void setFinalizeTracker(List<Integer> finalizeTracker) {
+ TestCountingSource.finalizeTracker = finalizeTracker;
+ }
+
+ public TestCountingSource(int numMessagesPerShard) {
+ this(numMessagesPerShard, 0, false, false, true);
+ }
+
+ public TestCountingSource withDedup() {
+ return new TestCountingSource(
+ numMessagesPerShard, shardNumber, true, throwOnFirstSnapshot, true);
+ }
+
+ private TestCountingSource withShardNumber(int shardNumber) {
+ return new TestCountingSource(
+ numMessagesPerShard, shardNumber, dedup, throwOnFirstSnapshot, true);
+ }
+
+ public TestCountingSource withThrowOnFirstSnapshot(boolean throwOnFirstSnapshot) {
+ return new TestCountingSource(
+ numMessagesPerShard, shardNumber, dedup, throwOnFirstSnapshot, true);
+ }
+
+ public TestCountingSource withoutSplitting() {
+ return new TestCountingSource(
+ numMessagesPerShard, shardNumber, dedup, throwOnFirstSnapshot, false);
+ }
+
+ private TestCountingSource(int numMessagesPerShard, int shardNumber, boolean dedup,
+ boolean throwOnFirstSnapshot, boolean allowSplitting) {
+ this.numMessagesPerShard = numMessagesPerShard;
+ this.shardNumber = shardNumber;
+ this.dedup = dedup;
+ this.throwOnFirstSnapshot = throwOnFirstSnapshot;
+ this.allowSplitting = allowSplitting;
+ }
+
+ public int getShardNumber() {
+ return shardNumber;
+ }
+
+ @Override
+ public List<TestCountingSource> split(
+ int desiredNumSplits, PipelineOptions options) {
+ List<TestCountingSource> splits = new ArrayList<>();
+ int numSplits = allowSplitting ? desiredNumSplits : 1;
+ for (int i = 0; i < numSplits; i++) {
+ splits.add(withShardNumber(i));
+ }
+ return splits;
+ }
+
+ class CounterMark implements UnboundedSource.CheckpointMark {
+ int current;
+
+ public CounterMark(int current) {
+ this.current = current;
+ }
+
+ @Override
+ public void finalizeCheckpoint() {
+ if (finalizeTracker != null) {
+ finalizeTracker.add(current);
+ }
+ }
+ }
+
+ @Override
+ public Coder<CounterMark> getCheckpointMarkCoder() {
+ return DelegateCoder.of(
+ VarIntCoder.of(),
+ new DelegateCoder.CodingFunction<CounterMark, Integer>() {
+ @Override
+ public Integer apply(CounterMark input) {
+ return input.current;
+ }
+ },
+ new DelegateCoder.CodingFunction<Integer, CounterMark>() {
+ @Override
+ public CounterMark apply(Integer input) {
+ return new CounterMark(input);
+ }
+ });
+ }
+
+ @Override
+ public boolean requiresDeduping() {
+ return dedup;
+ }
+
+ /**
+ * Public only so that the checkpoint can be conveyed from {@link #getCheckpointMark()} to
+ * {@link TestCountingSource#createReader(PipelineOptions, CounterMark)} without cast.
+ */
+ public class CountingSourceReader extends UnboundedReader<KV<Integer, Integer>> {
+ private int current;
+
+ public CountingSourceReader(int startingPoint) {
+ this.current = startingPoint;
+ }
+
+ @Override
+ public boolean start() {
+ return advance();
+ }
+
+ @Override
+ public boolean advance() {
+ if (current >= numMessagesPerShard - 1) {
+ return false;
+ }
+ // If testing dedup, occasionally re-emit the current value so the reader produces a duplicate record.
+ if (current >= 0 && dedup && ThreadLocalRandom.current().nextInt(5) == 0) {
+ return true;
+ }
+ current++;
+ return true;
+ }
+
+ @Override
+ public KV<Integer, Integer> getCurrent() {
+ return KV.of(shardNumber, current);
+ }
+
+ @Override
+ public Instant getCurrentTimestamp() {
+ return new Instant(current);
+ }
+
+ @Override
+ public byte[] getCurrentRecordId() {
+ try {
+ return encodeToByteArray(KvCoder.of(VarIntCoder.of(), VarIntCoder.of()), getCurrent());
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ public void close() {}
+
+ @Override
+ public TestCountingSource getCurrentSource() {
+ return TestCountingSource.this;
+ }
+
+ @Override
+ public Instant getWatermark() {
+ // The watermark is a promise about future elements, and the timestamps of elements are
+ // strictly increasing for this source.
+ return new Instant(current + 1);
+ }
+
+ @Override
+ public CounterMark getCheckpointMark() {
+ if (throwOnFirstSnapshot && !thrown) {
+ thrown = true;
+ LOG.error("Throwing exception while checkpointing counter");
+ throw new RuntimeException("failed during checkpoint");
+ }
+ // The checkpoint can assume all records read, including the current, have
+ // been committed.
+ return new CounterMark(current);
+ }
+
+ @Override
+ public long getSplitBacklogBytes() {
+ return 7L;
+ }
+ }
+
+ @Override
+ public CountingSourceReader createReader(
+ PipelineOptions options, @Nullable CounterMark checkpointMark) {
+ if (checkpointMark == null) {
+ LOG.debug("creating reader");
+ } else {
+ LOG.debug("restoring reader from checkpoint with current = {}", checkpointMark.current);
+ }
+ return new CountingSourceReader(checkpointMark != null ? checkpointMark.current : -1);
+ }
+
+ @Override
+ public void validate() {}
+
+ @Override
+ public Coder<KV<Integer, Integer>> getDefaultOutputCoder() {
+ return KvCoder.of(VarIntCoder.of(), VarIntCoder.of());
+ }
+}
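
A minimal sketch of driving this source directly through its reader API (the shard
message count of 3 and the local variable names are arbitrary; it assumes the snippet
lives in the same package as the class above, with PipelineOptionsFactory imported in
addition to the imports already present):

    TestCountingSource source = new TestCountingSource(3).withoutSplitting();
    TestCountingSource.CountingSourceReader reader =
        source.createReader(PipelineOptionsFactory.create(), null /* no checkpoint */);
    for (boolean more = reader.start(); more; more = reader.advance()) {
      // Records are KV(shardNumber, i) with timestamp i; with a single shard the key is 0.
      KV<Integer, Integer> record = reader.getCurrent();
      Instant timestamp = reader.getCurrentTimestamp();
    }
    // The watermark runs one ahead of the last emitted value, and the checkpoint mark
    // records that value so a reader restored from it resumes with the next one.
    TestCountingSource.CounterMark mark = reader.getCheckpointMark();
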
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/TopWikipediaSessionsITCase.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/TopWikipediaSessionsITCase.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/TopWikipediaSessionsITCase.java
new file mode 100644
index 0000000..9e6bba8
--- /dev/null
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/TopWikipediaSessionsITCase.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.streaming;
+
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.common.base.Joiner;
+import java.io.Serializable;
+import java.util.Arrays;
+import org.apache.beam.runners.flink.FlinkTestPipeline;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.io.TextIO;
+import org.apache.beam.sdk.transforms.Count;
+import org.apache.beam.sdk.transforms.Create;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.windowing.Sessions;
+import org.apache.beam.sdk.transforms.windowing.Window;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.flink.streaming.util.StreamingProgramTestBase;
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+
+
+/**
+ * Session window test.
+ */
+public class TopWikipediaSessionsITCase extends StreamingProgramTestBase implements Serializable {
+ protected String resultPath;
+
+ public TopWikipediaSessionsITCase() {
+ }
+
+ static final String[] EXPECTED_RESULT = new String[] {
+ "user: user1 value:3",
+ "user: user1 value:1",
+ "user: user2 value:4",
+ "user: user2 value:6",
+ "user: user3 value:7",
+ "user: user3 value:2"
+ };
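+ // With the one-minute session gap used below (timestamps are in seconds before being
+ // scaled to millis): user1 edits at now..now+2 form one session (3) and now+230 a
+ // second (1); user2 edits at now..now+8 form one session (6) and now+200..now+241 a
+ // second (4); user3 edits at now and now+10 form one session (2) and now+235..now+245
+ // a second (7).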
+
+ @Override
+ protected void preSubmit() throws Exception {
+ resultPath = getTempDirPath("result");
+ }
+
+ @Override
+ protected void postSubmit() throws Exception {
+ compareResultsByLinesInMemory(Joiner.on('\n').join(EXPECTED_RESULT), resultPath);
+ }
+
+ @Override
+ protected void testProgram() throws Exception {
+
+ Pipeline p = FlinkTestPipeline.createForStreaming();
+
+ Long now = (System.currentTimeMillis() + 10000) / 1000;
+
+ PCollection<KV<String, Long>> output =
+ p.apply(Create.of(Arrays.asList(
+ new TableRow().set("timestamp", now).set("contributor_username", "user1"),
+ new TableRow().set("timestamp", now + 10).set("contributor_username", "user3"),
+ new TableRow().set("timestamp", now).set("contributor_username", "user2"),
+ new TableRow().set("timestamp", now).set("contributor_username", "user1"),
+ new TableRow().set("timestamp", now + 2).set("contributor_username", "user1"),
+ new TableRow().set("timestamp", now).set("contributor_username", "user2"),
+ new TableRow().set("timestamp", now + 1).set("contributor_username", "user2"),
+ new TableRow().set("timestamp", now + 5).set("contributor_username", "user2"),
+ new TableRow().set("timestamp", now + 7).set("contributor_username", "user2"),
+ new TableRow().set("timestamp", now + 8).set("contributor_username", "user2"),
+ new TableRow().set("timestamp", now + 200).set("contributor_username", "user2"),
+ new TableRow().set("timestamp", now + 230).set("contributor_username", "user1"),
+ new TableRow().set("timestamp", now + 230).set("contributor_username", "user2"),
+ new TableRow().set("timestamp", now + 240).set("contributor_username", "user2"),
+ new TableRow().set("timestamp", now + 245).set("contributor_username", "user3"),
+ new TableRow().set("timestamp", now + 235).set("contributor_username", "user3"),
+ new TableRow().set("timestamp", now + 236).set("contributor_username", "user3"),
+ new TableRow().set("timestamp", now + 237).set("contributor_username", "user3"),
+ new TableRow().set("timestamp", now + 238).set("contributor_username", "user3"),
+ new TableRow().set("timestamp", now + 239).set("contributor_username", "user3"),
+ new TableRow().set("timestamp", now + 240).set("contributor_username", "user3"),
+ new TableRow().set("timestamp", now + 241).set("contributor_username", "user2"),
+ new TableRow().set("timestamp", now).set("contributor_username", "user3"))))
+
+ .apply(ParDo.of(new DoFn<TableRow, String>() {
+ @ProcessElement
+ public void processElement(ProcessContext c) throws Exception {
+ TableRow row = c.element();
+ long timestamp = (Integer) row.get("timestamp");
+ String userName = (String) row.get("contributor_username");
+ if (userName != null) {
+ // Sets the timestamp field to be used in windowing.
+ c.outputWithTimestamp(userName, new Instant(timestamp * 1000L));
+ }
+ }
+ }))
+
+ .apply(Window.<String>into(Sessions.withGapDuration(Duration.standardMinutes(1))))
+
+ .apply(Count.<String>perElement());
+
+ PCollection<String> format = output.apply(ParDo.of(new DoFn<KV<String, Long>, String>() {
+ @ProcessElement
+ public void processElement(ProcessContext c) throws Exception {
+ KV<String, Long> el = c.element();
+ String out = "user: " + el.getKey() + " value:" + el.getValue();
+ c.output(out);
+ }
+ }));
+
+ format.apply(TextIO.Write.to(resultPath));
+
+ p.run();
+ }
+}
[37/50] [abbrv] beam git commit: [BEAM-1994] Remove Flink examples
package
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/TestFlinkRunner.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/TestFlinkRunner.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/TestFlinkRunner.java
deleted file mode 100644
index 8f50105..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/TestFlinkRunner.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.Pipeline.PipelineExecutionException;
-import org.apache.beam.sdk.PipelineResult;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.options.PipelineOptionsFactory;
-import org.apache.beam.sdk.options.PipelineOptionsValidator;
-import org.apache.beam.sdk.runners.PipelineRunner;
-import org.apache.beam.sdk.util.UserCodeException;
-
-/**
- * Test Flink runner.
- */
-public class TestFlinkRunner extends PipelineRunner<PipelineResult> {
-
- private FlinkRunner delegate;
-
- private TestFlinkRunner(FlinkPipelineOptions options) {
- // We use [auto] for testing since this will make it pick up the Testing ExecutionEnvironment
- options.setFlinkMaster("[auto]");
- this.delegate = FlinkRunner.fromOptions(options);
- }
-
- public static TestFlinkRunner fromOptions(PipelineOptions options) {
- FlinkPipelineOptions flinkOptions =
- PipelineOptionsValidator.validate(FlinkPipelineOptions.class, options);
- return new TestFlinkRunner(flinkOptions);
- }
-
- public static TestFlinkRunner create(boolean streaming) {
- FlinkPipelineOptions flinkOptions = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
- flinkOptions.setRunner(TestFlinkRunner.class);
- flinkOptions.setStreaming(streaming);
- return TestFlinkRunner.fromOptions(flinkOptions);
- }
-
- @Override
- public PipelineResult run(Pipeline pipeline) {
- try {
- return delegate.run(pipeline);
- } catch (Throwable t) {
- // Special case hack to pull out assertion errors from PAssert; instead there should
- // probably be a better story along the lines of UserCodeException.
- UserCodeException innermostUserCodeException = null;
- Throwable current = t;
- for (; current.getCause() != null; current = current.getCause()) {
- if (current instanceof UserCodeException) {
- innermostUserCodeException = ((UserCodeException) current);
- }
- }
- if (innermostUserCodeException != null) {
- current = innermostUserCodeException.getCause();
- }
- if (current instanceof AssertionError) {
- throw (AssertionError) current;
- }
- throw new PipelineExecutionException(current);
- }
- }
-
- public PipelineOptions getPipelineOptions() {
- return delegate.getPipelineOptions();
- }
-}
-
-
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/TranslationMode.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/TranslationMode.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/TranslationMode.java
deleted file mode 100644
index ad54750..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/TranslationMode.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-/**
- * The translation mode of the Beam Pipeline.
- */
-enum TranslationMode {
-
- /** Uses the batch mode of Flink. */
- BATCH,
-
- /** Uses the streaming mode of Flink. */
- STREAMING
-
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/package-info.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/package-info.java
deleted file mode 100644
index 57f1e59..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Internal implementation of the Beam runner for Apache Flink.
- */
-package org.apache.beam.runners.flink;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAggregatorFactory.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAggregatorFactory.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAggregatorFactory.java
deleted file mode 100644
index fb2493b..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAggregatorFactory.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.functions;
-
-import org.apache.beam.runners.core.AggregatorFactory;
-import org.apache.beam.runners.core.ExecutionContext;
-import org.apache.beam.runners.flink.translation.wrappers.SerializableFnAggregatorWrapper;
-import org.apache.beam.sdk.transforms.Aggregator;
-import org.apache.beam.sdk.transforms.Combine;
-import org.apache.flink.api.common.functions.RuntimeContext;
-
-/**
- * A {@link AggregatorFactory} for the Flink Batch Runner.
- */
-public class FlinkAggregatorFactory implements AggregatorFactory{
-
- private final RuntimeContext runtimeContext;
-
- public FlinkAggregatorFactory(RuntimeContext runtimeContext) {
- this.runtimeContext = runtimeContext;
- }
-
- @Override
- public <InputT, AccumT, OutputT> Aggregator<InputT, OutputT> createAggregatorForDoFn(
- Class<?> fnClass, ExecutionContext.StepContext stepContext, String aggregatorName,
- Combine.CombineFn<InputT, AccumT, OutputT> combine) {
- @SuppressWarnings("unchecked")
- SerializableFnAggregatorWrapper<InputT, OutputT> result =
- (SerializableFnAggregatorWrapper<InputT, OutputT>)
- runtimeContext.getAccumulator(aggregatorName);
-
- if (result == null) {
- result = new SerializableFnAggregatorWrapper<>(combine);
- runtimeContext.addAccumulator(aggregatorName, result);
- }
- return result;
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAssignContext.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAssignContext.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAssignContext.java
deleted file mode 100644
index 447b1e5..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAssignContext.java
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.functions;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.google.common.collect.Iterables;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.WindowFn;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.joda.time.Instant;
-
-/**
- * {@link org.apache.beam.sdk.transforms.windowing.WindowFn.AssignContext} for
- * Flink functions.
- */
-class FlinkAssignContext<InputT, W extends BoundedWindow>
- extends WindowFn<InputT, W>.AssignContext {
- private final WindowedValue<InputT> value;
-
- FlinkAssignContext(WindowFn<InputT, W> fn, WindowedValue<InputT> value) {
- fn.super();
- checkArgument(
- Iterables.size(value.getWindows()) == 1,
- String.format(
- "%s passed to window assignment must be in a single window, but it was in %s: %s",
- WindowedValue.class.getSimpleName(),
- Iterables.size(value.getWindows()),
- value.getWindows()));
- this.value = value;
- }
-
- @Override
- public InputT element() {
- return value.getValue();
- }
-
- @Override
- public Instant timestamp() {
- return value.getTimestamp();
- }
-
- @Override
- public BoundedWindow window() {
- return Iterables.getOnlyElement(value.getWindows());
- }
-
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAssignWindows.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAssignWindows.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAssignWindows.java
deleted file mode 100644
index c3a5095..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAssignWindows.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.functions;
-
-import java.util.Collection;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.WindowFn;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.flink.api.common.functions.FlatMapFunction;
-import org.apache.flink.util.Collector;
-
-/**
- * Flink {@link FlatMapFunction} for implementing
- * {@link org.apache.beam.sdk.transforms.windowing.Window.Assign}.
- */
-public class FlinkAssignWindows<T, W extends BoundedWindow>
- implements FlatMapFunction<WindowedValue<T>, WindowedValue<T>> {
-
- private final WindowFn<T, W> windowFn;
-
- public FlinkAssignWindows(WindowFn<T, W> windowFn) {
- this.windowFn = windowFn;
- }
-
- @Override
- public void flatMap(
- WindowedValue<T> input, Collector<WindowedValue<T>> collector) throws Exception {
- Collection<W> windows = windowFn.assignWindows(new FlinkAssignContext<>(windowFn, input));
- for (W window: windows) {
- collector.collect(
- WindowedValue.of(input.getValue(), input.getTimestamp(), window, input.getPane()));
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkDoFnFunction.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkDoFnFunction.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkDoFnFunction.java
deleted file mode 100644
index 51582af..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkDoFnFunction.java
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.functions;
-
-import java.util.Collections;
-import java.util.Map;
-import org.apache.beam.runners.core.DoFnRunner;
-import org.apache.beam.runners.core.DoFnRunners;
-import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.join.RawUnionValue;
-import org.apache.beam.sdk.transforms.reflect.DoFnInvoker;
-import org.apache.beam.sdk.transforms.reflect.DoFnInvokers;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.util.WindowingStrategy;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.beam.sdk.values.TupleTag;
-import org.apache.flink.api.common.functions.RichMapPartitionFunction;
-import org.apache.flink.api.common.functions.RuntimeContext;
-import org.apache.flink.configuration.Configuration;
-import org.apache.flink.util.Collector;
-
-/**
- * Encapsulates a {@link DoFn}
- * inside a Flink {@link org.apache.flink.api.common.functions.RichMapPartitionFunction}.
- *
- * <p>We get a mapping from {@link org.apache.beam.sdk.values.TupleTag} to output index
- * and must tag all outputs with the output number. Afterwards a filter will filter out
- * those elements that are not to be in a specific output.
- */
-public class FlinkDoFnFunction<InputT, OutputT>
- extends RichMapPartitionFunction<WindowedValue<InputT>, WindowedValue<OutputT>> {
-
- private final SerializedPipelineOptions serializedOptions;
-
- private final DoFn<InputT, OutputT> doFn;
- private final Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs;
-
- private final WindowingStrategy<?, ?> windowingStrategy;
-
- private final Map<TupleTag<?>, Integer> outputMap;
- private final TupleTag<OutputT> mainOutputTag;
-
- private transient DoFnInvoker<InputT, OutputT> doFnInvoker;
-
- public FlinkDoFnFunction(
- DoFn<InputT, OutputT> doFn,
- WindowingStrategy<?, ?> windowingStrategy,
- Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs,
- PipelineOptions options,
- Map<TupleTag<?>, Integer> outputMap,
- TupleTag<OutputT> mainOutputTag) {
-
- this.doFn = doFn;
- this.sideInputs = sideInputs;
- this.serializedOptions = new SerializedPipelineOptions(options);
- this.windowingStrategy = windowingStrategy;
- this.outputMap = outputMap;
- this.mainOutputTag = mainOutputTag;
-
- }
-
- @Override
- public void mapPartition(
- Iterable<WindowedValue<InputT>> values,
- Collector<WindowedValue<OutputT>> out) throws Exception {
-
- RuntimeContext runtimeContext = getRuntimeContext();
-
- DoFnRunners.OutputManager outputManager;
- if (outputMap == null) {
- outputManager = new FlinkDoFnFunction.DoFnOutputManager(out);
- } else {
- // it has some additional outputs
- outputManager =
- new FlinkDoFnFunction.MultiDoFnOutputManager((Collector) out, outputMap);
- }
-
- DoFnRunner<InputT, OutputT> doFnRunner = DoFnRunners.simpleRunner(
- serializedOptions.getPipelineOptions(), doFn,
- new FlinkSideInputReader(sideInputs, runtimeContext),
- outputManager,
- mainOutputTag,
- // see SimpleDoFnRunner, just use it to limit number of additional outputs
- Collections.<TupleTag<?>>emptyList(),
- new FlinkNoOpStepContext(),
- new FlinkAggregatorFactory(runtimeContext),
- windowingStrategy);
-
- doFnRunner.startBundle();
-
- for (WindowedValue<InputT> value : values) {
- doFnRunner.processElement(value);
- }
-
- doFnRunner.finishBundle();
- }
-
- @Override
- public void open(Configuration parameters) throws Exception {
- doFnInvoker = DoFnInvokers.invokerFor(doFn);
- doFnInvoker.invokeSetup();
- }
-
- @Override
- public void close() throws Exception {
- doFnInvoker.invokeTeardown();
- }
-
- static class DoFnOutputManager
- implements DoFnRunners.OutputManager {
-
- private Collector collector;
-
- DoFnOutputManager(Collector collector) {
- this.collector = collector;
- }
-
- @Override
- @SuppressWarnings("unchecked")
- public <T> void output(TupleTag<T> tag, WindowedValue<T> output) {
- collector.collect(output);
- }
- }
-
- static class MultiDoFnOutputManager
- implements DoFnRunners.OutputManager {
-
- private Collector<WindowedValue<RawUnionValue>> collector;
- private Map<TupleTag<?>, Integer> outputMap;
-
- MultiDoFnOutputManager(Collector<WindowedValue<RawUnionValue>> collector,
- Map<TupleTag<?>, Integer> outputMap) {
- this.collector = collector;
- this.outputMap = outputMap;
- }
-
- @Override
- public <T> void output(TupleTag<T> tag, WindowedValue<T> output) {
- collector.collect(WindowedValue.of(new RawUnionValue(outputMap.get(tag), output.getValue()),
- output.getTimestamp(), output.getWindows(), output.getPane()));
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingNonShuffleReduceFunction.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingNonShuffleReduceFunction.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingNonShuffleReduceFunction.java
deleted file mode 100644
index 26fd0b4..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingNonShuffleReduceFunction.java
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.functions;
-
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import org.apache.beam.runners.core.PerKeyCombineFnRunner;
-import org.apache.beam.runners.core.PerKeyCombineFnRunners;
-import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.transforms.CombineFnBase;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
-import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
-import org.apache.beam.sdk.transforms.windowing.PaneInfo;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.util.WindowingStrategy;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.flink.api.common.functions.RichGroupReduceFunction;
-import org.apache.flink.util.Collector;
-import org.joda.time.Instant;
-
-/**
- * Special version of {@link FlinkReduceFunction} that supports merging windows. This
- * assumes that the windows are {@link IntervalWindow IntervalWindows} and exhibits the
- * same behaviour as {@code MergeOverlappingIntervalWindows}.
- *
- * <p>This is different from the pair of function for the non-merging windows case
- * in that we cannot do combining before the shuffle because elements would not
- * yet be in their correct windows for side-input access.
- */
-public class FlinkMergingNonShuffleReduceFunction<
- K, InputT, AccumT, OutputT, W extends IntervalWindow>
- extends RichGroupReduceFunction<WindowedValue<KV<K, InputT>>, WindowedValue<KV<K, OutputT>>> {
-
- private final CombineFnBase.PerKeyCombineFn<K, InputT, AccumT, OutputT> combineFn;
-
- private final WindowingStrategy<?, W> windowingStrategy;
-
- private final Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs;
-
- private final SerializedPipelineOptions serializedOptions;
-
- public FlinkMergingNonShuffleReduceFunction(
- CombineFnBase.PerKeyCombineFn<K, InputT, AccumT, OutputT> keyedCombineFn,
- WindowingStrategy<?, W> windowingStrategy,
- Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs,
- PipelineOptions pipelineOptions) {
-
- this.combineFn = keyedCombineFn;
-
- this.windowingStrategy = windowingStrategy;
- this.sideInputs = sideInputs;
-
- this.serializedOptions = new SerializedPipelineOptions(pipelineOptions);
-
- }
-
- @Override
- public void reduce(
- Iterable<WindowedValue<KV<K, InputT>>> elements,
- Collector<WindowedValue<KV<K, OutputT>>> out) throws Exception {
-
- PipelineOptions options = serializedOptions.getPipelineOptions();
-
- FlinkSideInputReader sideInputReader =
- new FlinkSideInputReader(sideInputs, getRuntimeContext());
-
- PerKeyCombineFnRunner<K, InputT, AccumT, OutputT> combineFnRunner =
- PerKeyCombineFnRunners.create(combineFn);
-
- @SuppressWarnings("unchecked")
- OutputTimeFn<? super BoundedWindow> outputTimeFn =
- (OutputTimeFn<? super BoundedWindow>) windowingStrategy.getOutputTimeFn();
-
- // get all elements so that we can sort them, has to fit into
- // memory
- // this seems very unprudent, but correct, for now
- List<WindowedValue<KV<K, InputT>>> sortedInput = Lists.newArrayList();
- for (WindowedValue<KV<K, InputT>> inputValue : elements) {
- for (WindowedValue<KV<K, InputT>> exploded : inputValue.explodeWindows()) {
- sortedInput.add(exploded);
- }
- }
- Collections.sort(sortedInput, new Comparator<WindowedValue<KV<K, InputT>>>() {
- @Override
- public int compare(
- WindowedValue<KV<K, InputT>> o1,
- WindowedValue<KV<K, InputT>> o2) {
- return Iterables.getOnlyElement(o1.getWindows()).maxTimestamp()
- .compareTo(Iterables.getOnlyElement(o2.getWindows()).maxTimestamp());
- }
- });
-
- // merge windows, we have to do it in an extra pre-processing step and
- // can't do it as we go since the window of early elements would not
- // be correct when calling the CombineFn
- mergeWindow(sortedInput);
-
- // iterate over the elements that are sorted by window timestamp
- final Iterator<WindowedValue<KV<K, InputT>>> iterator = sortedInput.iterator();
-
- // create accumulator using the first elements key
- WindowedValue<KV<K, InputT>> currentValue = iterator.next();
- K key = currentValue.getValue().getKey();
- IntervalWindow currentWindow =
- (IntervalWindow) Iterables.getOnlyElement(currentValue.getWindows());
- InputT firstValue = currentValue.getValue().getValue();
- AccumT accumulator =
- combineFnRunner.createAccumulator(key, options, sideInputReader, currentValue.getWindows());
- accumulator = combineFnRunner.addInput(key, accumulator, firstValue,
- options, sideInputReader, currentValue.getWindows());
-
- // we use this to keep track of the timestamps assigned by the OutputTimeFn
- Instant windowTimestamp =
- outputTimeFn.assignOutputTime(currentValue.getTimestamp(), currentWindow);
-
- while (iterator.hasNext()) {
- WindowedValue<KV<K, InputT>> nextValue = iterator.next();
- IntervalWindow nextWindow =
- (IntervalWindow) Iterables.getOnlyElement(nextValue.getWindows());
-
- if (currentWindow.equals(nextWindow)) {
- // continue accumulating and merge windows
-
- InputT value = nextValue.getValue().getValue();
- accumulator = combineFnRunner.addInput(key, accumulator, value,
- options, sideInputReader, currentValue.getWindows());
-
- windowTimestamp = outputTimeFn.combine(
- windowTimestamp,
- outputTimeFn.assignOutputTime(nextValue.getTimestamp(), currentWindow));
-
- } else {
- // emit the value that we currently have
- out.collect(
- WindowedValue.of(
- KV.of(key, combineFnRunner.extractOutput(key, accumulator,
- options, sideInputReader, currentValue.getWindows())),
- windowTimestamp,
- currentWindow,
- PaneInfo.NO_FIRING));
-
- currentWindow = nextWindow;
- currentValue = nextValue;
- InputT value = nextValue.getValue().getValue();
- accumulator = combineFnRunner.createAccumulator(key,
- options, sideInputReader, currentValue.getWindows());
- accumulator = combineFnRunner.addInput(key, accumulator, value,
- options, sideInputReader, currentValue.getWindows());
- windowTimestamp = outputTimeFn.assignOutputTime(nextValue.getTimestamp(), currentWindow);
- }
-
- }
-
- // emit the final accumulator
- out.collect(
- WindowedValue.of(
- KV.of(key, combineFnRunner.extractOutput(key, accumulator,
- options, sideInputReader, currentValue.getWindows())),
- windowTimestamp,
- currentWindow,
- PaneInfo.NO_FIRING));
- }
-
- /**
- * Merge windows. This assumes that the list of elements is sorted by window-end timestamp.
- * This replaces windows in the input list.
- */
- private void mergeWindow(List<WindowedValue<KV<K, InputT>>> elements) {
- int currentStart = 0;
- IntervalWindow currentWindow =
- (IntervalWindow) Iterables.getOnlyElement(elements.get(0).getWindows());
-
- for (int i = 1; i < elements.size(); i++) {
- WindowedValue<KV<K, InputT>> nextValue = elements.get(i);
- IntervalWindow nextWindow =
- (IntervalWindow) Iterables.getOnlyElement(nextValue.getWindows());
- if (currentWindow.intersects(nextWindow)) {
- // we continue
- currentWindow = currentWindow.span(nextWindow);
- } else {
- // retrofit the merged window to all windows up to "currentStart"
- for (int j = i - 1; j >= currentStart; j--) {
- WindowedValue<KV<K, InputT>> value = elements.get(j);
- elements.set(
- j,
- WindowedValue.of(
- value.getValue(), value.getTimestamp(), currentWindow, value.getPane()));
- }
- currentStart = i;
- currentWindow = nextWindow;
- }
- }
- if (currentStart < elements.size() - 1) {
- // we have to retrofit the last batch
- for (int j = elements.size() - 1; j >= currentStart; j--) {
- WindowedValue<KV<K, InputT>> value = elements.get(j);
- elements.set(
- j,
- WindowedValue.of(
- value.getValue(), value.getTimestamp(), currentWindow, value.getPane()));
- }
- }
- }
-
-}
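The mergeWindow() helper above collapses runs of overlapping interval windows in a list that is sorted by window-end timestamp, then stamps the merged window back onto every element of the run. A minimal standalone sketch of that idea follows, assuming a plain Interval placeholder (not Beam's IntervalWindow) and always retrofitting the final run:

import java.util.ArrayList;
import java.util.List;

public class MergeOverlappingSketch {
  static final class Interval {
    final long start, end; // end-exclusive
    Interval(long start, long end) { this.start = start; this.end = end; }
    boolean intersects(Interval other) { return start < other.end && other.start < end; }
    Interval span(Interval other) {
      return new Interval(Math.min(start, other.start), Math.max(end, other.end));
    }
    @Override public String toString() { return "[" + start + ", " + end + ")"; }
  }

  /** Assumes {@code windows} is sorted by end timestamp; rewrites entries in place. */
  static void mergeInPlace(List<Interval> windows) {
    int currentStart = 0;
    Interval current = windows.get(0);
    for (int i = 1; i < windows.size(); i++) {
      Interval next = windows.get(i);
      if (current.intersects(next)) {
        current = current.span(next);            // keep growing the merged window
      } else {
        for (int j = currentStart; j < i; j++) { // retrofit merged window onto the run
          windows.set(j, current);
        }
        currentStart = i;
        current = next;
      }
    }
    for (int j = currentStart; j < windows.size(); j++) { // retrofit the last run
      windows.set(j, current);
    }
  }

  public static void main(String[] args) {
    List<Interval> windows = new ArrayList<>();
    windows.add(new Interval(0, 10));
    windows.add(new Interval(5, 15));   // overlaps the first -> merged to [0, 15)
    windows.add(new Interval(20, 30));  // disjoint -> starts a new run
    mergeInPlace(windows);
    System.out.println(windows);        // [[0, 15), [0, 15), [20, 30)]
  }
}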
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingPartialReduceFunction.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingPartialReduceFunction.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingPartialReduceFunction.java
deleted file mode 100644
index c68f155..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingPartialReduceFunction.java
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.functions;
-
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import org.apache.beam.runners.core.PerKeyCombineFnRunner;
-import org.apache.beam.runners.core.PerKeyCombineFnRunners;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.transforms.CombineFnBase;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
-import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
-import org.apache.beam.sdk.transforms.windowing.PaneInfo;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.util.WindowingStrategy;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.flink.util.Collector;
-import org.joda.time.Instant;
-
-/**
- * Special version of {@link FlinkPartialReduceFunction} that supports merging windows. This
- * assumes that the windows are {@link IntervalWindow IntervalWindows} and exhibits the
- * same behaviour as {@code MergeOverlappingIntervalWindows}.
- */
-public class FlinkMergingPartialReduceFunction<K, InputT, AccumT, W extends IntervalWindow>
- extends FlinkPartialReduceFunction<K, InputT, AccumT, W> {
-
- public FlinkMergingPartialReduceFunction(
- CombineFnBase.PerKeyCombineFn<K, InputT, AccumT, ?> combineFn,
- WindowingStrategy<?, W> windowingStrategy,
- Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs,
- PipelineOptions pipelineOptions) {
- super(combineFn, windowingStrategy, sideInputs, pipelineOptions);
- }
-
- @Override
- public void combine(
- Iterable<WindowedValue<KV<K, InputT>>> elements,
- Collector<WindowedValue<KV<K, AccumT>>> out) throws Exception {
-
- PipelineOptions options = serializedOptions.getPipelineOptions();
-
- FlinkSideInputReader sideInputReader =
- new FlinkSideInputReader(sideInputs, getRuntimeContext());
-
- PerKeyCombineFnRunner<K, InputT, AccumT, ?> combineFnRunner =
- PerKeyCombineFnRunners.create(combineFn);
-
- @SuppressWarnings("unchecked")
- OutputTimeFn<? super BoundedWindow> outputTimeFn =
- (OutputTimeFn<? super BoundedWindow>) windowingStrategy.getOutputTimeFn();
-
- // get all elements so that we can sort them, has to fit into
- // memory
- // this seems very imprudent, but correct, for now
- List<WindowedValue<KV<K, InputT>>> sortedInput = Lists.newArrayList();
- for (WindowedValue<KV<K, InputT>> inputValue : elements) {
- for (WindowedValue<KV<K, InputT>> exploded : inputValue.explodeWindows()) {
- sortedInput.add(exploded);
- }
- }
- Collections.sort(sortedInput, new Comparator<WindowedValue<KV<K, InputT>>>() {
- @Override
- public int compare(
- WindowedValue<KV<K, InputT>> o1,
- WindowedValue<KV<K, InputT>> o2) {
- return Iterables.getOnlyElement(o1.getWindows()).maxTimestamp()
- .compareTo(Iterables.getOnlyElement(o2.getWindows()).maxTimestamp());
- }
- });
-
- // merge windows, we have to do it in an extra pre-processing step and
- // can't do it as we go since the window of early elements would not
- // be correct when calling the CombineFn
- mergeWindow(sortedInput);
-
- // iterate over the elements that are sorted by window timestamp
- final Iterator<WindowedValue<KV<K, InputT>>> iterator = sortedInput.iterator();
-
- // create accumulator using the first elements key
- WindowedValue<KV<K, InputT>> currentValue = iterator.next();
- K key = currentValue.getValue().getKey();
- IntervalWindow currentWindow =
- (IntervalWindow) Iterables.getOnlyElement(currentValue.getWindows());
- InputT firstValue = currentValue.getValue().getValue();
- AccumT accumulator = combineFnRunner.createAccumulator(key,
- options, sideInputReader, currentValue.getWindows());
- accumulator = combineFnRunner.addInput(key, accumulator, firstValue,
- options, sideInputReader, currentValue.getWindows());
-
- // we use this to keep track of the timestamps assigned by the OutputTimeFn
- Instant windowTimestamp =
- outputTimeFn.assignOutputTime(currentValue.getTimestamp(), currentWindow);
-
- while (iterator.hasNext()) {
- WindowedValue<KV<K, InputT>> nextValue = iterator.next();
- IntervalWindow nextWindow = (IntervalWindow) Iterables.getOnlyElement(nextValue.getWindows());
-
- if (currentWindow.equals(nextWindow)) {
- // continue accumulating and merge windows
-
- InputT value = nextValue.getValue().getValue();
- accumulator = combineFnRunner.addInput(key, accumulator, value,
- options, sideInputReader, currentValue.getWindows());
-
- windowTimestamp = outputTimeFn.combine(
- windowTimestamp,
- outputTimeFn.assignOutputTime(nextValue.getTimestamp(), currentWindow));
-
- } else {
- // emit the value that we currently have
- out.collect(
- WindowedValue.of(
- KV.of(key, accumulator),
- windowTimestamp,
- currentWindow,
- PaneInfo.NO_FIRING));
-
- currentWindow = nextWindow;
- currentValue = nextValue;
- InputT value = nextValue.getValue().getValue();
- accumulator = combineFnRunner.createAccumulator(key,
- options, sideInputReader, currentValue.getWindows());
- accumulator = combineFnRunner.addInput(key, accumulator, value,
- options, sideInputReader, currentValue.getWindows());
- windowTimestamp = outputTimeFn.assignOutputTime(nextValue.getTimestamp(), currentWindow);
- }
- }
-
- // emit the final accumulator
- out.collect(
- WindowedValue.of(
- KV.of(key, accumulator),
- windowTimestamp,
- currentWindow,
- PaneInfo.NO_FIRING));
- }
-
- /**
- * Merge windows. This assumes that the list of elements is sorted by window-end timestamp.
- * This replaces windows in the input list.
- */
- private void mergeWindow(List<WindowedValue<KV<K, InputT>>> elements) {
- int currentStart = 0;
- IntervalWindow currentWindow =
- (IntervalWindow) Iterables.getOnlyElement(elements.get(0).getWindows());
-
- for (int i = 1; i < elements.size(); i++) {
- WindowedValue<KV<K, InputT>> nextValue = elements.get(i);
- IntervalWindow nextWindow =
- (IntervalWindow) Iterables.getOnlyElement(nextValue.getWindows());
- if (currentWindow.intersects(nextWindow)) {
- // we continue
- currentWindow = currentWindow.span(nextWindow);
- } else {
- // retrofit the merged window to all windows up to "currentStart"
- for (int j = i - 1; j >= currentStart; j--) {
- WindowedValue<KV<K, InputT>> value = elements.get(j);
- elements.set(
- j,
- WindowedValue.of(
- value.getValue(), value.getTimestamp(), currentWindow, value.getPane()));
- }
- currentStart = i;
- currentWindow = nextWindow;
- }
- }
- if (currentStart < elements.size() - 1) {
- // we have to retrofit the last batch
- for (int j = elements.size() - 1; j >= currentStart; j--) {
- WindowedValue<KV<K, InputT>> value = elements.get(j);
- elements.set(
- j,
- WindowedValue.of(
- value.getValue(), value.getTimestamp(), currentWindow, value.getPane()));
- }
- }
- }
-}
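The combine(...) loop in FlinkMergingPartialReduceFunction relies on the input being sorted so that values of the same window are adjacent: it keeps one accumulator per run of equal windows and emits whenever the window changes. A minimal sketch of that emit-on-window-change pattern, with string window labels and an integer sum standing in for IntervalWindow and the user's CombineFn:

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class PerWindowCombineSketch {
  /** Assumes parallel lists, non-empty, with equal window labels adjacent. */
  static Map<String, Integer> combineSorted(List<String> windows, List<Integer> values) {
    Map<String, Integer> out = new LinkedHashMap<>();
    String currentWindow = windows.get(0);
    int accumulator = values.get(0);
    for (int i = 1; i < windows.size(); i++) {
      if (windows.get(i).equals(currentWindow)) {
        accumulator += values.get(i);          // same window: keep accumulating
      } else {
        out.put(currentWindow, accumulator);   // window changed: emit and reset
        currentWindow = windows.get(i);
        accumulator = values.get(i);
      }
    }
    out.put(currentWindow, accumulator);       // emit the final accumulator
    return out;
  }

  public static void main(String[] args) {
    List<String> windows = List.of("w1", "w1", "w2", "w2", "w2");
    List<Integer> values = List.of(1, 2, 3, 4, 5);
    System.out.println(combineSorted(windows, values)); // {w1=3, w2=12}
  }
}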
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingReduceFunction.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingReduceFunction.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingReduceFunction.java
deleted file mode 100644
index 84b3adc..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingReduceFunction.java
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.functions;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import org.apache.beam.runners.core.PerKeyCombineFnRunner;
-import org.apache.beam.runners.core.PerKeyCombineFnRunners;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.transforms.CombineFnBase;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
-import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
-import org.apache.beam.sdk.transforms.windowing.PaneInfo;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.util.WindowingStrategy;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.flink.util.Collector;
-import org.joda.time.Instant;
-
-/**
- * Special version of {@link FlinkReduceFunction} that supports merging windows. This
- * assumes that the windows are {@link IntervalWindow IntervalWindows} and exhibits the
- * same behaviour as {@code MergeOverlappingIntervalWindows}.
- */
-public class FlinkMergingReduceFunction<K, AccumT, OutputT, W extends IntervalWindow>
- extends FlinkReduceFunction<K, AccumT, OutputT, W> {
-
- public FlinkMergingReduceFunction(
- CombineFnBase.PerKeyCombineFn<K, ?, AccumT, OutputT> keyedCombineFn,
- WindowingStrategy<?, W> windowingStrategy,
- Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs,
- PipelineOptions pipelineOptions) {
- super(keyedCombineFn, windowingStrategy, sideInputs, pipelineOptions);
- }
-
- @Override
- public void reduce(
- Iterable<WindowedValue<KV<K, AccumT>>> elements,
- Collector<WindowedValue<KV<K, OutputT>>> out) throws Exception {
-
- PipelineOptions options = serializedOptions.getPipelineOptions();
-
- FlinkSideInputReader sideInputReader =
- new FlinkSideInputReader(sideInputs, getRuntimeContext());
-
- PerKeyCombineFnRunner<K, ?, AccumT, OutputT> combineFnRunner =
- PerKeyCombineFnRunners.create(combineFn);
-
- @SuppressWarnings("unchecked")
- OutputTimeFn<? super BoundedWindow> outputTimeFn =
- (OutputTimeFn<? super BoundedWindow>) windowingStrategy.getOutputTimeFn();
-
- // get all elements so that we can sort them, has to fit into
- // memory
- // this seems very imprudent, but correct, for now
- ArrayList<WindowedValue<KV<K, AccumT>>> sortedInput = Lists.newArrayList();
- for (WindowedValue<KV<K, AccumT>> inputValue : elements) {
- for (WindowedValue<KV<K, AccumT>> exploded : inputValue.explodeWindows()) {
- sortedInput.add(exploded);
- }
- }
- Collections.sort(sortedInput, new Comparator<WindowedValue<KV<K, AccumT>>>() {
- @Override
- public int compare(
- WindowedValue<KV<K, AccumT>> o1,
- WindowedValue<KV<K, AccumT>> o2) {
- return Iterables.getOnlyElement(o1.getWindows()).maxTimestamp()
- .compareTo(Iterables.getOnlyElement(o2.getWindows()).maxTimestamp());
- }
- });
-
- // merge windows, we have to do it in an extra pre-processing step and
- // can't do it as we go since the window of early elements would not
- // be correct when calling the CombineFn
- mergeWindow(sortedInput);
-
- // iterate over the elements that are sorted by window timestamp
- final Iterator<WindowedValue<KV<K, AccumT>>> iterator = sortedInput.iterator();
-
- // get the first accumulator
- WindowedValue<KV<K, AccumT>> currentValue = iterator.next();
- K key = currentValue.getValue().getKey();
- IntervalWindow currentWindow =
- (IntervalWindow) Iterables.getOnlyElement(currentValue.getWindows());
- AccumT accumulator = currentValue.getValue().getValue();
-
- // we use this to keep track of the timestamps assigned by the OutputTimeFn,
- // in FlinkPartialReduceFunction we already merge the timestamps assigned
- // to individual elements, here we just merge them
- List<Instant> windowTimestamps = new ArrayList<>();
- windowTimestamps.add(currentValue.getTimestamp());
-
- while (iterator.hasNext()) {
- WindowedValue<KV<K, AccumT>> nextValue = iterator.next();
- IntervalWindow nextWindow =
- (IntervalWindow) Iterables.getOnlyElement(nextValue.getWindows());
-
- if (nextWindow.equals(currentWindow)) {
- // continue accumulating and merge windows
-
- accumulator = combineFnRunner.mergeAccumulators(
- key, ImmutableList.of(accumulator, nextValue.getValue().getValue()),
- options, sideInputReader, currentValue.getWindows());
-
- windowTimestamps.add(nextValue.getTimestamp());
- } else {
- out.collect(
- WindowedValue.of(
- KV.of(key, combineFnRunner.extractOutput(key, accumulator,
- options, sideInputReader, currentValue.getWindows())),
- outputTimeFn.merge(currentWindow, windowTimestamps),
- currentWindow,
- PaneInfo.NO_FIRING));
-
- windowTimestamps.clear();
-
- currentWindow = nextWindow;
- currentValue = nextValue;
- accumulator = nextValue.getValue().getValue();
- windowTimestamps.add(nextValue.getTimestamp());
- }
- }
-
- // emit the final accumulator
- out.collect(
- WindowedValue.of(
- KV.of(key, combineFnRunner.extractOutput(key, accumulator,
- options, sideInputReader, currentValue.getWindows())),
- outputTimeFn.merge(currentWindow, windowTimestamps),
- currentWindow,
- PaneInfo.NO_FIRING));
- }
-
- /**
- * Merge windows. This assumes that the list of elements is sorted by window-end timestamp.
- * This replaces windows in the input list.
- */
- private void mergeWindow(List<WindowedValue<KV<K, AccumT>>> elements) {
- int currentStart = 0;
- IntervalWindow currentWindow =
- (IntervalWindow) Iterables.getOnlyElement(elements.get(0).getWindows());
-
- for (int i = 1; i < elements.size(); i++) {
- WindowedValue<KV<K, AccumT>> nextValue = elements.get(i);
- IntervalWindow nextWindow =
- (IntervalWindow) Iterables.getOnlyElement(nextValue.getWindows());
- if (currentWindow.intersects(nextWindow)) {
- // we continue
- currentWindow = currentWindow.span(nextWindow);
- } else {
- // retrofit the merged window to all windows up to "currentStart"
- for (int j = i - 1; j >= currentStart; j--) {
- WindowedValue<KV<K, AccumT>> value = elements.get(j);
- elements.set(
- j,
- WindowedValue.of(
- value.getValue(), value.getTimestamp(), currentWindow, value.getPane()));
- }
- currentStart = i;
- currentWindow = nextWindow;
- }
- }
- if (currentStart < elements.size() - 1) {
- // we have to retrofit the last batch
- for (int j = elements.size() - 1; j >= currentStart; j--) {
- WindowedValue<KV<K, AccumT>> value = elements.get(j);
- elements.set(
- j,
- WindowedValue.of(
- value.getValue(), value.getTimestamp(), currentWindow, value.getPane()));
- }
- }
- }
-
-}
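FlinkMergingReduceFunction receives pre-combined accumulators, so per window it only needs mergeAccumulators followed by extractOutput. A hedged sketch of that final stage, with a (sum, count) mean accumulator standing in for the user's CombineFn and the PerKeyCombineFnRunner calls:

import java.util.List;

public class FinalReduceSketch {
  static final class SumCount {
    final long sum, count;
    SumCount(long sum, long count) { this.sum = sum; this.count = count; }
  }

  /** Merge pre-combined accumulators for one key and one window. */
  static SumCount mergeAccumulators(List<SumCount> partials) {
    long sum = 0, count = 0;
    for (SumCount p : partials) { sum += p.sum; count += p.count; }
    return new SumCount(sum, count);
  }

  /** Extract the final output (here: the mean) from the merged accumulator. */
  static double extractOutput(SumCount acc) {
    return acc.count == 0 ? 0.0 : (double) acc.sum / acc.count;
  }

  public static void main(String[] args) {
    // Two partial accumulators produced before the shuffle, merged afterwards.
    List<SumCount> partials = List.of(new SumCount(10, 2), new SumCount(5, 3));
    System.out.println(extractOutput(mergeAccumulators(partials))); // 3.0
  }
}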
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMultiOutputPruningFunction.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMultiOutputPruningFunction.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMultiOutputPruningFunction.java
deleted file mode 100644
index 9071cc5..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMultiOutputPruningFunction.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.functions;
-
-import org.apache.beam.sdk.transforms.join.RawUnionValue;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.flink.api.common.functions.FlatMapFunction;
-import org.apache.flink.util.Collector;
-
-/**
- * A {@link FlatMapFunction} function that filters out those elements that don't belong in this
- * output. We need this to implement MultiOutput ParDo functions in combination with
- * {@link FlinkDoFnFunction}.
- */
-public class FlinkMultiOutputPruningFunction<T>
- implements FlatMapFunction<WindowedValue<RawUnionValue>, WindowedValue<T>> {
-
- private final int ourOutputTag;
-
- public FlinkMultiOutputPruningFunction(int ourOutputTag) {
- this.ourOutputTag = ourOutputTag;
- }
-
- @Override
- @SuppressWarnings("unchecked")
- public void flatMap(
- WindowedValue<RawUnionValue> windowedValue,
- Collector<WindowedValue<T>> collector) throws Exception {
- int unionTag = windowedValue.getValue().getUnionTag();
- if (unionTag == ourOutputTag) {
- collector.collect(
- (WindowedValue<T>) windowedValue.withValue(windowedValue.getValue().getValue()));
- }
- }
-}
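FlinkMultiOutputPruningFunction keeps only the values whose union tag matches the output it serves. A small sketch of that tag-based filtering, using a TaggedValue placeholder instead of RawUnionValue:

import java.util.List;
import java.util.function.Consumer;

public class OutputPruningSketch {
  static final class TaggedValue {
    final int unionTag; final Object value;
    TaggedValue(int unionTag, Object value) { this.unionTag = unionTag; this.value = value; }
  }

  /** Forward only the values whose tag matches {@code ourOutputTag}. */
  static void prune(List<TaggedValue> input, int ourOutputTag, Consumer<Object> collector) {
    for (TaggedValue v : input) {
      if (v.unionTag == ourOutputTag) {
        collector.accept(v.value);
      }
    }
  }

  public static void main(String[] args) {
    List<TaggedValue> mixed = List.of(
        new TaggedValue(0, "main"), new TaggedValue(1, "side"), new TaggedValue(0, "main2"));
    prune(mixed, 0, System.out::println); // prints: main, main2
  }
}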
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkNoOpStepContext.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkNoOpStepContext.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkNoOpStepContext.java
deleted file mode 100644
index 847a00a..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkNoOpStepContext.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.functions;
-
-import java.io.IOException;
-import org.apache.beam.runners.core.ExecutionContext.StepContext;
-import org.apache.beam.runners.core.StateInternals;
-import org.apache.beam.runners.core.TimerInternals;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.values.TupleTag;
-
-/**
- * A {@link StepContext} for Flink Batch Runner execution.
- */
-public class FlinkNoOpStepContext implements StepContext {
-
- @Override
- public String getStepName() {
- return null;
- }
-
- @Override
- public String getTransformName() {
- return null;
- }
-
- @Override
- public void noteOutput(WindowedValue<?> output) {
-
- }
-
- @Override
- public void noteOutput(TupleTag<?> tag, WindowedValue<?> output) {
-
- }
-
- @Override
- public <T, W extends BoundedWindow> void writePCollectionViewData(
- TupleTag<?> tag,
- Iterable<WindowedValue<T>> data,
- Coder<Iterable<WindowedValue<T>>> dataCoder,
- W window,
- Coder<W> windowCoder) throws IOException {
- }
-
- @Override
- public StateInternals<?> stateInternals() {
- return null;
- }
-
- @Override
- public TimerInternals timerInternals() {
- return null;
- }
-}
-
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkPartialReduceFunction.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkPartialReduceFunction.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkPartialReduceFunction.java
deleted file mode 100644
index 1d1ff9f..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkPartialReduceFunction.java
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.functions;
-
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.Map;
-import org.apache.beam.runners.core.PerKeyCombineFnRunner;
-import org.apache.beam.runners.core.PerKeyCombineFnRunners;
-import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.transforms.CombineFnBase;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
-import org.apache.beam.sdk.transforms.windowing.PaneInfo;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.util.WindowingStrategy;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.flink.api.common.functions.RichGroupCombineFunction;
-import org.apache.flink.util.Collector;
-import org.joda.time.Instant;
-
-/**
- * This is the first step for executing a {@link org.apache.beam.sdk.transforms.Combine.PerKey}
- * on Flink. The second part is {@link FlinkReduceFunction}. This function performs a local
- * combine step before shuffling while the latter does the final combination after a shuffle.
- *
- * <p>The input to {@link #combine(Iterable, Collector)} are elements of the same key but
- * for different windows. We have to ensure that we only combine elements of matching
- * windows.
- */
-public class FlinkPartialReduceFunction<K, InputT, AccumT, W extends BoundedWindow>
- extends RichGroupCombineFunction<WindowedValue<KV<K, InputT>>, WindowedValue<KV<K, AccumT>>> {
-
- protected final CombineFnBase.PerKeyCombineFn<K, InputT, AccumT, ?> combineFn;
-
- protected final WindowingStrategy<?, W> windowingStrategy;
-
- protected final SerializedPipelineOptions serializedOptions;
-
- protected final Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs;
-
- public FlinkPartialReduceFunction(
- CombineFnBase.PerKeyCombineFn<K, InputT, AccumT, ?> combineFn,
- WindowingStrategy<?, W> windowingStrategy,
- Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs,
- PipelineOptions pipelineOptions) {
-
- this.combineFn = combineFn;
- this.windowingStrategy = windowingStrategy;
- this.sideInputs = sideInputs;
- this.serializedOptions = new SerializedPipelineOptions(pipelineOptions);
-
- }
-
- @Override
- public void combine(
- Iterable<WindowedValue<KV<K, InputT>>> elements,
- Collector<WindowedValue<KV<K, AccumT>>> out) throws Exception {
-
- PipelineOptions options = serializedOptions.getPipelineOptions();
-
- FlinkSideInputReader sideInputReader =
- new FlinkSideInputReader(sideInputs, getRuntimeContext());
-
- PerKeyCombineFnRunner<K, InputT, AccumT, ?> combineFnRunner =
- PerKeyCombineFnRunners.create(combineFn);
-
- @SuppressWarnings("unchecked")
- OutputTimeFn<? super BoundedWindow> outputTimeFn =
- (OutputTimeFn<? super BoundedWindow>) windowingStrategy.getOutputTimeFn();
-
- // get all elements so that we can sort them, has to fit into
- // memory
- // this seems very imprudent, but correct, for now
- ArrayList<WindowedValue<KV<K, InputT>>> sortedInput = Lists.newArrayList();
- for (WindowedValue<KV<K, InputT>> inputValue : elements) {
- for (WindowedValue<KV<K, InputT>> exploded : inputValue.explodeWindows()) {
- sortedInput.add(exploded);
- }
- }
- Collections.sort(sortedInput, new Comparator<WindowedValue<KV<K, InputT>>>() {
- @Override
- public int compare(
- WindowedValue<KV<K, InputT>> o1,
- WindowedValue<KV<K, InputT>> o2) {
- return Iterables.getOnlyElement(o1.getWindows()).maxTimestamp()
- .compareTo(Iterables.getOnlyElement(o2.getWindows()).maxTimestamp());
- }
- });
-
- // iterate over the elements that are sorted by window timestamp
- //
- final Iterator<WindowedValue<KV<K, InputT>>> iterator = sortedInput.iterator();
-
- // create accumulator using the first elements key
- WindowedValue<KV<K, InputT>> currentValue = iterator.next();
- K key = currentValue.getValue().getKey();
- BoundedWindow currentWindow = Iterables.getFirst(currentValue.getWindows(), null);
- InputT firstValue = currentValue.getValue().getValue();
- AccumT accumulator = combineFnRunner.createAccumulator(key,
- options, sideInputReader, currentValue.getWindows());
- accumulator = combineFnRunner.addInput(key, accumulator, firstValue,
- options, sideInputReader, currentValue.getWindows());
-
- // we use this to keep track of the timestamps assigned by the OutputTimeFn
- Instant windowTimestamp =
- outputTimeFn.assignOutputTime(currentValue.getTimestamp(), currentWindow);
-
- while (iterator.hasNext()) {
- WindowedValue<KV<K, InputT>> nextValue = iterator.next();
- BoundedWindow nextWindow = Iterables.getOnlyElement(nextValue.getWindows());
-
- if (nextWindow.equals(currentWindow)) {
- // continue accumulating
- InputT value = nextValue.getValue().getValue();
- accumulator = combineFnRunner.addInput(key, accumulator, value,
- options, sideInputReader, currentValue.getWindows());
-
- windowTimestamp = outputTimeFn.combine(
- windowTimestamp,
- outputTimeFn.assignOutputTime(nextValue.getTimestamp(), currentWindow));
-
- } else {
- // emit the value that we currently have
- out.collect(
- WindowedValue.of(
- KV.of(key, accumulator),
- windowTimestamp,
- currentWindow,
- PaneInfo.NO_FIRING));
-
- currentWindow = nextWindow;
- currentValue = nextValue;
- InputT value = nextValue.getValue().getValue();
- accumulator = combineFnRunner.createAccumulator(key,
- options, sideInputReader, currentValue.getWindows());
- accumulator = combineFnRunner.addInput(key, accumulator, value,
- options, sideInputReader, currentValue.getWindows());
- windowTimestamp = outputTimeFn.assignOutputTime(nextValue.getTimestamp(), currentWindow);
- }
- }
-
- // emit the final accumulator
- out.collect(
- WindowedValue.of(
- KV.of(key, accumulator),
- windowTimestamp,
- currentWindow,
- PaneInfo.NO_FIRING));
- }
-}
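The FlinkPartialReduceFunction javadoc describes the split of Combine.PerKey into a local pre-combine before the shuffle and a final combine after it (FlinkReduceFunction). A minimal sketch of that two-stage shape, with integer sums standing in for the CombineFn accumulators:

import java.util.ArrayList;
import java.util.List;

public class TwoStageCombineSketch {
  /** Stage 1: combine the elements of one local shard into a single partial sum. */
  static long partialCombine(List<Long> shard) {
    long acc = 0;
    for (long v : shard) { acc += v; }
    return acc;
  }

  /** Stage 2: merge the shuffled partial sums into the final result. */
  static long mergePartials(List<Long> partials) {
    long acc = 0;
    for (long p : partials) { acc += p; }
    return acc;
  }

  public static void main(String[] args) {
    List<List<Long>> shards = List.of(List.of(1L, 2L, 3L), List.of(4L, 5L));
    List<Long> partials = new ArrayList<>();
    for (List<Long> shard : shards) {
      partials.add(partialCombine(shard)); // runs before the shuffle, once per shard
    }
    System.out.println(mergePartials(partials)); // 15
  }
}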
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkReduceFunction.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkReduceFunction.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkReduceFunction.java
deleted file mode 100644
index 3e4f742..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkReduceFunction.java
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.functions;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import org.apache.beam.runners.core.PerKeyCombineFnRunner;
-import org.apache.beam.runners.core.PerKeyCombineFnRunners;
-import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.transforms.CombineFnBase;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
-import org.apache.beam.sdk.transforms.windowing.PaneInfo;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.util.WindowingStrategy;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.flink.api.common.functions.RichGroupReduceFunction;
-import org.apache.flink.util.Collector;
-import org.joda.time.Instant;
-
-/**
- * This is the second part for executing a {@link org.apache.beam.sdk.transforms.Combine.PerKey}
- * on Flink; the first part is {@link FlinkPartialReduceFunction}. This function performs the final
- * combination of the pre-combined values after a shuffle.
- *
- * <p>The input to {@link #reduce(Iterable, Collector)} are elements of the same key but
- * for different windows. We have to ensure that we only combine elements of matching
- * windows.
- */
-public class FlinkReduceFunction<K, AccumT, OutputT, W extends BoundedWindow>
- extends RichGroupReduceFunction<WindowedValue<KV<K, AccumT>>, WindowedValue<KV<K, OutputT>>> {
-
- protected final CombineFnBase.PerKeyCombineFn<K, ?, AccumT, OutputT> combineFn;
-
- protected final WindowingStrategy<?, W> windowingStrategy;
-
- protected final Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs;
-
- protected final SerializedPipelineOptions serializedOptions;
-
- public FlinkReduceFunction(
- CombineFnBase.PerKeyCombineFn<K, ?, AccumT, OutputT> keyedCombineFn,
- WindowingStrategy<?, W> windowingStrategy,
- Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs,
- PipelineOptions pipelineOptions) {
-
- this.combineFn = keyedCombineFn;
-
- this.windowingStrategy = windowingStrategy;
- this.sideInputs = sideInputs;
-
- this.serializedOptions = new SerializedPipelineOptions(pipelineOptions);
-
- }
-
- @Override
- public void reduce(
- Iterable<WindowedValue<KV<K, AccumT>>> elements,
- Collector<WindowedValue<KV<K, OutputT>>> out) throws Exception {
-
- PipelineOptions options = serializedOptions.getPipelineOptions();
-
- FlinkSideInputReader sideInputReader =
- new FlinkSideInputReader(sideInputs, getRuntimeContext());
-
- PerKeyCombineFnRunner<K, ?, AccumT, OutputT> combineFnRunner =
- PerKeyCombineFnRunners.create(combineFn);
-
- @SuppressWarnings("unchecked")
- OutputTimeFn<? super BoundedWindow> outputTimeFn =
- (OutputTimeFn<? super BoundedWindow>) windowingStrategy.getOutputTimeFn();
-
-
- // get all elements so that we can sort them, has to fit into
- // memory
- // this seems very imprudent, but correct, for now
- ArrayList<WindowedValue<KV<K, AccumT>>> sortedInput = Lists.newArrayList();
- for (WindowedValue<KV<K, AccumT>> inputValue: elements) {
- for (WindowedValue<KV<K, AccumT>> exploded: inputValue.explodeWindows()) {
- sortedInput.add(exploded);
- }
- }
- Collections.sort(sortedInput, new Comparator<WindowedValue<KV<K, AccumT>>>() {
- @Override
- public int compare(
- WindowedValue<KV<K, AccumT>> o1,
- WindowedValue<KV<K, AccumT>> o2) {
- return Iterables.getOnlyElement(o1.getWindows()).maxTimestamp()
- .compareTo(Iterables.getOnlyElement(o2.getWindows()).maxTimestamp());
- }
- });
-
- // iterate over the elements that are sorted by window timestamp
- //
- final Iterator<WindowedValue<KV<K, AccumT>>> iterator = sortedInput.iterator();
-
- // get the first accumulator
- WindowedValue<KV<K, AccumT>> currentValue = iterator.next();
- K key = currentValue.getValue().getKey();
- BoundedWindow currentWindow = Iterables.getFirst(currentValue.getWindows(), null);
- AccumT accumulator = currentValue.getValue().getValue();
-
- // we use this to keep track of the timestamps assigned by the OutputTimeFn,
- // in FlinkPartialReduceFunction we already merge the timestamps assigned
- // to individual elements, here we just merge them
- List<Instant> windowTimestamps = new ArrayList<>();
- windowTimestamps.add(currentValue.getTimestamp());
-
- while (iterator.hasNext()) {
- WindowedValue<KV<K, AccumT>> nextValue = iterator.next();
- BoundedWindow nextWindow = Iterables.getOnlyElement(nextValue.getWindows());
-
- if (nextWindow.equals(currentWindow)) {
- // continue accumulating
- accumulator = combineFnRunner.mergeAccumulators(
- key, ImmutableList.of(accumulator, nextValue.getValue().getValue()),
- options, sideInputReader, currentValue.getWindows());
-
- windowTimestamps.add(nextValue.getTimestamp());
- } else {
- // emit the value that we currently have
- out.collect(
- WindowedValue.of(
- KV.of(key, combineFnRunner.extractOutput(key, accumulator,
- options, sideInputReader, currentValue.getWindows())),
- outputTimeFn.merge(currentWindow, windowTimestamps),
- currentWindow,
- PaneInfo.NO_FIRING));
-
- windowTimestamps.clear();
-
- currentWindow = nextWindow;
- currentValue = nextValue;
- accumulator = nextValue.getValue().getValue();
- windowTimestamps.add(nextValue.getTimestamp());
- }
-
- }
-
- // emit the final accumulator
- out.collect(
- WindowedValue.of(
- KV.of(key, combineFnRunner.extractOutput(key, accumulator,
- options, sideInputReader, currentValue.getWindows())),
- outputTimeFn.merge(currentWindow, windowTimestamps),
- currentWindow,
- PaneInfo.NO_FIRING));
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkSideInputReader.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkSideInputReader.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkSideInputReader.java
deleted file mode 100644
index c317182..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkSideInputReader.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.functions;
-
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
-import javax.annotation.Nullable;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.util.SideInputReader;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.util.WindowingStrategy;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.beam.sdk.values.TupleTag;
-import org.apache.flink.api.common.functions.RuntimeContext;
-
-/**
- * A {@link SideInputReader} for the Flink Batch Runner.
- */
-public class FlinkSideInputReader implements SideInputReader {
-
- private final Map<TupleTag<?>, WindowingStrategy<?, ?>> sideInputs;
-
- private RuntimeContext runtimeContext;
-
- public FlinkSideInputReader(Map<PCollectionView<?>, WindowingStrategy<?, ?>> indexByView,
- RuntimeContext runtimeContext) {
- sideInputs = new HashMap<>();
- for (Map.Entry<PCollectionView<?>, WindowingStrategy<?, ?>> entry : indexByView.entrySet()) {
- sideInputs.put(entry.getKey().getTagInternal(), entry.getValue());
- }
- this.runtimeContext = runtimeContext;
- }
-
- @Nullable
- @Override
- public <T> T get(PCollectionView<T> view, BoundedWindow window) {
- checkNotNull(view, "View passed to sideInput cannot be null");
- TupleTag<Iterable<WindowedValue<?>>> tag = view.getTagInternal();
- checkNotNull(
- sideInputs.get(tag),
- "Side input for " + view + " not available.");
-
- Map<BoundedWindow, T> sideInputs =
- runtimeContext.getBroadcastVariableWithInitializer(
- tag.getId(), new SideInputInitializer<>(view));
- T result = sideInputs.get(window);
- if (result == null) {
- result = view.getViewFn().apply(Collections.<WindowedValue<?>>emptyList());
- }
- return result;
- }
-
- @Override
- public <T> boolean contains(PCollectionView<T> view) {
- return sideInputs.containsKey(view.getTagInternal());
- }
-
- @Override
- public boolean isEmpty() {
- return sideInputs.isEmpty();
- }
-}
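FlinkSideInputReader resolves a side input per window and falls back to the view of an empty input when a window never received data. A sketch of that lookup-with-default pattern, with strings standing in for BoundedWindow and the side input value type:

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SideInputLookupSketch {
  private final Map<String, List<String>> contentsByWindow = new HashMap<>();

  void put(String window, List<String> contents) { contentsByWindow.put(window, contents); }

  /** Return the side input for {@code window}, or the "empty view" if nothing arrived. */
  List<String> get(String window) {
    List<String> contents = contentsByWindow.get(window);
    return contents != null ? contents : List.of(); // empty-input default
  }

  public static void main(String[] args) {
    SideInputLookupSketch reader = new SideInputLookupSketch();
    reader.put("w1", List.of("a", "b"));
    System.out.println(reader.get("w1")); // [a, b]
    System.out.println(reader.get("w2")); // [] -- window never received data
  }
}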
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkStatefulDoFnFunction.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkStatefulDoFnFunction.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkStatefulDoFnFunction.java
deleted file mode 100644
index c8193d2..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkStatefulDoFnFunction.java
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.functions;
-
-import static org.apache.flink.util.Preconditions.checkArgument;
-
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.Map;
-import org.apache.beam.runners.core.DoFnRunner;
-import org.apache.beam.runners.core.DoFnRunners;
-import org.apache.beam.runners.core.InMemoryStateInternals;
-import org.apache.beam.runners.core.InMemoryTimerInternals;
-import org.apache.beam.runners.core.StateInternals;
-import org.apache.beam.runners.core.StateNamespace;
-import org.apache.beam.runners.core.StateNamespaces;
-import org.apache.beam.runners.core.TimerInternals;
-import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.transforms.reflect.DoFnInvoker;
-import org.apache.beam.sdk.transforms.reflect.DoFnInvokers;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.util.WindowingStrategy;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.beam.sdk.values.TupleTag;
-import org.apache.flink.api.common.functions.RichGroupReduceFunction;
-import org.apache.flink.api.common.functions.RuntimeContext;
-import org.apache.flink.configuration.Configuration;
-import org.apache.flink.util.Collector;
-import org.joda.time.Instant;
-
-/**
- * A {@link RichGroupReduceFunction} for stateful {@link ParDo} in Flink Batch Runner.
- */
-public class FlinkStatefulDoFnFunction<K, V, OutputT>
- extends RichGroupReduceFunction<WindowedValue<KV<K, V>>, WindowedValue<OutputT>> {
-
- private final DoFn<KV<K, V>, OutputT> dofn;
- private final WindowingStrategy<?, ?> windowingStrategy;
- private final Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs;
- private final SerializedPipelineOptions serializedOptions;
- private final Map<TupleTag<?>, Integer> outputMap;
- private final TupleTag<OutputT> mainOutputTag;
- private transient DoFnInvoker doFnInvoker;
-
- public FlinkStatefulDoFnFunction(
- DoFn<KV<K, V>, OutputT> dofn,
- WindowingStrategy<?, ?> windowingStrategy,
- Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs,
- PipelineOptions pipelineOptions,
- Map<TupleTag<?>, Integer> outputMap,
- TupleTag<OutputT> mainOutputTag) {
-
- this.dofn = dofn;
- this.windowingStrategy = windowingStrategy;
- this.sideInputs = sideInputs;
- this.serializedOptions = new SerializedPipelineOptions(pipelineOptions);
- this.outputMap = outputMap;
- this.mainOutputTag = mainOutputTag;
- }
-
- @Override
- public void reduce(
- Iterable<WindowedValue<KV<K, V>>> values,
- Collector<WindowedValue<OutputT>> out) throws Exception {
- RuntimeContext runtimeContext = getRuntimeContext();
-
- DoFnRunners.OutputManager outputManager;
- if (outputMap == null) {
- outputManager = new FlinkDoFnFunction.DoFnOutputManager(out);
- } else {
- // it has some additional Outputs
- outputManager =
- new FlinkDoFnFunction.MultiDoFnOutputManager((Collector) out, outputMap);
- }
-
- final Iterator<WindowedValue<KV<K, V>>> iterator = values.iterator();
-
- // get the first value, we need this for initializing the state internals with the key.
- // we are guaranteed to have a first value, otherwise reduce() would not have been called.
- WindowedValue<KV<K, V>> currentValue = iterator.next();
- final K key = currentValue.getValue().getKey();
-
- final InMemoryStateInternals<K> stateInternals = InMemoryStateInternals.forKey(key);
-
- // Used with Batch, we know that all the data is available for this key. We can't use the
- // timer manager from the context because it doesn't exist. So we create one and advance
- // time to the end after processing all elements.
- final InMemoryTimerInternals timerInternals = new InMemoryTimerInternals();
- timerInternals.advanceProcessingTime(Instant.now());
- timerInternals.advanceSynchronizedProcessingTime(Instant.now());
-
- DoFnRunner<KV<K, V>, OutputT> doFnRunner = DoFnRunners.simpleRunner(
- serializedOptions.getPipelineOptions(), dofn,
- new FlinkSideInputReader(sideInputs, runtimeContext),
- outputManager,
- mainOutputTag,
- // see SimpleDoFnRunner, just use it to limit number of additional outputs
- Collections.<TupleTag<?>>emptyList(),
- new FlinkNoOpStepContext() {
- @Override
- public StateInternals<?> stateInternals() {
- return stateInternals;
- }
- @Override
- public TimerInternals timerInternals() {
- return timerInternals;
- }
- },
- new FlinkAggregatorFactory(runtimeContext),
- windowingStrategy);
-
- doFnRunner.startBundle();
-
- doFnRunner.processElement(currentValue);
- while (iterator.hasNext()) {
- currentValue = iterator.next();
- doFnRunner.processElement(currentValue);
- }
-
- // Finish any pending windows by advancing the input watermark to infinity.
- timerInternals.advanceInputWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE);
-
- // Finally, advance the processing time to infinity to fire any timers.
- timerInternals.advanceProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
- timerInternals.advanceSynchronizedProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
-
- fireEligibleTimers(timerInternals, doFnRunner);
-
- doFnRunner.finishBundle();
- }
-
- private void fireEligibleTimers(
- InMemoryTimerInternals timerInternals, DoFnRunner<KV<K, V>, OutputT> runner)
- throws Exception {
-
- while (true) {
-
- TimerInternals.TimerData timer;
- boolean hasFired = false;
-
- while ((timer = timerInternals.removeNextEventTimer()) != null) {
- hasFired = true;
- fireTimer(timer, runner);
- }
- while ((timer = timerInternals.removeNextProcessingTimer()) != null) {
- hasFired = true;
- fireTimer(timer, runner);
- }
- while ((timer = timerInternals.removeNextSynchronizedProcessingTimer()) != null) {
- hasFired = true;
- fireTimer(timer, runner);
- }
- if (!hasFired) {
- break;
- }
- }
- }
-
- private void fireTimer(
- TimerInternals.TimerData timer, DoFnRunner<KV<K, V>, OutputT> doFnRunner) {
- StateNamespace namespace = timer.getNamespace();
- checkArgument(namespace instanceof StateNamespaces.WindowNamespace);
- BoundedWindow window = ((StateNamespaces.WindowNamespace) namespace).getWindow();
- doFnRunner.onTimer(timer.getTimerId(), window, timer.getTimestamp(), timer.getDomain());
- }
-
- @Override
- public void open(Configuration parameters) throws Exception {
- doFnInvoker = DoFnInvokers.invokerFor(dofn);
- doFnInvoker.invokeSetup();
- }
-
- @Override
- public void close() throws Exception {
- doFnInvoker.invokeTeardown();
- }
-
-}
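FlinkStatefulDoFnFunction fires timers in repeated passes, because a firing timer may set further timers that are already eligible. A minimal sketch of that drain-until-quiet loop, with a single queue standing in for the event-time, processing-time, and synchronized-processing-time queues of InMemoryTimerInternals:

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.function.Consumer;

public class TimerDrainSketch {
  private final Deque<String> eligibleTimers = new ArrayDeque<>();

  void setTimer(String id) { eligibleTimers.add(id); }

  void fireEligibleTimers(Consumer<String> onTimer) {
    while (true) {
      boolean hasFired = false;
      String timer;
      while ((timer = eligibleTimers.poll()) != null) {
        hasFired = true;
        onTimer.accept(timer); // the callback may call setTimer(...) again
      }
      if (!hasFired) {
        break; // nothing fired in this pass, so nothing new can be eligible
      }
    }
  }

  public static void main(String[] args) {
    TimerDrainSketch timers = new TimerDrainSketch();
    timers.setTimer("cleanup-w1");
    timers.fireEligibleTimers(id -> {
      System.out.println("fired " + id);
      if (id.equals("cleanup-w1")) {
        timers.setTimer("cleanup-w2"); // chained timer, drained in the same call
      }
    });
  }
}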
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/SideInputInitializer.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/SideInputInitializer.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/SideInputInitializer.java
deleted file mode 100644
index 12222b4..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/SideInputInitializer.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.functions;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.flink.api.common.functions.BroadcastVariableInitializer;
-
-/**
- * {@link BroadcastVariableInitializer} that initializes the broadcast input as a {@code Map}
- * from window to side input.
- */
-public class SideInputInitializer<ElemT, ViewT, W extends BoundedWindow>
- implements BroadcastVariableInitializer<WindowedValue<ElemT>, Map<BoundedWindow, ViewT>> {
-
- PCollectionView<ViewT> view;
-
- public SideInputInitializer(PCollectionView<ViewT> view) {
- this.view = view;
- }
-
- @Override
- public Map<BoundedWindow, ViewT> initializeBroadcastVariable(
- Iterable<WindowedValue<ElemT>> inputValues) {
-
- // first partition into windows
- Map<BoundedWindow, List<WindowedValue<ElemT>>> partitionedElements = new HashMap<>();
- for (WindowedValue<ElemT> value: inputValues) {
- for (BoundedWindow window: value.getWindows()) {
- List<WindowedValue<ElemT>> windowedValues = partitionedElements.get(window);
- if (windowedValues == null) {
- windowedValues = new ArrayList<>();
- partitionedElements.put(window, windowedValues);
- }
- windowedValues.add(value);
- }
- }
-
- Map<BoundedWindow, ViewT> resultMap = new HashMap<>();
-
- for (Map.Entry<BoundedWindow, List<WindowedValue<ElemT>>> elements:
- partitionedElements.entrySet()) {
-
- @SuppressWarnings("unchecked")
- Iterable<WindowedValue<?>> elementsIterable =
- (List<WindowedValue<?>>) (List<?>) elements.getValue();
-
- resultMap.put(elements.getKey(), view.getViewFn().apply(elementsIterable));
- }
-
- return resultMap;
- }
-}
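SideInputInitializer files each windowed element under every window it belongs to, producing the window-to-contents map that the side input reader consults. A sketch of that partitioning step, with a tiny Windowed placeholder instead of Beam's WindowedValue and the raw list used in place of a ViewFn application:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class PartitionByWindowSketch {
  static final class Windowed {
    final String value; final List<String> windows;
    Windowed(String value, List<String> windows) { this.value = value; this.windows = windows; }
  }

  static Map<String, List<String>> partitionByWindow(List<Windowed> input) {
    Map<String, List<String>> partitioned = new HashMap<>();
    for (Windowed wv : input) {
      for (String window : wv.windows) {
        partitioned.computeIfAbsent(window, w -> new ArrayList<>()).add(wv.value);
      }
    }
    return partitioned;
  }

  public static void main(String[] args) {
    List<Windowed> input = List.of(
        new Windowed("a", List.of("w1")),
        new Windowed("b", List.of("w1", "w2"))); // "b" falls into two windows
    System.out.println(partitionByWindow(input)); // e.g. {w1=[a, b], w2=[b]} (map order unspecified)
  }
}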
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/package-info.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/package-info.java
deleted file mode 100644
index 9f11212..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/functions/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Internal implementation of the Beam runner for Apache Flink.
- */
-package org.apache.beam.runners.flink.translation.functions;
[04/50] [abbrv] beam git commit: ProcessFn remembers more info about
its application context
Posted by dh...@apache.org.
ProcessFn remembers more info about its application context
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/3fd88901
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/3fd88901
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/3fd88901
Branch: refs/heads/DSL_SQL
Commit: 3fd889015afa8528801d2c35c8c9f72b944ea472
Parents: a51bdd2
Author: Eugene Kirpichov <ki...@google.com>
Authored: Sat Apr 15 16:39:51 2017 -0700
Committer: Eugene Kirpichov <ki...@google.com>
Committed: Tue Apr 18 18:02:06 2017 -0700
----------------------------------------------------------------------
.../beam/runners/core/SplittableParDo.java | 35 +++++++++++++++-----
.../beam/runners/core/SplittableParDoTest.java | 8 ++++-
2 files changed, 34 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/3fd88901/runners/core-java/src/main/java/org/apache/beam/runners/core/SplittableParDo.java
----------------------------------------------------------------------
diff --git a/runners/core-java/src/main/java/org/apache/beam/runners/core/SplittableParDo.java b/runners/core-java/src/main/java/org/apache/beam/runners/core/SplittableParDo.java
index 9cc965a..44db1f7 100644
--- a/runners/core-java/src/main/java/org/apache/beam/runners/core/SplittableParDo.java
+++ b/runners/core-java/src/main/java/org/apache/beam/runners/core/SplittableParDo.java
@@ -115,7 +115,7 @@ public class SplittableParDo<InputT, OutputT, RestrictionT>
fn,
input.getCoder(),
restrictionCoder,
- input.getWindowingStrategy(),
+ (WindowingStrategy<InputT, ?>) input.getWindowingStrategy(),
parDo.getSideInputs(),
parDo.getMainOutputTag(),
parDo.getAdditionalOutputTags()));
@@ -185,7 +185,7 @@ public class SplittableParDo<InputT, OutputT, RestrictionT>
private final DoFn<InputT, OutputT> fn;
private final Coder<InputT> elementCoder;
private final Coder<RestrictionT> restrictionCoder;
- private final WindowingStrategy<?, ?> windowingStrategy;
+ private final WindowingStrategy<InputT, ?> windowingStrategy;
private final List<PCollectionView<?>> sideInputs;
private final TupleTag<OutputT> mainOutputTag;
private final TupleTagList additionalOutputTags;
@@ -202,7 +202,7 @@ public class SplittableParDo<InputT, OutputT, RestrictionT>
DoFn<InputT, OutputT> fn,
Coder<InputT> elementCoder,
Coder<RestrictionT> restrictionCoder,
- WindowingStrategy<?, ?> windowingStrategy,
+ WindowingStrategy<InputT, ?> windowingStrategy,
List<PCollectionView<?>> sideInputs,
TupleTag<OutputT> mainOutputTag,
TupleTagList additionalOutputTags) {
@@ -234,7 +234,7 @@ public class SplittableParDo<InputT, OutputT, RestrictionT>
public ProcessFn<InputT, OutputT, RestrictionT, TrackerT> newProcessFn(
DoFn<InputT, OutputT> fn) {
return new SplittableParDo.ProcessFn<>(
- fn, elementCoder, restrictionCoder, windowingStrategy.getWindowFn().windowCoder());
+ fn, elementCoder, restrictionCoder, windowingStrategy);
}
@Override
@@ -351,7 +351,9 @@ public class SplittableParDo<InputT, OutputT, RestrictionT>
private StateTag<Object, ValueState<RestrictionT>> restrictionTag;
private final DoFn<InputT, OutputT> fn;
- private final Coder<? extends BoundedWindow> windowCoder;
+ private final Coder<InputT> elementCoder;
+ private final Coder<RestrictionT> restrictionCoder;
+ private final WindowingStrategy<InputT, ?> inputWindowingStrategy;
private transient StateInternalsFactory<String> stateInternalsFactory;
private transient TimerInternalsFactory<String> timerInternalsFactory;
@@ -364,11 +366,16 @@ public class SplittableParDo<InputT, OutputT, RestrictionT>
DoFn<InputT, OutputT> fn,
Coder<InputT> elementCoder,
Coder<RestrictionT> restrictionCoder,
- Coder<? extends BoundedWindow> windowCoder) {
+ WindowingStrategy<InputT, ?> inputWindowingStrategy) {
this.fn = fn;
- this.windowCoder = windowCoder;
+ this.elementCoder = elementCoder;
+ this.restrictionCoder = restrictionCoder;
+ this.inputWindowingStrategy = inputWindowingStrategy;
this.elementTag =
- StateTags.value("element", WindowedValue.getFullCoder(elementCoder, this.windowCoder));
+ StateTags.value(
+ "element",
+ WindowedValue.getFullCoder(
+ elementCoder, inputWindowingStrategy.getWindowFn().windowCoder()));
this.restrictionTag = StateTags.value("restriction", restrictionCoder);
}
@@ -389,6 +396,18 @@ public class SplittableParDo<InputT, OutputT, RestrictionT>
return fn;
}
+ public Coder<InputT> getElementCoder() {
+ return elementCoder;
+ }
+
+ public Coder<RestrictionT> getRestrictionCoder() {
+ return restrictionCoder;
+ }
+
+ public WindowingStrategy<InputT, ?> getInputWindowingStrategy() {
+ return inputWindowingStrategy;
+ }
+
@Setup
public void setup() throws Exception {
invoker = DoFnInvokers.invokerFor(fn);
http://git-wip-us.apache.org/repos/asf/beam/blob/3fd88901/runners/core-java/src/test/java/org/apache/beam/runners/core/SplittableParDoTest.java
----------------------------------------------------------------------
diff --git a/runners/core-java/src/test/java/org/apache/beam/runners/core/SplittableParDoTest.java b/runners/core-java/src/test/java/org/apache/beam/runners/core/SplittableParDoTest.java
index 2c89543..5629635 100644
--- a/runners/core-java/src/test/java/org/apache/beam/runners/core/SplittableParDoTest.java
+++ b/runners/core-java/src/test/java/org/apache/beam/runners/core/SplittableParDoTest.java
@@ -51,11 +51,13 @@ import org.apache.beam.sdk.transforms.splittabledofn.OffsetRange;
import org.apache.beam.sdk.transforms.splittabledofn.OffsetRangeTracker;
import org.apache.beam.sdk.transforms.splittabledofn.RestrictionTracker;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
import org.apache.beam.sdk.transforms.windowing.PaneInfo;
import org.apache.beam.sdk.util.SideInputReader;
import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.TimestampedValue;
@@ -220,9 +222,13 @@ public class SplittableParDoTest {
int maxOutputsPerBundle,
Duration maxBundleDuration)
throws Exception {
+ // The exact windowing strategy doesn't matter in this test, but it should be able to
+ // encode IntervalWindow's because that's what all tests here use.
+ WindowingStrategy<InputT, BoundedWindow> windowingStrategy =
+ (WindowingStrategy) WindowingStrategy.of(FixedWindows.of(Duration.standardSeconds(1)));
final SplittableParDo.ProcessFn<InputT, OutputT, RestrictionT, TrackerT> processFn =
new SplittableParDo.ProcessFn<>(
- fn, inputCoder, restrictionCoder, IntervalWindow.getCoder());
+ fn, inputCoder, restrictionCoder, windowingStrategy);
this.tester = DoFnTester.of(processFn);
this.timerInternals = new InMemoryTimerInternals();
this.stateInternals = new TestInMemoryStateInternals<>("dummy");
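For readers following along, here is a minimal sketch (not part of the commit) of how the reworked ProcessFn is constructed after this change, modeled on the updated test helper above. It assumes a class placed in the org.apache.beam.runners.core package, like SplittableParDoTest, so the constructor is reachable; the class name ProcessFnContextSketch, the helper method, and the FixedWindows strategy are illustrative placeholders, while the constructor arguments and the new getters come straight from the diff.

package org.apache.beam.runners.core;

import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.splittabledofn.RestrictionTracker;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.util.WindowingStrategy;
import org.joda.time.Duration;

/** Illustrative only: shows the reworked ProcessFn constructor and the new context getters. */
class ProcessFnContextSketch {

  static <InputT, OutputT, RestrictionT, TrackerT extends RestrictionTracker<RestrictionT>>
      SplittableParDo.ProcessFn<InputT, OutputT, RestrictionT, TrackerT> buildProcessFn(
          DoFn<InputT, OutputT> fn,
          Coder<InputT> elementCoder,
          Coder<RestrictionT> restrictionCoder) {

    // The full windowing strategy of the input PCollection is now passed in,
    // rather than only its window coder; FixedWindows here is just a stand-in.
    @SuppressWarnings({"unchecked", "rawtypes"})
    WindowingStrategy<InputT, ?> windowingStrategy =
        (WindowingStrategy) WindowingStrategy.of(FixedWindows.of(Duration.standardSeconds(1)));

    SplittableParDo.ProcessFn<InputT, OutputT, RestrictionT, TrackerT> processFn =
        new SplittableParDo.ProcessFn<>(fn, elementCoder, restrictionCoder, windowingStrategy);

    // The ProcessFn now remembers its application context and exposes it directly.
    assert processFn.getElementCoder() == elementCoder;
    assert processFn.getRestrictionCoder() == restrictionCoder;
    assert processFn.getInputWindowingStrategy() == windowingStrategy;

    return processFn;
  }
}

The point of the change is visible in the last three lines: the element coder, restriction coder and input windowing strategy are now retrievable from the ProcessFn itself, instead of being reduced to a window coder at construction time.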
[36/50] [abbrv] beam git commit: [BEAM-1994] Remove Flink examples package
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/package-info.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/package-info.java
deleted file mode 100644
index af4b354..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Internal implementation of the Beam runner for Apache Flink.
- */
-package org.apache.beam.runners.flink.translation;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/CoderTypeInformation.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/CoderTypeInformation.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/CoderTypeInformation.java
deleted file mode 100644
index 9b449aa..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/CoderTypeInformation.java
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.types;
-
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.flink.api.common.ExecutionConfig;
-import org.apache.flink.api.common.typeinfo.AtomicType;
-import org.apache.flink.api.common.typeinfo.TypeInformation;
-import org.apache.flink.api.common.typeutils.TypeComparator;
-import org.apache.flink.api.common.typeutils.TypeSerializer;
-
-/**
- * Flink {@link org.apache.flink.api.common.typeinfo.TypeInformation} for
- * Dataflow {@link org.apache.beam.sdk.coders.Coder}s.
- */
-public class CoderTypeInformation<T> extends TypeInformation<T> implements AtomicType<T> {
-
- private final Coder<T> coder;
-
- public CoderTypeInformation(Coder<T> coder) {
- checkNotNull(coder);
- this.coder = coder;
- }
-
- public Coder<T> getCoder() {
- return coder;
- }
-
- @Override
- public boolean isBasicType() {
- return false;
- }
-
- @Override
- public boolean isTupleType() {
- return false;
- }
-
- @Override
- public int getArity() {
- return 1;
- }
-
- @Override
- @SuppressWarnings("unchecked")
- public Class<T> getTypeClass() {
- // We don't have the Class, so we have to pass null here. What a shame...
- return (Class<T>) Object.class;
- }
-
- @Override
- public boolean isKeyType() {
- return true;
- }
-
- @Override
- @SuppressWarnings("unchecked")
- public TypeSerializer<T> createSerializer(ExecutionConfig config) {
- return new CoderTypeSerializer<>(coder);
- }
-
- @Override
- public int getTotalFields() {
- return 2;
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- CoderTypeInformation that = (CoderTypeInformation) o;
-
- return coder.equals(that.coder);
-
- }
-
- @Override
- public int hashCode() {
- return coder.hashCode();
- }
-
- @Override
- public boolean canEqual(Object obj) {
- return obj instanceof CoderTypeInformation;
- }
-
- @Override
- public String toString() {
- return "CoderTypeInformation{coder=" + coder + '}';
- }
-
- @Override
- public TypeComparator<T> createComparator(boolean sortOrderAscending, ExecutionConfig
- executionConfig) {
- throw new UnsupportedOperationException(
- "Non-encoded values cannot be compared directly.");
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/CoderTypeSerializer.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/CoderTypeSerializer.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/CoderTypeSerializer.java
deleted file mode 100644
index e210ed9..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/CoderTypeSerializer.java
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.types;
-
-import java.io.EOFException;
-import java.io.IOException;
-import org.apache.beam.runners.flink.translation.wrappers.DataInputViewWrapper;
-import org.apache.beam.runners.flink.translation.wrappers.DataOutputViewWrapper;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.coders.CoderException;
-import org.apache.beam.sdk.util.CoderUtils;
-import org.apache.flink.api.common.typeutils.TypeSerializer;
-import org.apache.flink.core.memory.DataInputView;
-import org.apache.flink.core.memory.DataOutputView;
-
-/**
- * Flink {@link org.apache.flink.api.common.typeutils.TypeSerializer} for
- * Dataflow {@link org.apache.beam.sdk.coders.Coder Coders}.
- */
-public class CoderTypeSerializer<T> extends TypeSerializer<T> {
-
- private Coder<T> coder;
-
- public CoderTypeSerializer(Coder<T> coder) {
- this.coder = coder;
- }
-
- @Override
- public boolean isImmutableType() {
- return false;
- }
-
- @Override
- public CoderTypeSerializer<T> duplicate() {
- return new CoderTypeSerializer<>(coder);
- }
-
- @Override
- public T createInstance() {
- return null;
- }
-
- @Override
- public T copy(T t) {
- try {
- return CoderUtils.clone(coder, t);
- } catch (CoderException e) {
- throw new RuntimeException("Could not clone.", e);
- }
- }
-
- @Override
- public T copy(T t, T reuse) {
- return copy(t);
- }
-
- @Override
- public int getLength() {
- return -1;
- }
-
- @Override
- public void serialize(T t, DataOutputView dataOutputView) throws IOException {
- DataOutputViewWrapper outputWrapper = new DataOutputViewWrapper(dataOutputView);
- coder.encode(t, outputWrapper, Coder.Context.NESTED);
- }
-
- @Override
- public T deserialize(DataInputView dataInputView) throws IOException {
- try {
- DataInputViewWrapper inputWrapper = new DataInputViewWrapper(dataInputView);
- return coder.decode(inputWrapper, Coder.Context.NESTED);
- } catch (CoderException e) {
- Throwable cause = e.getCause();
- if (cause instanceof EOFException) {
- throw (EOFException) cause;
- } else {
- throw e;
- }
- }
- }
-
- @Override
- public T deserialize(T t, DataInputView dataInputView) throws IOException {
- return deserialize(dataInputView);
- }
-
- @Override
- public void copy(
- DataInputView dataInputView,
- DataOutputView dataOutputView) throws IOException {
- serialize(deserialize(dataInputView), dataOutputView);
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- CoderTypeSerializer that = (CoderTypeSerializer) o;
- return coder.equals(that.coder);
- }
-
- @Override
- public boolean canEqual(Object obj) {
- return obj instanceof CoderTypeSerializer;
- }
-
- @Override
- public int hashCode() {
- return coder.hashCode();
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueComparator.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueComparator.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueComparator.java
deleted file mode 100644
index 667ef45..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueComparator.java
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.types;
-
-import java.io.IOException;
-import java.util.Arrays;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.flink.api.common.typeutils.TypeComparator;
-import org.apache.flink.core.memory.DataInputView;
-import org.apache.flink.core.memory.DataOutputView;
-import org.apache.flink.core.memory.MemorySegment;
-
-/**
- * Flink {@link org.apache.flink.api.common.typeutils.TypeComparator} for Beam values that have
- * been encoded to byte data by a {@link Coder}.
- */
-public class EncodedValueComparator extends TypeComparator<byte[]> {
-
- /** For storing the Reference in encoded form. */
- private transient byte[] encodedReferenceKey;
-
- private final boolean ascending;
-
- public EncodedValueComparator(boolean ascending) {
- this.ascending = ascending;
- }
-
- @Override
- public int hash(byte[] record) {
- return Arrays.hashCode(record);
- }
-
- @Override
- public void setReference(byte[] toCompare) {
- this.encodedReferenceKey = toCompare;
- }
-
- @Override
- public boolean equalToReference(byte[] candidate) {
- if (encodedReferenceKey.length != candidate.length) {
- return false;
- }
- int len = candidate.length;
- for (int i = 0; i < len; i++) {
- if (encodedReferenceKey[i] != candidate[i]) {
- return false;
- }
- }
- return true;
- }
-
- @Override
- public int compareToReference(TypeComparator<byte[]> other) {
- // VERY IMPORTANT: compareToReference does not behave like Comparable.compare
- // the meaning of the return value is inverted.
-
- EncodedValueComparator otherEncodedValueComparator = (EncodedValueComparator) other;
-
- int len = Math.min(
- encodedReferenceKey.length,
- otherEncodedValueComparator.encodedReferenceKey.length);
-
- for (int i = 0; i < len; i++) {
- byte b1 = encodedReferenceKey[i];
- byte b2 = otherEncodedValueComparator.encodedReferenceKey[i];
- int result = (b1 < b2 ? -1 : (b1 == b2 ? 0 : 1));
- if (result != 0) {
- return ascending ? -result : result;
- }
- }
- int result =
- encodedReferenceKey.length - otherEncodedValueComparator.encodedReferenceKey.length;
- return ascending ? -result : result;
- }
-
-
- @Override
- public int compare(byte[] first, byte[] second) {
- int len = Math.min(first.length, second.length);
- for (int i = 0; i < len; i++) {
- byte b1 = first[i];
- byte b2 = second[i];
- int result = (b1 < b2 ? -1 : (b1 == b2 ? 0 : 1));
- if (result != 0) {
- return ascending ? result : -result;
- }
- }
- int result = first.length - second.length;
- return ascending ? result : -result;
- }
-
- @Override
- public int compareSerialized(
- DataInputView firstSource,
- DataInputView secondSource) throws IOException {
- int lengthFirst = firstSource.readInt();
- int lengthSecond = secondSource.readInt();
-
- int len = Math.min(lengthFirst, lengthSecond);
- for (int i = 0; i < len; i++) {
- byte b1 = firstSource.readByte();
- byte b2 = secondSource.readByte();
- int result = (b1 < b2 ? -1 : (b1 == b2 ? 0 : 1));
- if (result != 0) {
- return ascending ? result : -result;
- }
- }
-
- int result = lengthFirst - lengthSecond;
- return ascending ? result : -result;
- }
-
-
-
- @Override
- public boolean supportsNormalizedKey() {
- // disabled because this seems to not work with some coders,
- // such as the AvroCoder
- return false;
- }
-
- @Override
- public boolean supportsSerializationWithKeyNormalization() {
- return false;
- }
-
- @Override
- public int getNormalizeKeyLen() {
- return Integer.MAX_VALUE;
- }
-
- @Override
- public boolean isNormalizedKeyPrefixOnly(int keyBytes) {
- return true;
- }
-
- @Override
- public void putNormalizedKey(byte[] record, MemorySegment target, int offset, int numBytes) {
- final int limit = offset + numBytes;
-
- target.put(offset, record, 0, Math.min(numBytes, record.length));
-
- offset += record.length;
-
- while (offset < limit) {
- target.put(offset++, (byte) 0);
- }
- }
-
- @Override
- public void writeWithKeyNormalization(byte[] record, DataOutputView target) throws IOException {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public byte[] readWithKeyDenormalization(byte[] reuse, DataInputView source) throws IOException {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public boolean invertNormalizedKey() {
- return !ascending;
- }
-
- @Override
- public TypeComparator<byte[]> duplicate() {
- return new EncodedValueComparator(ascending);
- }
-
- @Override
- public int extractKeys(Object record, Object[] target, int index) {
- target[index] = record;
- return 1;
- }
-
- @Override
- public TypeComparator[] getFlatComparators() {
- return new TypeComparator[] { this.duplicate() };
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueSerializer.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueSerializer.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueSerializer.java
deleted file mode 100644
index 41db61e..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueSerializer.java
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.types;
-
-import java.io.IOException;
-
-import org.apache.beam.sdk.coders.Coder;
-
-import org.apache.flink.api.common.typeutils.TypeSerializer;
-import org.apache.flink.core.memory.DataInputView;
-import org.apache.flink.core.memory.DataOutputView;
-
-/**
- * {@link TypeSerializer} for values that were encoded using a {@link Coder}.
- */
-public final class EncodedValueSerializer extends TypeSerializer<byte[]> {
-
- private static final long serialVersionUID = 1L;
-
- private static final byte[] EMPTY = new byte[0];
-
- @Override
- public boolean isImmutableType() {
- return true;
- }
-
- @Override
- public byte[] createInstance() {
- return EMPTY;
- }
-
- @Override
- public byte[] copy(byte[] from) {
- return from;
- }
-
- @Override
- public byte[] copy(byte[] from, byte[] reuse) {
- return copy(from);
- }
-
- @Override
- public int getLength() {
- return -1;
- }
-
-
- @Override
- public void serialize(byte[] record, DataOutputView target) throws IOException {
- if (record == null) {
- throw new IllegalArgumentException("The record must not be null.");
- }
-
- final int len = record.length;
- target.writeInt(len);
- target.write(record);
- }
-
- @Override
- public byte[] deserialize(DataInputView source) throws IOException {
- final int len = source.readInt();
- byte[] result = new byte[len];
- source.readFully(result);
- return result;
- }
-
- @Override
- public byte[] deserialize(byte[] reuse, DataInputView source) throws IOException {
- return deserialize(source);
- }
-
- @Override
- public void copy(DataInputView source, DataOutputView target) throws IOException {
- final int len = source.readInt();
- target.writeInt(len);
- target.write(source, len);
- }
-
- @Override
- public boolean canEqual(Object obj) {
- return obj instanceof EncodedValueSerializer;
- }
-
- @Override
- public int hashCode() {
- return this.getClass().hashCode();
- }
-
- @Override
- public boolean equals(Object obj) {
- return obj instanceof EncodedValueSerializer;
- }
-
- @Override
- public TypeSerializer<byte[]> duplicate() {
- return this;
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueTypeInformation.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueTypeInformation.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueTypeInformation.java
deleted file mode 100644
index e24bf31..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueTypeInformation.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.types;
-
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.flink.api.common.ExecutionConfig;
-import org.apache.flink.api.common.typeinfo.AtomicType;
-import org.apache.flink.api.common.typeinfo.TypeInformation;
-import org.apache.flink.api.common.typeutils.TypeComparator;
-import org.apache.flink.api.common.typeutils.TypeSerializer;
-
-/**
- * Flink {@link TypeInformation} for Beam values that have been encoded to byte data
- * by a {@link Coder}.
- */
-public class EncodedValueTypeInformation
- extends TypeInformation<byte[]>
- implements AtomicType<byte[]> {
-
- private static final long serialVersionUID = 1L;
-
- @Override
- public boolean isBasicType() {
- return false;
- }
-
- @Override
- public boolean isTupleType() {
- return false;
- }
-
- @Override
- public int getArity() {
- return 0;
- }
-
- @Override
- public int getTotalFields() {
- return 0;
- }
-
- @Override
- public Class<byte[]> getTypeClass() {
- return byte[].class;
- }
-
- @Override
- public boolean isKeyType() {
- return true;
- }
-
- @Override
- public TypeSerializer<byte[]> createSerializer(ExecutionConfig executionConfig) {
- return new EncodedValueSerializer();
- }
-
- @Override
- public boolean equals(Object other) {
- return other instanceof EncodedValueTypeInformation;
- }
-
- @Override
- public int hashCode() {
- return this.getClass().hashCode();
- }
-
- @Override
- public boolean canEqual(Object obj) {
- return obj instanceof EncodedValueTypeInformation;
- }
-
- @Override
- public String toString() {
- return "EncodedValueTypeInformation";
- }
-
- @Override
- public TypeComparator<byte[]> createComparator(
- boolean sortOrderAscending,
- ExecutionConfig executionConfig) {
- return new EncodedValueComparator(sortOrderAscending);
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/InspectableByteArrayOutputStream.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/InspectableByteArrayOutputStream.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/InspectableByteArrayOutputStream.java
deleted file mode 100644
index 36b5ba3..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/InspectableByteArrayOutputStream.java
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.types;
-
-import java.io.ByteArrayOutputStream;
-
-/**
- * Version of {@link java.io.ByteArrayOutputStream} that allows to retrieve the internal
- * byte[] buffer without incurring an array copy.
- */
-public class InspectableByteArrayOutputStream extends ByteArrayOutputStream {
-
- /**
- * Get the underlying byte array.
- */
- public byte[] getBuffer() {
- return buf;
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/KvKeySelector.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/KvKeySelector.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/KvKeySelector.java
deleted file mode 100644
index 9df6836..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/KvKeySelector.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.types;
-
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.util.CoderUtils;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.values.KV;
-import org.apache.flink.api.common.typeinfo.TypeInformation;
-import org.apache.flink.api.java.functions.KeySelector;
-import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
-
-/**
- * {@link KeySelector} that extracts the key from a {@link KV} and returns
- * it in encoded form as a {@code byte} array.
- */
-public class KvKeySelector<InputT, K>
- implements KeySelector<WindowedValue<KV<K, InputT>>, byte[]>, ResultTypeQueryable<byte[]> {
-
- private final Coder<K> keyCoder;
-
- public KvKeySelector(Coder<K> keyCoder) {
- this.keyCoder = keyCoder;
- }
-
- @Override
- public byte[] getKey(WindowedValue<KV<K, InputT>> value) throws Exception {
- return CoderUtils.encodeToByteArray(keyCoder, value.getValue().getKey());
- }
-
- @Override
- public TypeInformation<byte[]> getProducedType() {
- return new EncodedValueTypeInformation();
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/package-info.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/package-info.java
deleted file mode 100644
index 6fb3182..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/types/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Internal implementation of the Beam runner for Apache Flink.
- */
-package org.apache.beam.runners.flink.translation.types;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/utils/SerializedPipelineOptions.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/utils/SerializedPipelineOptions.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/utils/SerializedPipelineOptions.java
deleted file mode 100644
index 2256bb1..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/utils/SerializedPipelineOptions.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.beam.runners.flink.translation.utils;
-
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.fasterxml.jackson.databind.ObjectMapper;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.Serializable;
-import org.apache.beam.sdk.io.FileSystems;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.util.IOChannelUtils;
-
-/**
- * Encapsulates the PipelineOptions in serialized form to ship them to the cluster.
- */
-public class SerializedPipelineOptions implements Serializable {
-
- private final byte[] serializedOptions;
-
- /** Lazily initialized copy of deserialized options. */
- private transient PipelineOptions pipelineOptions;
-
- public SerializedPipelineOptions(PipelineOptions options) {
- checkNotNull(options, "PipelineOptions must not be null.");
-
- try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
- new ObjectMapper().writeValue(baos, options);
- this.serializedOptions = baos.toByteArray();
- } catch (Exception e) {
- throw new RuntimeException("Couldn't serialize PipelineOptions.", e);
- }
-
- }
-
- public PipelineOptions getPipelineOptions() {
- if (pipelineOptions == null) {
- try {
- pipelineOptions = new ObjectMapper().readValue(serializedOptions, PipelineOptions.class);
-
- IOChannelUtils.registerIOFactoriesAllowOverride(pipelineOptions);
- FileSystems.setDefaultConfigInWorkers(pipelineOptions);
- } catch (IOException e) {
- throw new RuntimeException("Couldn't deserialize the PipelineOptions.", e);
- }
- }
-
- return pipelineOptions;
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/utils/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/utils/package-info.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/utils/package-info.java
deleted file mode 100644
index 5dedd53..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/utils/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Internal implementation of the Beam runner for Apache Flink.
- */
-package org.apache.beam.runners.flink.translation.utils;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/DataInputViewWrapper.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/DataInputViewWrapper.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/DataInputViewWrapper.java
deleted file mode 100644
index 82a2c4e..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/DataInputViewWrapper.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import org.apache.flink.core.memory.DataInputView;
-
-/**
- * Wrapper for {@link DataInputView}. We need this because Flink reads data using a
- * {@link org.apache.flink.core.memory.DataInputView} while
- * Dataflow {@link org.apache.beam.sdk.coders.Coder}s expect an
- * {@link java.io.InputStream}.
- */
-public class DataInputViewWrapper extends InputStream {
-
- private DataInputView inputView;
-
- public DataInputViewWrapper(DataInputView inputView) {
- this.inputView = inputView;
- }
-
- public void setInputView(DataInputView inputView) {
- this.inputView = inputView;
- }
-
- @Override
- public int read() throws IOException {
- try {
- return inputView.readUnsignedByte();
- } catch (EOFException e) {
- // translate between DataInput and InputStream,
- // DataInput signals EOF by exception, InputStream does it by returning -1
- return -1;
- }
- }
-
- @Override
- public int read(byte[] b, int off, int len) throws IOException {
- return inputView.read(b, off, len);
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/DataOutputViewWrapper.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/DataOutputViewWrapper.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/DataOutputViewWrapper.java
deleted file mode 100644
index f2d9db2..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/DataOutputViewWrapper.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import org.apache.flink.core.memory.DataOutputView;
-
-/**
- * Wrapper for {@link org.apache.flink.core.memory.DataOutputView}. We need this because
- * Flink writes data using a {@link org.apache.flink.core.memory.DataInputView} while
- * Dataflow {@link org.apache.beam.sdk.coders.Coder}s expect an
- * {@link java.io.OutputStream}.
- */
-public class DataOutputViewWrapper extends OutputStream {
-
- private DataOutputView outputView;
-
- public DataOutputViewWrapper(DataOutputView outputView) {
- this.outputView = outputView;
- }
-
- public void setOutputView(DataOutputView outputView) {
- this.outputView = outputView;
- }
-
- @Override
- public void write(int b) throws IOException {
- outputView.write(b);
- }
-
- @Override
- public void write(byte[] b, int off, int len) throws IOException {
- outputView.write(b, off, len);
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SerializableFnAggregatorWrapper.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SerializableFnAggregatorWrapper.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SerializableFnAggregatorWrapper.java
deleted file mode 100644
index 70d97e3..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SerializableFnAggregatorWrapper.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-import java.io.Serializable;
-import org.apache.beam.sdk.transforms.Aggregator;
-import org.apache.beam.sdk.transforms.Combine;
-import org.apache.flink.api.common.accumulators.Accumulator;
-
-/**
- * Wrapper that wraps a {@link org.apache.beam.sdk.transforms.Combine.CombineFn}
- * in a Flink {@link org.apache.flink.api.common.accumulators.Accumulator} for using
- * the function as an aggregator in a {@link org.apache.beam.sdk.transforms.ParDo}
- * operation.
- */
-public class SerializableFnAggregatorWrapper<InputT, OutputT>
- implements Aggregator<InputT, OutputT>, Accumulator<InputT, Serializable> {
-
- private OutputT aa;
- private Combine.CombineFn<InputT, ?, OutputT> combiner;
-
- public SerializableFnAggregatorWrapper(Combine.CombineFn<InputT, ?, OutputT> combiner) {
- this.combiner = combiner;
- resetLocal();
- }
-
- @Override
- @SuppressWarnings("unchecked")
- public void add(InputT value) {
- this.aa = combiner.apply(ImmutableList.of((InputT) aa, value));
- }
-
- @Override
- public Serializable getLocalValue() {
- return (Serializable) aa;
- }
-
- @Override
- public void resetLocal() {
- this.aa = combiner.apply(ImmutableList.<InputT>of());
- }
-
- @Override
- @SuppressWarnings("unchecked")
- public void merge(Accumulator<InputT, Serializable> other) {
- this.aa = combiner.apply(ImmutableList.of((InputT) aa, (InputT) other.getLocalValue()));
- }
-
- @Override
- public void addValue(InputT value) {
- add(value);
- }
-
- @Override
- public String getName() {
- return "Aggregator :" + combiner.toString();
- }
-
- @Override
- public Combine.CombineFn<InputT, ?, OutputT> getCombineFn() {
- return combiner;
- }
-
- @Override
- public Accumulator<InputT, Serializable> clone() {
- try {
- super.clone();
- } catch (CloneNotSupportedException e) {
- // Flink Accumulators cannot throw CloneNotSupportedException, work around that.
- throw new RuntimeException(e);
- }
-
- // copy it by merging
- OutputT resultCopy = combiner.apply(Lists.newArrayList((InputT) aa));
- SerializableFnAggregatorWrapper<InputT, OutputT> result = new
- SerializableFnAggregatorWrapper<>(combiner);
-
- result.aa = resultCopy;
- return result;
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SourceInputFormat.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SourceInputFormat.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SourceInputFormat.java
deleted file mode 100644
index a87472b..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SourceInputFormat.java
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers;
-
-import java.io.IOException;
-import java.util.List;
-import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
-import org.apache.beam.sdk.io.BoundedSource;
-import org.apache.beam.sdk.io.Source;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
-import org.apache.beam.sdk.transforms.windowing.PaneInfo;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.flink.api.common.io.DefaultInputSplitAssigner;
-import org.apache.flink.api.common.io.InputFormat;
-import org.apache.flink.api.common.io.statistics.BaseStatistics;
-import org.apache.flink.configuration.Configuration;
-import org.apache.flink.core.io.InputSplitAssigner;
-import org.joda.time.Instant;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-
-/**
- * Wrapper for executing a {@link Source} as a Flink {@link InputFormat}.
- */
-public class SourceInputFormat<T>
- implements InputFormat<WindowedValue<T>, SourceInputSplit<T>> {
- private static final Logger LOG = LoggerFactory.getLogger(SourceInputFormat.class);
-
- private final BoundedSource<T> initialSource;
-
- private transient PipelineOptions options;
- private final SerializedPipelineOptions serializedOptions;
-
- private transient BoundedSource.BoundedReader<T> reader;
- private boolean inputAvailable = false;
-
- public SourceInputFormat(BoundedSource<T> initialSource, PipelineOptions options) {
- this.initialSource = initialSource;
- this.serializedOptions = new SerializedPipelineOptions(options);
- }
-
- @Override
- public void configure(Configuration configuration) {
- options = serializedOptions.getPipelineOptions();
- }
-
- @Override
- public void open(SourceInputSplit<T> sourceInputSplit) throws IOException {
- reader = ((BoundedSource<T>) sourceInputSplit.getSource()).createReader(options);
- inputAvailable = reader.start();
- }
-
- @Override
- public BaseStatistics getStatistics(BaseStatistics baseStatistics) throws IOException {
- try {
- final long estimatedSize = initialSource.getEstimatedSizeBytes(options);
-
- return new BaseStatistics() {
- @Override
- public long getTotalInputSize() {
- return estimatedSize;
- }
-
- @Override
- public long getNumberOfRecords() {
- return BaseStatistics.NUM_RECORDS_UNKNOWN;
- }
-
- @Override
- public float getAverageRecordWidth() {
- return BaseStatistics.AVG_RECORD_BYTES_UNKNOWN;
- }
- };
- } catch (Exception e) {
- LOG.warn("Could not read Source statistics: {}", e);
- }
-
- return null;
- }
-
- @Override
- @SuppressWarnings("unchecked")
- public SourceInputSplit<T>[] createInputSplits(int numSplits) throws IOException {
- try {
- long desiredSizeBytes = initialSource.getEstimatedSizeBytes(options) / numSplits;
- List<? extends Source<T>> shards =
- initialSource.split(desiredSizeBytes, options);
- int numShards = shards.size();
- SourceInputSplit<T>[] sourceInputSplits = new SourceInputSplit[numShards];
- for (int i = 0; i < numShards; i++) {
- sourceInputSplits[i] = new SourceInputSplit<>(shards.get(i), i);
- }
- return sourceInputSplits;
- } catch (Exception e) {
- throw new IOException("Could not create input splits from Source.", e);
- }
- }
-
- @Override
- public InputSplitAssigner getInputSplitAssigner(final SourceInputSplit[] sourceInputSplits) {
- return new DefaultInputSplitAssigner(sourceInputSplits);
- }
-
-
- @Override
- public boolean reachedEnd() throws IOException {
- return !inputAvailable;
- }
-
- @Override
- public WindowedValue<T> nextRecord(WindowedValue<T> t) throws IOException {
- if (inputAvailable) {
- final T current = reader.getCurrent();
- final Instant timestamp = reader.getCurrentTimestamp();
- // advance reader to have a record ready next time
- inputAvailable = reader.advance();
- return WindowedValue.of(
- current,
- timestamp,
- GlobalWindow.INSTANCE, PaneInfo.NO_FIRING);
- }
-
- return null;
- }
-
- @Override
- public void close() throws IOException {
- // TODO null check can be removed once FLINK-3796 is fixed
- if (reader != null) {
- reader.close();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SourceInputSplit.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SourceInputSplit.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SourceInputSplit.java
deleted file mode 100644
index e4a7386..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SourceInputSplit.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers;
-
-import org.apache.beam.sdk.io.Source;
-import org.apache.flink.core.io.InputSplit;
-
-/**
- * {@link org.apache.flink.core.io.InputSplit} for
- * {@link org.apache.beam.runners.flink.translation.wrappers.SourceInputFormat}. We pass
- * the sharded Source around in the input split because Sources simply split up into several
- * Sources for sharding. This is different to how Flink creates a separate InputSplit from
- * an InputFormat.
- */
-public class SourceInputSplit<T> implements InputSplit {
-
- private Source<T> source;
- private int splitNumber;
-
- public SourceInputSplit() {
- }
-
- public SourceInputSplit(Source<T> source, int splitNumber) {
- this.source = source;
- this.splitNumber = splitNumber;
- }
-
- @Override
- public int getSplitNumber() {
- return splitNumber;
- }
-
- public Source<T> getSource() {
- return source;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/package-info.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/package-info.java
deleted file mode 100644
index 72f7deb..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Internal implementation of the Beam runner for Apache Flink.
- */
-package org.apache.beam.runners.flink.translation.wrappers;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java
deleted file mode 100644
index 8a09286..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java
+++ /dev/null
@@ -1,774 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming;
-
-import static org.apache.flink.util.Preconditions.checkArgument;
-
-import com.google.common.base.Optional;
-import com.google.common.collect.Iterables;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.Map;
-import javax.annotation.Nullable;
-import org.apache.beam.runners.core.AggregatorFactory;
-import org.apache.beam.runners.core.DoFnRunner;
-import org.apache.beam.runners.core.DoFnRunners;
-import org.apache.beam.runners.core.ExecutionContext;
-import org.apache.beam.runners.core.GroupAlsoByWindowViaWindowSetNewDoFn;
-import org.apache.beam.runners.core.PushbackSideInputDoFnRunner;
-import org.apache.beam.runners.core.SideInputHandler;
-import org.apache.beam.runners.core.SimplePushbackSideInputDoFnRunner;
-import org.apache.beam.runners.core.StateInternals;
-import org.apache.beam.runners.core.StateNamespace;
-import org.apache.beam.runners.core.StateNamespaces;
-import org.apache.beam.runners.core.StateNamespaces.WindowNamespace;
-import org.apache.beam.runners.core.StateTag;
-import org.apache.beam.runners.core.StateTags;
-import org.apache.beam.runners.core.StatefulDoFnRunner;
-import org.apache.beam.runners.core.TimerInternals;
-import org.apache.beam.runners.core.TimerInternals.TimerData;
-import org.apache.beam.runners.flink.translation.types.CoderTypeSerializer;
-import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
-import org.apache.beam.runners.flink.translation.wrappers.SerializableFnAggregatorWrapper;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkBroadcastStateInternals;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkKeyGroupStateInternals;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkSplitStateInternals;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkStateInternals;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.state.KeyGroupCheckpointedOperator;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.transforms.Aggregator;
-import org.apache.beam.sdk.transforms.Combine;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.join.RawUnionValue;
-import org.apache.beam.sdk.transforms.reflect.DoFnInvoker;
-import org.apache.beam.sdk.transforms.reflect.DoFnInvokers;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.util.NullSideInputReader;
-import org.apache.beam.sdk.util.SideInputReader;
-import org.apache.beam.sdk.util.TimeDomain;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.util.WindowingStrategy;
-import org.apache.beam.sdk.util.state.BagState;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.beam.sdk.values.TupleTag;
-import org.apache.flink.core.memory.DataInputViewStreamWrapper;
-import org.apache.flink.core.memory.DataOutputViewStreamWrapper;
-import org.apache.flink.runtime.state.KeyGroupStatePartitionStreamProvider;
-import org.apache.flink.runtime.state.KeyGroupsList;
-import org.apache.flink.runtime.state.KeyedStateBackend;
-import org.apache.flink.runtime.state.KeyedStateCheckpointOutputStream;
-import org.apache.flink.runtime.state.StateInitializationContext;
-import org.apache.flink.runtime.state.StateSnapshotContext;
-import org.apache.flink.streaming.api.operators.AbstractStreamOperator;
-import org.apache.flink.streaming.api.operators.ChainingStrategy;
-import org.apache.flink.streaming.api.operators.HeapInternalTimerService;
-import org.apache.flink.streaming.api.operators.InternalTimer;
-import org.apache.flink.streaming.api.operators.OneInputStreamOperator;
-import org.apache.flink.streaming.api.operators.Output;
-import org.apache.flink.streaming.api.operators.Triggerable;
-import org.apache.flink.streaming.api.operators.TwoInputStreamOperator;
-import org.apache.flink.streaming.api.watermark.Watermark;
-import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
-import org.joda.time.Instant;
-
-/**
- * Flink operator for executing {@link DoFn DoFns}.
- *
- * @param <InputT> the input type of the {@link DoFn}
- * @param <FnOutputT> the output type of the {@link DoFn}
- * @param <OutputT> the output type of the operator; this can be different from the fn output
- * type when we have additional tagged outputs
- */
-public class DoFnOperator<InputT, FnOutputT, OutputT>
- extends AbstractStreamOperator<OutputT>
- implements OneInputStreamOperator<WindowedValue<InputT>, OutputT>,
- TwoInputStreamOperator<WindowedValue<InputT>, RawUnionValue, OutputT>,
- KeyGroupCheckpointedOperator, Triggerable<Object, TimerData> {
-
- protected DoFn<InputT, FnOutputT> doFn;
-
- protected final SerializedPipelineOptions serializedOptions;
-
- protected final TupleTag<FnOutputT> mainOutputTag;
- protected final List<TupleTag<?>> additionalOutputTags;
-
- protected final Collection<PCollectionView<?>> sideInputs;
- protected final Map<Integer, PCollectionView<?>> sideInputTagMapping;
-
- protected final WindowingStrategy<?, ?> windowingStrategy;
-
- protected final OutputManagerFactory<OutputT> outputManagerFactory;
-
- protected transient DoFnRunner<InputT, FnOutputT> doFnRunner;
- protected transient PushbackSideInputDoFnRunner<InputT, FnOutputT> pushbackDoFnRunner;
-
- protected transient SideInputHandler sideInputHandler;
-
- protected transient SideInputReader sideInputReader;
-
- protected transient DoFnRunners.OutputManager outputManager;
-
- private transient DoFnInvoker<InputT, FnOutputT> doFnInvoker;
-
- protected transient long currentInputWatermark;
-
- protected transient long currentOutputWatermark;
-
- private transient StateTag<Object, BagState<WindowedValue<InputT>>> pushedBackTag;
-
- protected transient FlinkStateInternals<?> stateInternals;
-
- private Coder<WindowedValue<InputT>> inputCoder;
-
- private final Coder<?> keyCoder;
-
- private final TimerInternals.TimerDataCoder timerCoder;
-
- protected transient HeapInternalTimerService<?, TimerInternals.TimerData> timerService;
-
- protected transient FlinkTimerInternals timerInternals;
-
- private transient StateInternals<?> pushbackStateInternals;
-
- private transient Optional<Long> pushedBackWatermark;
-
- public DoFnOperator(
- DoFn<InputT, FnOutputT> doFn,
- Coder<WindowedValue<InputT>> inputCoder,
- TupleTag<FnOutputT> mainOutputTag,
- List<TupleTag<?>> additionalOutputTags,
- OutputManagerFactory<OutputT> outputManagerFactory,
- WindowingStrategy<?, ?> windowingStrategy,
- Map<Integer, PCollectionView<?>> sideInputTagMapping,
- Collection<PCollectionView<?>> sideInputs,
- PipelineOptions options,
- Coder<?> keyCoder) {
- this.doFn = doFn;
- this.inputCoder = inputCoder;
- this.mainOutputTag = mainOutputTag;
- this.additionalOutputTags = additionalOutputTags;
- this.sideInputTagMapping = sideInputTagMapping;
- this.sideInputs = sideInputs;
- this.serializedOptions = new SerializedPipelineOptions(options);
- this.windowingStrategy = windowingStrategy;
- this.outputManagerFactory = outputManagerFactory;
-
- setChainingStrategy(ChainingStrategy.ALWAYS);
-
- this.keyCoder = keyCoder;
-
- this.timerCoder =
- TimerInternals.TimerDataCoder.of(windowingStrategy.getWindowFn().windowCoder());
- }
-
- private ExecutionContext.StepContext createStepContext() {
- return new StepContext();
- }
-
- // allow overriding this in WindowDoFnOperator because this one dynamically creates
- // the DoFn
- protected DoFn<InputT, FnOutputT> getDoFn() {
- return doFn;
- }
-
- @Override
- public void open() throws Exception {
- super.open();
-
- currentInputWatermark = Long.MIN_VALUE;
- currentOutputWatermark = Long.MIN_VALUE;
-
- AggregatorFactory aggregatorFactory = new AggregatorFactory() {
- @Override
- public <InputT, AccumT, OutputT> Aggregator<InputT, OutputT> createAggregatorForDoFn(
- Class<?> fnClass,
- ExecutionContext.StepContext stepContext,
- String aggregatorName,
- Combine.CombineFn<InputT, AccumT, OutputT> combine) {
-
- @SuppressWarnings("unchecked")
- SerializableFnAggregatorWrapper<InputT, OutputT> result =
- (SerializableFnAggregatorWrapper<InputT, OutputT>)
- getRuntimeContext().getAccumulator(aggregatorName);
-
- if (result == null) {
- result = new SerializableFnAggregatorWrapper<>(combine);
- getRuntimeContext().addAccumulator(aggregatorName, result);
- }
- return result;
- }
- };
-
- sideInputReader = NullSideInputReader.of(sideInputs);
-
- if (!sideInputs.isEmpty()) {
-
- pushedBackTag = StateTags.bag("pushed-back-values", inputCoder);
-
- FlinkBroadcastStateInternals sideInputStateInternals =
- new FlinkBroadcastStateInternals<>(
- getContainingTask().getIndexInSubtaskGroup(), getOperatorStateBackend());
-
- sideInputHandler = new SideInputHandler(sideInputs, sideInputStateInternals);
- sideInputReader = sideInputHandler;
-
- // maybe init by initializeState
- if (pushbackStateInternals == null) {
- if (keyCoder != null) {
- pushbackStateInternals = new FlinkKeyGroupStateInternals<>(keyCoder,
- getKeyedStateBackend());
- } else {
- pushbackStateInternals =
- new FlinkSplitStateInternals<Object>(getOperatorStateBackend());
- }
- }
-
- pushedBackWatermark = Optional.absent();
-
- }
-
- outputManager = outputManagerFactory.create(output);
-
- // StatefulParDo or WindowDoFn
- if (keyCoder != null) {
- stateInternals = new FlinkStateInternals<>((KeyedStateBackend) getKeyedStateBackend(),
- keyCoder);
-
- timerService = (HeapInternalTimerService<?, TimerInternals.TimerData>)
- getInternalTimerService("beam-timer", new CoderTypeSerializer<>(timerCoder), this);
-
- timerInternals = new FlinkTimerInternals();
-
- }
-
- // WindowDoFnOperator needs state and timers to construct its DoFn,
- // so this must wait until StateInternals and TimerInternals are ready.
- this.doFn = getDoFn();
- doFnInvoker = DoFnInvokers.invokerFor(doFn);
-
- doFnInvoker.invokeSetup();
-
- ExecutionContext.StepContext stepContext = createStepContext();
-
- doFnRunner = DoFnRunners.simpleRunner(
- serializedOptions.getPipelineOptions(),
- doFn,
- sideInputReader,
- outputManager,
- mainOutputTag,
- additionalOutputTags,
- stepContext,
- aggregatorFactory,
- windowingStrategy);
-
- if (doFn instanceof GroupAlsoByWindowViaWindowSetNewDoFn) {
- // When the doFn is this, we know it came from WindowDoFnOperator and
- // InputT = KeyedWorkItem<K, V>
- // OutputT = KV<K, V>
- //
- // for some K, V
-
-
- doFnRunner = DoFnRunners.lateDataDroppingRunner(
- (DoFnRunner) doFnRunner,
- stepContext,
- windowingStrategy,
- ((GroupAlsoByWindowViaWindowSetNewDoFn) doFn).getDroppedDueToLatenessAggregator());
- } else if (keyCoder != null) {
- // It is a stateful DoFn
-
- StatefulDoFnRunner.CleanupTimer cleanupTimer =
- new StatefulDoFnRunner.TimeInternalsCleanupTimer(
- stepContext.timerInternals(), windowingStrategy);
-
- // we don't know the window type
- @SuppressWarnings({"unchecked", "rawtypes"})
- Coder windowCoder = windowingStrategy.getWindowFn().windowCoder();
-
- @SuppressWarnings({"unchecked", "rawtypes"})
- StatefulDoFnRunner.StateCleaner<?> stateCleaner =
- new StatefulDoFnRunner.StateInternalsStateCleaner<>(
- doFn, stepContext.stateInternals(), windowCoder);
-
- doFnRunner = DoFnRunners.defaultStatefulDoFnRunner(
- doFn,
- doFnRunner,
- stepContext,
- aggregatorFactory,
- windowingStrategy,
- cleanupTimer,
- stateCleaner);
- }
-
- pushbackDoFnRunner =
- SimplePushbackSideInputDoFnRunner.create(doFnRunner, sideInputs, sideInputHandler);
- }
-
- @Override
- public void close() throws Exception {
- super.close();
- doFnInvoker.invokeTeardown();
- }
-
- protected final long getPushbackWatermarkHold() {
- // if we don't have side inputs we never hold the watermark
- if (sideInputs.isEmpty()) {
- return BoundedWindow.TIMESTAMP_MAX_VALUE.getMillis();
- }
-
- try {
- checkInitPushedBackWatermark();
- return pushedBackWatermark.get();
- } catch (Exception e) {
- throw new RuntimeException("Error retrieving pushed back watermark state.", e);
- }
- }
-
- private void checkInitPushedBackWatermark() {
- // init and restore from pushedBack state.
- // Not done in initializeState, because OperatorState is not ready.
- if (!pushedBackWatermark.isPresent()) {
-
- BagState<WindowedValue<InputT>> pushedBack =
- pushbackStateInternals.state(StateNamespaces.global(), pushedBackTag);
-
- long min = BoundedWindow.TIMESTAMP_MAX_VALUE.getMillis();
- for (WindowedValue<InputT> value : pushedBack.read()) {
- min = Math.min(min, value.getTimestamp().getMillis());
- }
- setPushedBackWatermark(min);
- }
- }
-
- @Override
- public final void processElement(
- StreamRecord<WindowedValue<InputT>> streamRecord) throws Exception {
- doFnRunner.startBundle();
- doFnRunner.processElement(streamRecord.getValue());
- doFnRunner.finishBundle();
- }
-
- private void setPushedBackWatermark(long watermark) {
- pushedBackWatermark = Optional.fromNullable(watermark);
- }
-
- @Override
- public final void processElement1(
- StreamRecord<WindowedValue<InputT>> streamRecord) throws Exception {
- pushbackDoFnRunner.startBundle();
- Iterable<WindowedValue<InputT>> justPushedBack =
- pushbackDoFnRunner.processElementInReadyWindows(streamRecord.getValue());
-
- BagState<WindowedValue<InputT>> pushedBack =
- pushbackStateInternals.state(StateNamespaces.global(), pushedBackTag);
-
- checkInitPushedBackWatermark();
-
- long min = pushedBackWatermark.get();
- for (WindowedValue<InputT> pushedBackValue : justPushedBack) {
- min = Math.min(min, pushedBackValue.getTimestamp().getMillis());
- pushedBack.add(pushedBackValue);
- }
- setPushedBackWatermark(min);
- pushbackDoFnRunner.finishBundle();
- }
-
- @Override
- public final void processElement2(
- StreamRecord<RawUnionValue> streamRecord) throws Exception {
- pushbackDoFnRunner.startBundle();
-
- @SuppressWarnings("unchecked")
- WindowedValue<Iterable<?>> value =
- (WindowedValue<Iterable<?>>) streamRecord.getValue().getValue();
-
- PCollectionView<?> sideInput = sideInputTagMapping.get(streamRecord.getValue().getUnionTag());
- sideInputHandler.addSideInputValue(sideInput, value);
-
- BagState<WindowedValue<InputT>> pushedBack =
- pushbackStateInternals.state(StateNamespaces.global(), pushedBackTag);
-
- List<WindowedValue<InputT>> newPushedBack = new ArrayList<>();
-
- Iterable<WindowedValue<InputT>> pushedBackContents = pushedBack.read();
- if (pushedBackContents != null) {
- for (WindowedValue<InputT> elem : pushedBackContents) {
-
- // we need to set the correct key in case the operator is
- // a (keyed) window operator
- setKeyContextElement1(new StreamRecord<>(elem));
-
- Iterable<WindowedValue<InputT>> justPushedBack =
- pushbackDoFnRunner.processElementInReadyWindows(elem);
- Iterables.addAll(newPushedBack, justPushedBack);
- }
- }
-
- pushedBack.clear();
- long min = BoundedWindow.TIMESTAMP_MAX_VALUE.getMillis();
- for (WindowedValue<InputT> pushedBackValue : newPushedBack) {
- min = Math.min(min, pushedBackValue.getTimestamp().getMillis());
- pushedBack.add(pushedBackValue);
- }
- setPushedBackWatermark(min);
-
- pushbackDoFnRunner.finishBundle();
-
- // maybe output a new watermark
- processWatermark1(new Watermark(currentInputWatermark));
- }
-
- @Override
- public void processWatermark(Watermark mark) throws Exception {
- processWatermark1(mark);
- }
-
- @Override
- public void processWatermark1(Watermark mark) throws Exception {
- if (keyCoder == null) {
- this.currentInputWatermark = mark.getTimestamp();
- long potentialOutputWatermark =
- Math.min(getPushbackWatermarkHold(), currentInputWatermark);
- if (potentialOutputWatermark > currentOutputWatermark) {
- currentOutputWatermark = potentialOutputWatermark;
- output.emitWatermark(new Watermark(currentOutputWatermark));
- }
- } else {
- // fireTimers, so we need startBundle.
- pushbackDoFnRunner.startBundle();
-
- this.currentInputWatermark = mark.getTimestamp();
-
- // hold back by the pushed back values waiting for side inputs
- long actualInputWatermark = Math.min(getPushbackWatermarkHold(), mark.getTimestamp());
-
- timerService.advanceWatermark(actualInputWatermark);
-
- Instant watermarkHold = stateInternals.watermarkHold();
-
- long combinedWatermarkHold = Math.min(watermarkHold.getMillis(), getPushbackWatermarkHold());
-
- long potentialOutputWatermark = Math.min(currentInputWatermark, combinedWatermarkHold);
-
- if (potentialOutputWatermark > currentOutputWatermark) {
- currentOutputWatermark = potentialOutputWatermark;
- output.emitWatermark(new Watermark(currentOutputWatermark));
- }
- pushbackDoFnRunner.finishBundle();
- }
- }
-
- @Override
- public void processWatermark2(Watermark mark) throws Exception {
- // ignore watermarks from the side-input input
- }
-
- @Override
- public void snapshotState(StateSnapshotContext context) throws Exception {
- // copy from AbstractStreamOperator
- if (getKeyedStateBackend() != null) {
- KeyedStateCheckpointOutputStream out;
-
- try {
- out = context.getRawKeyedOperatorStateOutput();
- } catch (Exception exception) {
- throw new Exception("Could not open raw keyed operator state stream for "
- + getOperatorName() + '.', exception);
- }
-
- try {
- KeyGroupsList allKeyGroups = out.getKeyGroupList();
- for (int keyGroupIdx : allKeyGroups) {
- out.startNewKeyGroup(keyGroupIdx);
-
- DataOutputViewStreamWrapper dov = new DataOutputViewStreamWrapper(out);
-
- // if (this instanceof KeyGroupCheckpointedOperator)
- snapshotKeyGroupState(keyGroupIdx, dov);
-
- // We can't get hold of all timer services, so we only snapshot our own.
- // This may be a plain DoFn that has no timer service at all.
- if (keyCoder != null) {
- timerService.snapshotTimersForKeyGroup(dov, keyGroupIdx);
- }
-
- }
- } catch (Exception exception) {
- throw new Exception("Could not write timer service of " + getOperatorName()
- + " to checkpoint state stream.", exception);
- } finally {
- try {
- out.close();
- } catch (Exception closeException) {
- LOG.warn("Could not close raw keyed operator state stream for {}. This "
- + "might have prevented deleting some state data.", getOperatorName(),
- closeException);
- }
- }
- }
- }
-
- @Override
- public void snapshotKeyGroupState(int keyGroupIndex, DataOutputStream out) throws Exception {
- if (!sideInputs.isEmpty() && keyCoder != null) {
- ((FlinkKeyGroupStateInternals) pushbackStateInternals).snapshotKeyGroupState(
- keyGroupIndex, out);
- }
- }
-
- @Override
- public void initializeState(StateInitializationContext context) throws Exception {
- if (getKeyedStateBackend() != null) {
- int totalKeyGroups = getKeyedStateBackend().getNumberOfKeyGroups();
- KeyGroupsList localKeyGroupRange = getKeyedStateBackend().getKeyGroupRange();
-
- for (KeyGroupStatePartitionStreamProvider streamProvider : context.getRawKeyedStateInputs()) {
- DataInputViewStreamWrapper div = new DataInputViewStreamWrapper(streamProvider.getStream());
-
- int keyGroupIdx = streamProvider.getKeyGroupId();
- checkArgument(localKeyGroupRange.contains(keyGroupIdx),
- "Key Group " + keyGroupIdx + " does not belong to the local range.");
-
- // if (this instanceof KeyGroupRestoringOperator)
- restoreKeyGroupState(keyGroupIdx, div);
-
- // We just initialize our timerService
- if (keyCoder != null) {
- if (timerService == null) {
- timerService = new HeapInternalTimerService<>(
- totalKeyGroups,
- localKeyGroupRange,
- this,
- getRuntimeContext().getProcessingTimeService());
- }
- timerService.restoreTimersForKeyGroup(div, keyGroupIdx, getUserCodeClassloader());
- }
- }
- }
- }
-
- @Override
- public void restoreKeyGroupState(int keyGroupIndex, DataInputStream in) throws Exception {
- if (!sideInputs.isEmpty() && keyCoder != null) {
- if (pushbackStateInternals == null) {
- pushbackStateInternals = new FlinkKeyGroupStateInternals<>(keyCoder,
- getKeyedStateBackend());
- }
- ((FlinkKeyGroupStateInternals) pushbackStateInternals)
- .restoreKeyGroupState(keyGroupIndex, in, getUserCodeClassloader());
- }
- }
-
- @Override
- public void onEventTime(InternalTimer<Object, TimerData> timer) throws Exception {
- fireTimer(timer);
- }
-
- @Override
- public void onProcessingTime(InternalTimer<Object, TimerData> timer) throws Exception {
- fireTimer(timer);
- }
-
- // allow overriding this in WindowDoFnOperator
- public void fireTimer(InternalTimer<?, TimerData> timer) {
- TimerInternals.TimerData timerData = timer.getNamespace();
- StateNamespace namespace = timerData.getNamespace();
- // This is a user timer, so namespace must be WindowNamespace
- checkArgument(namespace instanceof WindowNamespace);
- BoundedWindow window = ((WindowNamespace) namespace).getWindow();
- pushbackDoFnRunner.onTimer(timerData.getTimerId(), window,
- timerData.getTimestamp(), timerData.getDomain());
- }
-
- /**
- * Factory for creating an {@link DoFnRunners.OutputManager} from
- * a Flink {@link Output}.
- */
- interface OutputManagerFactory<OutputT> extends Serializable {
- DoFnRunners.OutputManager create(Output<StreamRecord<OutputT>> output);
- }
-
- /**
- * Default implementation of {@link OutputManagerFactory} that creates an
- * {@link DoFnRunners.OutputManager} that only writes to
- * a single logical output.
- */
- public static class DefaultOutputManagerFactory<OutputT>
- implements OutputManagerFactory<OutputT> {
- @Override
- public DoFnRunners.OutputManager create(final Output<StreamRecord<OutputT>> output) {
- return new DoFnRunners.OutputManager() {
- @Override
- public <T> void output(TupleTag<T> tag, WindowedValue<T> value) {
- // with tagged outputs we can't get around this because we don't
- // know our own output type...
- @SuppressWarnings("unchecked")
- OutputT castValue = (OutputT) value;
- output.collect(new StreamRecord<>(castValue));
- }
- };
- }
- }
-
- /**
- * Implementation of {@link OutputManagerFactory} that creates an
- * {@link DoFnRunners.OutputManager} that can write to multiple logical
- * outputs by unioning them in a {@link RawUnionValue}.
- */
- public static class MultiOutputOutputManagerFactory
- implements OutputManagerFactory<RawUnionValue> {
-
- Map<TupleTag<?>, Integer> mapping;
-
- public MultiOutputOutputManagerFactory(Map<TupleTag<?>, Integer> mapping) {
- this.mapping = mapping;
- }
-
- @Override
- public DoFnRunners.OutputManager create(final Output<StreamRecord<RawUnionValue>> output) {
- return new DoFnRunners.OutputManager() {
- @Override
- public <T> void output(TupleTag<T> tag, WindowedValue<T> value) {
- int intTag = mapping.get(tag);
- output.collect(new StreamRecord<>(new RawUnionValue(intTag, value)));
- }
- };
- }
- }
-
- /**
- * {@link StepContext} for running {@link DoFn DoFns} on Flink. This does not allow
- * accessing state or timer internals.
- */
- protected class StepContext implements ExecutionContext.StepContext {
-
- @Override
- public String getStepName() {
- return null;
- }
-
- @Override
- public String getTransformName() {
- return null;
- }
-
- @Override
- public void noteOutput(WindowedValue<?> output) {}
-
- @Override
- public void noteOutput(TupleTag<?> tag, WindowedValue<?> output) {}
-
- @Override
- public <T, W extends BoundedWindow> void writePCollectionViewData(
- TupleTag<?> tag,
- Iterable<WindowedValue<T>> data,
- Coder<Iterable<WindowedValue<T>>> dataCoder,
- W window,
- Coder<W> windowCoder) throws IOException {
- throw new UnsupportedOperationException("Writing side-input data is not supported.");
- }
-
- @Override
- public StateInternals<?> stateInternals() {
- return stateInternals;
- }
-
- @Override
- public TimerInternals timerInternals() {
- return timerInternals;
- }
- }
-
- private class FlinkTimerInternals implements TimerInternals {
-
- @Override
- public void setTimer(
- StateNamespace namespace, String timerId, Instant target, TimeDomain timeDomain) {
- setTimer(TimerData.of(timerId, namespace, target, timeDomain));
- }
-
- @Deprecated
- @Override
- public void setTimer(TimerData timerKey) {
- long time = timerKey.getTimestamp().getMillis();
- if (timerKey.getDomain().equals(TimeDomain.EVENT_TIME)) {
- timerService.registerEventTimeTimer(timerKey, time);
- } else if (timerKey.getDomain().equals(TimeDomain.PROCESSING_TIME)) {
- timerService.registerProcessingTimeTimer(timerKey, time);
- } else {
- throw new UnsupportedOperationException(
- "Unsupported time domain: " + timerKey.getDomain());
- }
- }
-
- @Deprecated
- @Override
- public void deleteTimer(StateNamespace namespace, String timerId) {
- throw new UnsupportedOperationException(
- "Canceling of a timer by ID is not yet supported.");
- }
-
- @Override
- public void deleteTimer(StateNamespace namespace, String timerId, TimeDomain timeDomain) {
- throw new UnsupportedOperationException(
- "Canceling of a timer by ID is not yet supported.");
- }
-
- @Deprecated
- @Override
- public void deleteTimer(TimerData timerKey) {
- long time = timerKey.getTimestamp().getMillis();
- if (timerKey.getDomain().equals(TimeDomain.EVENT_TIME)) {
- timerService.deleteEventTimeTimer(timerKey, time);
- } else if (timerKey.getDomain().equals(TimeDomain.PROCESSING_TIME)) {
- timerService.deleteProcessingTimeTimer(timerKey, time);
- } else {
- throw new UnsupportedOperationException(
- "Unsupported time domain: " + timerKey.getDomain());
- }
- }
-
- @Override
- public Instant currentProcessingTime() {
- return new Instant(timerService.currentProcessingTime());
- }
-
- @Nullable
- @Override
- public Instant currentSynchronizedProcessingTime() {
- return new Instant(timerService.currentProcessingTime());
- }
-
- @Override
- public Instant currentInputWatermarkTime() {
- return new Instant(Math.min(currentInputWatermark, getPushbackWatermarkHold()));
- }
-
- @Nullable
- @Override
- public Instant currentOutputWatermarkTime() {
- return new Instant(currentOutputWatermark);
- }
- }
-}
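The pushback bookkeeping above always keeps the watermark hold equal to the minimum timestamp across elements pushed back while their side inputs are not yet ready, falling back to TIMESTAMP_MAX_VALUE when nothing is pushed back. A condensed, self-contained sketch of that computation (not part of the commit, using a plain list instead of BagState):

import java.util.List;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.util.WindowedValue;

class PushbackHoldSketch {
  // Minimum timestamp over pushed-back elements, or TIMESTAMP_MAX_VALUE if none:
  // this is the value the operator uses to hold back the output watermark.
  static <T> long pushbackWatermarkHold(List<WindowedValue<T>> pushedBack) {
    long min = BoundedWindow.TIMESTAMP_MAX_VALUE.getMillis();
    for (WindowedValue<T> value : pushedBack) {
      min = Math.min(min, value.getTimestamp().getMillis());
    }
    return min;
  }
}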
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/KvToByteBufferKeySelector.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/KvToByteBufferKeySelector.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/KvToByteBufferKeySelector.java
deleted file mode 100644
index dce2e68..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/KvToByteBufferKeySelector.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming;
-
-import java.nio.ByteBuffer;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.util.CoderUtils;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.values.KV;
-import org.apache.flink.api.common.typeinfo.TypeInformation;
-import org.apache.flink.api.java.functions.KeySelector;
-import org.apache.flink.api.java.typeutils.GenericTypeInfo;
-import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
-
-/**
- * {@link KeySelector} that retrieves a key from a {@link KV}. This will return
- * the key as encoded by the provided {@link Coder} in a {@link ByteBuffer}. This ensures
- * that all key comparisons/hashing happen on the encoded form.
- */
-public class KvToByteBufferKeySelector<K, V>
- implements KeySelector<WindowedValue<KV<K, V>>, ByteBuffer>,
- ResultTypeQueryable<ByteBuffer> {
-
- private final Coder<K> keyCoder;
-
- public KvToByteBufferKeySelector(Coder<K> keyCoder) {
- this.keyCoder = keyCoder;
- }
-
- @Override
- public ByteBuffer getKey(WindowedValue<KV<K, V>> value) throws Exception {
- K key = value.getValue().getKey();
- byte[] keyBytes = CoderUtils.encodeToByteArray(keyCoder, key);
- return ByteBuffer.wrap(keyBytes);
- }
-
- @Override
- public TypeInformation<ByteBuffer> getProducedType() {
- return new GenericTypeInfo<>(ByteBuffer.class);
- }
-}
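As a usage sketch (not part of the commit), the selector is typically handed to Flink's keyBy so that grouping happens on the coder-encoded key bytes; the stream element type and key coder below are assumptions:

import java.nio.ByteBuffer;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.values.KV;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.KeyedStream;

class KeyByEncodedKeyExample {
  // Group a stream of windowed KVs by the encoded String key, so hashing and
  // comparison happen on bytes rather than on Java object equality.
  static KeyedStream<WindowedValue<KV<String, Long>>, ByteBuffer> keyByEncodedKey(
      DataStream<WindowedValue<KV<String, Long>>> input) {
    return input.keyBy(new KvToByteBufferKeySelector<String, Long>(StringUtf8Coder.of()));
  }
}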
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItem.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItem.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItem.java
deleted file mode 100644
index e843660..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItem.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming;
-
-import java.util.Collections;
-import org.apache.beam.runners.core.KeyedWorkItem;
-import org.apache.beam.runners.core.TimerInternals;
-import org.apache.beam.sdk.util.WindowedValue;
-
-/**
- * Singleton keyed work item.
- */
-public class SingletonKeyedWorkItem<K, ElemT> implements KeyedWorkItem<K, ElemT> {
-
- final K key;
- final WindowedValue<ElemT> value;
-
- public SingletonKeyedWorkItem(K key, WindowedValue<ElemT> value) {
- this.key = key;
- this.value = value;
- }
-
- @Override
- public K key() {
- return key;
- }
-
- public WindowedValue<ElemT> value() {
- return value;
- }
-
- @Override
- public Iterable<TimerInternals.TimerData> timersIterable() {
- return Collections.EMPTY_LIST;
- }
-
- @Override
- public Iterable<WindowedValue<ElemT>> elementsIterable() {
- return Collections.singletonList(value);
- }
-}
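For illustration (not part of the commit), wrapping a single element for a given key is a one-liner; the key and value below are assumptions:

import org.apache.beam.sdk.util.WindowedValue;

class SingletonWorkItemExample {
  // One value for key "user-1" becomes a one-element work item: elementsIterable()
  // yields exactly that value and timersIterable() is empty.
  static SingletonKeyedWorkItem<String, Long> wrap() {
    WindowedValue<Long> value = WindowedValue.valueInGlobalWindow(42L);
    return new SingletonKeyedWorkItem<>("user-1", value);
  }
}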
[03/50] [abbrv] beam git commit: Minor cleanups in ParDoEvaluator
Posted by dh...@apache.org.
Minor cleanups in ParDoEvaluator
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/1cc16b0d
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/1cc16b0d
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/1cc16b0d
Branch: refs/heads/DSL_SQL
Commit: 1cc16b0d6cea7b01b01427758eaf427cc29635b6
Parents: 3fd8890
Author: Eugene Kirpichov <ki...@google.com>
Authored: Mon Apr 17 12:25:02 2017 -0700
Committer: Eugene Kirpichov <ki...@google.com>
Committed: Tue Apr 18 18:02:06 2017 -0700
----------------------------------------------------------------------
...oFnLifecycleManagerRemovingTransformEvaluator.java | 6 +++---
.../apache/beam/runners/direct/ParDoEvaluator.java | 14 +++++---------
.../beam/runners/direct/ParDoEvaluatorFactory.java | 2 +-
.../SplittableProcessElementsEvaluatorFactory.java | 2 +-
...ifecycleManagerRemovingTransformEvaluatorTest.java | 8 ++++----
.../beam/runners/direct/ParDoEvaluatorTest.java | 4 ++--
6 files changed, 16 insertions(+), 20 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/1cc16b0d/runners/direct-java/src/main/java/org/apache/beam/runners/direct/DoFnLifecycleManagerRemovingTransformEvaluator.java
----------------------------------------------------------------------
diff --git a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/DoFnLifecycleManagerRemovingTransformEvaluator.java b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/DoFnLifecycleManagerRemovingTransformEvaluator.java
index 9bcd569..e537962 100644
--- a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/DoFnLifecycleManagerRemovingTransformEvaluator.java
+++ b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/DoFnLifecycleManagerRemovingTransformEvaluator.java
@@ -31,16 +31,16 @@ import org.slf4j.LoggerFactory;
class DoFnLifecycleManagerRemovingTransformEvaluator<InputT> implements TransformEvaluator<InputT> {
private static final Logger LOG =
LoggerFactory.getLogger(DoFnLifecycleManagerRemovingTransformEvaluator.class);
- private final ParDoEvaluator<InputT, ?> underlying;
+ private final ParDoEvaluator<InputT> underlying;
private final DoFnLifecycleManager lifecycleManager;
public static <InputT> DoFnLifecycleManagerRemovingTransformEvaluator<InputT> wrapping(
- ParDoEvaluator<InputT, ?> underlying, DoFnLifecycleManager lifecycleManager) {
+ ParDoEvaluator<InputT> underlying, DoFnLifecycleManager lifecycleManager) {
return new DoFnLifecycleManagerRemovingTransformEvaluator<>(underlying, lifecycleManager);
}
private DoFnLifecycleManagerRemovingTransformEvaluator(
- ParDoEvaluator<InputT, ?> underlying, DoFnLifecycleManager lifecycleManager) {
+ ParDoEvaluator<InputT> underlying, DoFnLifecycleManager lifecycleManager) {
this.underlying = underlying;
this.lifecycleManager = lifecycleManager;
}
http://git-wip-us.apache.org/repos/asf/beam/blob/1cc16b0d/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluator.java
----------------------------------------------------------------------
diff --git a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluator.java b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluator.java
index 49d0723..131716f 100644
--- a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluator.java
+++ b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluator.java
@@ -40,9 +40,9 @@ import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.TupleTag;
-class ParDoEvaluator<InputT, OutputT> implements TransformEvaluator<InputT> {
+class ParDoEvaluator<InputT> implements TransformEvaluator<InputT> {
- public static <InputT, OutputT> ParDoEvaluator<InputT, OutputT> create(
+ public static <InputT, OutputT> ParDoEvaluator<InputT> create(
EvaluationContext evaluationContext,
DirectStepContext stepContext,
AppliedPTransform<?, ?, ?> application,
@@ -93,13 +93,11 @@ class ParDoEvaluator<InputT, OutputT> implements TransformEvaluator<InputT> {
throw UserCodeException.wrap(e);
}
- return new ParDoEvaluator<>(
- evaluationContext, runner, application, aggregatorChanges, outputManager, stepContext);
+ return new ParDoEvaluator<>(runner, application, aggregatorChanges, outputManager, stepContext);
}
////////////////////////////////////////////////////////////////////////////////////////////////
- private final EvaluationContext evaluationContext;
private final PushbackSideInputDoFnRunner<InputT, ?> fnRunner;
private final AppliedPTransform<?, ?, ?> transform;
private final AggregatorContainer.Mutator aggregatorChanges;
@@ -109,13 +107,11 @@ class ParDoEvaluator<InputT, OutputT> implements TransformEvaluator<InputT> {
private final ImmutableList.Builder<WindowedValue<InputT>> unprocessedElements;
private ParDoEvaluator(
- EvaluationContext evaluationContext,
PushbackSideInputDoFnRunner<InputT, ?> fnRunner,
AppliedPTransform<?, ?, ?> transform,
AggregatorContainer.Mutator aggregatorChanges,
BundleOutputManager outputManager,
DirectStepContext stepContext) {
- this.evaluationContext = evaluationContext;
this.fnRunner = fnRunner;
this.transform = transform;
this.outputManager = outputManager;
@@ -153,11 +149,11 @@ class ParDoEvaluator<InputT, OutputT> implements TransformEvaluator<InputT> {
} catch (Exception e) {
throw UserCodeException.wrap(e);
}
- StepTransformResult.Builder resultBuilder;
+ StepTransformResult.Builder<InputT> resultBuilder;
CopyOnAccessInMemoryStateInternals<?> state = stepContext.commitState();
if (state != null) {
resultBuilder =
- StepTransformResult.withHold(transform, state.getEarliestWatermarkHold())
+ StepTransformResult.<InputT>withHold(transform, state.getEarliestWatermarkHold())
.withState(state);
} else {
resultBuilder = StepTransformResult.withoutHold(transform);
http://git-wip-us.apache.org/repos/asf/beam/blob/1cc16b0d/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluatorFactory.java
----------------------------------------------------------------------
diff --git a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluatorFactory.java b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluatorFactory.java
index 0372295..93f204a 100644
--- a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluatorFactory.java
+++ b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluatorFactory.java
@@ -126,7 +126,7 @@ final class ParDoEvaluatorFactory<InputT, OutputT> implements TransformEvaluator
fnManager);
}
- ParDoEvaluator<InputT, OutputT> createParDoEvaluator(
+ ParDoEvaluator<InputT> createParDoEvaluator(
AppliedPTransform<PCollection<InputT>, PCollectionTuple, ?> application,
StructuralKey<?> key,
List<PCollectionView<?>> sideInputs,
http://git-wip-us.apache.org/repos/asf/beam/blob/1cc16b0d/runners/direct-java/src/main/java/org/apache/beam/runners/direct/SplittableProcessElementsEvaluatorFactory.java
----------------------------------------------------------------------
diff --git a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/SplittableProcessElementsEvaluatorFactory.java b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/SplittableProcessElementsEvaluatorFactory.java
index 64cef35..00b16dd 100644
--- a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/SplittableProcessElementsEvaluatorFactory.java
+++ b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/SplittableProcessElementsEvaluatorFactory.java
@@ -98,7 +98,7 @@ class SplittableProcessElementsEvaluatorFactory<
.getExecutionContext(application, inputBundle.getKey())
.getOrCreateStepContext(stepName, stepName);
- ParDoEvaluator<KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>, OutputT>
+ ParDoEvaluator<KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>>
parDoEvaluator =
delegateFactory.createParDoEvaluator(
application,
http://git-wip-us.apache.org/repos/asf/beam/blob/1cc16b0d/runners/direct-java/src/test/java/org/apache/beam/runners/direct/DoFnLifecycleManagerRemovingTransformEvaluatorTest.java
----------------------------------------------------------------------
diff --git a/runners/direct-java/src/test/java/org/apache/beam/runners/direct/DoFnLifecycleManagerRemovingTransformEvaluatorTest.java b/runners/direct-java/src/test/java/org/apache/beam/runners/direct/DoFnLifecycleManagerRemovingTransformEvaluatorTest.java
index d046ce5..1ac4d6d 100644
--- a/runners/direct-java/src/test/java/org/apache/beam/runners/direct/DoFnLifecycleManagerRemovingTransformEvaluatorTest.java
+++ b/runners/direct-java/src/test/java/org/apache/beam/runners/direct/DoFnLifecycleManagerRemovingTransformEvaluatorTest.java
@@ -53,7 +53,7 @@ public class DoFnLifecycleManagerRemovingTransformEvaluatorTest {
@Test
public void delegatesToUnderlying() throws Exception {
- ParDoEvaluator<Object, Object> underlying = mock(ParDoEvaluator.class);
+ ParDoEvaluator<Object> underlying = mock(ParDoEvaluator.class);
DoFn<?, ?> original = lifecycleManager.get();
TransformEvaluator<Object> evaluator =
DoFnLifecycleManagerRemovingTransformEvaluator.wrapping(underlying, lifecycleManager);
@@ -72,7 +72,7 @@ public class DoFnLifecycleManagerRemovingTransformEvaluatorTest {
@Test
public void removesOnExceptionInProcessElement() throws Exception {
- ParDoEvaluator<Object, Object> underlying = mock(ParDoEvaluator.class);
+ ParDoEvaluator<Object> underlying = mock(ParDoEvaluator.class);
doThrow(Exception.class).when(underlying).processElement(any(WindowedValue.class));
DoFn<?, ?> original = lifecycleManager.get();
@@ -91,7 +91,7 @@ public class DoFnLifecycleManagerRemovingTransformEvaluatorTest {
@Test
public void removesOnExceptionInOnTimer() throws Exception {
- ParDoEvaluator<Object, Object> underlying = mock(ParDoEvaluator.class);
+ ParDoEvaluator<Object> underlying = mock(ParDoEvaluator.class);
doThrow(Exception.class)
.when(underlying)
.onTimer(any(TimerData.class), any(BoundedWindow.class));
@@ -114,7 +114,7 @@ public class DoFnLifecycleManagerRemovingTransformEvaluatorTest {
@Test
public void removesOnExceptionInFinishBundle() throws Exception {
- ParDoEvaluator<Object, Object> underlying = mock(ParDoEvaluator.class);
+ ParDoEvaluator<Object> underlying = mock(ParDoEvaluator.class);
doThrow(Exception.class).when(underlying).finishBundle();
DoFn<?, ?> original = lifecycleManager.get();
http://git-wip-us.apache.org/repos/asf/beam/blob/1cc16b0d/runners/direct-java/src/test/java/org/apache/beam/runners/direct/ParDoEvaluatorTest.java
----------------------------------------------------------------------
diff --git a/runners/direct-java/src/test/java/org/apache/beam/runners/direct/ParDoEvaluatorTest.java b/runners/direct-java/src/test/java/org/apache/beam/runners/direct/ParDoEvaluatorTest.java
index 65a1248..2be0f9d 100644
--- a/runners/direct-java/src/test/java/org/apache/beam/runners/direct/ParDoEvaluatorTest.java
+++ b/runners/direct-java/src/test/java/org/apache/beam/runners/direct/ParDoEvaluatorTest.java
@@ -98,7 +98,7 @@ public class ParDoEvaluatorTest {
UncommittedBundle<Integer> outputBundle = bundleFactory.createBundle(output);
when(evaluationContext.createBundle(output)).thenReturn(outputBundle);
- ParDoEvaluator<Integer, Integer> evaluator =
+ ParDoEvaluator<Integer> evaluator =
createEvaluator(singletonView, fn, output);
IntervalWindow nonGlobalWindow = new IntervalWindow(new Instant(0), new Instant(10_000L));
@@ -130,7 +130,7 @@ public class ParDoEvaluatorTest {
WindowedValue.timestampedValueInGlobalWindow(6, new Instant(2468L))));
}
- private ParDoEvaluator<Integer, Integer> createEvaluator(
+ private ParDoEvaluator<Integer> createEvaluator(
PCollectionView<Integer> singletonView,
RecorderFn fn,
PCollection<Integer> output) {
[28/50] [abbrv] beam git commit: [BEAM-1994] Remove Flink examples
package
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueTypeInformation.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueTypeInformation.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueTypeInformation.java
new file mode 100644
index 0000000..e24bf31
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueTypeInformation.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.types;
+
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.flink.api.common.ExecutionConfig;
+import org.apache.flink.api.common.typeinfo.AtomicType;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.common.typeutils.TypeComparator;
+import org.apache.flink.api.common.typeutils.TypeSerializer;
+
+/**
+ * Flink {@link TypeInformation} for Beam values that have been encoded to byte data
+ * by a {@link Coder}.
+ */
+public class EncodedValueTypeInformation
+ extends TypeInformation<byte[]>
+ implements AtomicType<byte[]> {
+
+ private static final long serialVersionUID = 1L;
+
+ @Override
+ public boolean isBasicType() {
+ return false;
+ }
+
+ @Override
+ public boolean isTupleType() {
+ return false;
+ }
+
+ @Override
+ public int getArity() {
+ return 0;
+ }
+
+ @Override
+ public int getTotalFields() {
+ return 0;
+ }
+
+ @Override
+ public Class<byte[]> getTypeClass() {
+ return byte[].class;
+ }
+
+ @Override
+ public boolean isKeyType() {
+ return true;
+ }
+
+ @Override
+ public TypeSerializer<byte[]> createSerializer(ExecutionConfig executionConfig) {
+ return new EncodedValueSerializer();
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ return other instanceof EncodedValueTypeInformation;
+ }
+
+ @Override
+ public int hashCode() {
+ return this.getClass().hashCode();
+ }
+
+ @Override
+ public boolean canEqual(Object obj) {
+ return obj instanceof EncodedValueTypeInformation;
+ }
+
+ @Override
+ public String toString() {
+ return "EncodedValueTypeInformation";
+ }
+
+ @Override
+ public TypeComparator<byte[]> createComparator(
+ boolean sortOrderAscending,
+ ExecutionConfig executionConfig) {
+ return new EncodedValueComparator(sortOrderAscending);
+ }
+}
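As an illustration of where this type information is used (not part of the commit): values are first encoded to byte[] with a Beam Coder, and the resulting byte arrays are described to Flink by EncodedValueTypeInformation so it picks the matching serializer and comparator. The coder below is an assumption:

import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.flink.api.common.typeinfo.TypeInformation;

class EncodedValueExample {
  // Encode a Beam value to its byte form...
  static byte[] encode(String value) throws Exception {
    return CoderUtils.encodeToByteArray(StringUtf8Coder.of(), value);
  }

  // ...and describe such byte arrays to Flink with the matching type information.
  static TypeInformation<byte[]> encodedType() {
    return new EncodedValueTypeInformation();
  }
}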
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/InspectableByteArrayOutputStream.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/InspectableByteArrayOutputStream.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/InspectableByteArrayOutputStream.java
new file mode 100644
index 0000000..36b5ba3
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/InspectableByteArrayOutputStream.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.types;
+
+import java.io.ByteArrayOutputStream;
+
+/**
+ * Version of {@link java.io.ByteArrayOutputStream} that allows retrieving the internal
+ * byte[] buffer without incurring an array copy.
+ */
+public class InspectableByteArrayOutputStream extends ByteArrayOutputStream {
+
+ /**
+ * Get the underlying byte array.
+ */
+ public byte[] getBuffer() {
+ return buf;
+ }
+}
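A small usage sketch (not part of the commit): encode a value into the stream and write the backing buffer out directly, avoiding the copy that toByteArray() would make. The coder and target are assumptions:

import java.io.IOException;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.flink.core.memory.DataOutputView;

class InspectableStreamExample {
  static void writeEncoded(DataOutputView target, String value) throws IOException {
    InspectableByteArrayOutputStream bos = new InspectableByteArrayOutputStream();
    StringUtf8Coder.of().encode(value, bos, Coder.Context.OUTER);
    target.writeInt(bos.size());                  // length of the valid prefix
    target.write(bos.getBuffer(), 0, bos.size()); // getBuffer() exposes the array without copying
  }
}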
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/KvKeySelector.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/KvKeySelector.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/KvKeySelector.java
new file mode 100644
index 0000000..9df6836
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/KvKeySelector.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.types;
+
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.util.CoderUtils;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.values.KV;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.java.functions.KeySelector;
+import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
+
+/**
+ * {@link KeySelector} that extracts the key from a {@link KV} and returns
+ * it in encoded form as a {@code byte} array.
+ */
+public class KvKeySelector<InputT, K>
+ implements KeySelector<WindowedValue<KV<K, InputT>>, byte[]>, ResultTypeQueryable<byte[]> {
+
+ private final Coder<K> keyCoder;
+
+ public KvKeySelector(Coder<K> keyCoder) {
+ this.keyCoder = keyCoder;
+ }
+
+ @Override
+ public byte[] getKey(WindowedValue<KV<K, InputT>> value) throws Exception {
+ return CoderUtils.encodeToByteArray(keyCoder, value.getValue().getKey());
+ }
+
+ @Override
+ public TypeInformation<byte[]> getProducedType() {
+ return new EncodedValueTypeInformation();
+ }
+}
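A usage sketch (not part of the commit), grouping a batch DataSet of windowed KVs by the encoded key bytes; the element type and key coder are assumptions:

import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.values.KV;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.operators.UnsortedGrouping;

class KvKeySelectorExample {
  // Group by the coder-encoded key so equality and hashing are on bytes, not objects.
  static UnsortedGrouping<WindowedValue<KV<String, Long>>> groupByEncodedKey(
      DataSet<WindowedValue<KV<String, Long>>> input) {
    return input.groupBy(new KvKeySelector<Long, String>(StringUtf8Coder.of()));
  }
}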
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/package-info.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/package-info.java
new file mode 100644
index 0000000..6fb3182
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Internal implementation of the Beam runner for Apache Flink.
+ */
+package org.apache.beam.runners.flink.translation.types;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/utils/SerializedPipelineOptions.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/utils/SerializedPipelineOptions.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/utils/SerializedPipelineOptions.java
new file mode 100644
index 0000000..2256bb1
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/utils/SerializedPipelineOptions.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.beam.runners.flink.translation.utils;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.Serializable;
+import org.apache.beam.sdk.io.FileSystems;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.util.IOChannelUtils;
+
+/**
+ * Encapsulates the PipelineOptions in serialized form to ship them to the cluster.
+ */
+public class SerializedPipelineOptions implements Serializable {
+
+ private final byte[] serializedOptions;
+
+ /** Lazily initialized copy of deserialized options. */
+ private transient PipelineOptions pipelineOptions;
+
+ public SerializedPipelineOptions(PipelineOptions options) {
+ checkNotNull(options, "PipelineOptions must not be null.");
+
+ try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
+ new ObjectMapper().writeValue(baos, options);
+ this.serializedOptions = baos.toByteArray();
+ } catch (Exception e) {
+ throw new RuntimeException("Couldn't serialize PipelineOptions.", e);
+ }
+
+ }
+
+ public PipelineOptions getPipelineOptions() {
+ if (pipelineOptions == null) {
+ try {
+ pipelineOptions = new ObjectMapper().readValue(serializedOptions, PipelineOptions.class);
+
+ IOChannelUtils.registerIOFactoriesAllowOverride(pipelineOptions);
+ FileSystems.setDefaultConfigInWorkers(pipelineOptions);
+ } catch (IOException e) {
+ throw new RuntimeException("Couldn't deserialize the PipelineOptions.", e);
+ }
+ }
+
+ return pipelineOptions;
+ }
+}
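A minimal usage sketch (not part of this diff; exceptions and the usual Beam imports elided): the options are captured and serialized on the client when the operator is built, then lazily deserialized and cached the first time getPipelineOptions() is called on a task manager. PipelineOptionsFactory.create() below just stands in for whatever options the pipeline was actually built with.

    PipelineOptions options = PipelineOptionsFactory.create();
    // serialized once on the client and shipped as part of the serialized operator
    SerializedPipelineOptions shipped = new SerializedPipelineOptions(options);
    // deserialized (and cached) on first access inside the task manager
    PipelineOptions onWorker = shipped.getPipelineOptions();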
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/utils/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/utils/package-info.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/utils/package-info.java
new file mode 100644
index 0000000..5dedd53
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/utils/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Internal implementation of the Beam runner for Apache Flink.
+ */
+package org.apache.beam.runners.flink.translation.utils;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/DataInputViewWrapper.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/DataInputViewWrapper.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/DataInputViewWrapper.java
new file mode 100644
index 0000000..82a2c4e
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/DataInputViewWrapper.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import org.apache.flink.core.memory.DataInputView;
+
+/**
+ * Wrapper for {@link DataInputView}. We need this because Flink reads data using a
+ * {@link org.apache.flink.core.memory.DataInputView} while
+ * Beam {@link org.apache.beam.sdk.coders.Coder}s expect an
+ * {@link java.io.InputStream}.
+ */
+public class DataInputViewWrapper extends InputStream {
+
+ private DataInputView inputView;
+
+ public DataInputViewWrapper(DataInputView inputView) {
+ this.inputView = inputView;
+ }
+
+ public void setInputView(DataInputView inputView) {
+ this.inputView = inputView;
+ }
+
+ @Override
+ public int read() throws IOException {
+ try {
+ return inputView.readUnsignedByte();
+ } catch (EOFException e) {
+ // translate between DataInput and InputStream,
+ // DataInput signals EOF by exception, InputStream does it by returning -1
+ return -1;
+ }
+ }
+
+ @Override
+ public int read(byte[] b, int off, int len) throws IOException {
+ return inputView.read(b, off, len);
+ }
+}

http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/DataOutputViewWrapper.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/DataOutputViewWrapper.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/DataOutputViewWrapper.java
new file mode 100644
index 0000000..f2d9db2
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/DataOutputViewWrapper.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import org.apache.flink.core.memory.DataOutputView;
+
+/**
+ * Wrapper for {@link org.apache.flink.core.memory.DataOutputView}. We need this because
+ * Flink writes data using a {@link org.apache.flink.core.memory.DataOutputView} while
+ * Beam {@link org.apache.beam.sdk.coders.Coder}s expect an
+ * {@link java.io.OutputStream}.
+ */
+public class DataOutputViewWrapper extends OutputStream {
+
+ private DataOutputView outputView;
+
+ public DataOutputViewWrapper(DataOutputView outputView) {
+ this.outputView = outputView;
+ }
+
+ public void setOutputView(DataOutputView outputView) {
+ this.outputView = outputView;
+ }
+
+ @Override
+ public void write(int b) throws IOException {
+ outputView.write(b);
+ }
+
+ @Override
+ public void write(byte[] b, int off, int len) throws IOException {
+ outputView.write(b, off, len);
+ }
+}
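For orientation, a rough round-trip sketch (not part of this change; exceptions elided) that pushes a value through both wrappers, using Flink's stream-backed DataOutputViewStreamWrapper/DataInputViewStreamWrapper and Beam's StringUtf8Coder as assumed conveniences:

    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    DataOutputView outView = new DataOutputViewStreamWrapper(bytes);
    StringUtf8Coder coder = StringUtf8Coder.of();
    // the Coder writes to an OutputStream; the wrapper forwards the bytes to the DataOutputView
    coder.encode("hello", new DataOutputViewWrapper(outView), Coder.Context.OUTER);

    DataInputView inView =
        new DataInputViewStreamWrapper(new ByteArrayInputStream(bytes.toByteArray()));
    // the Coder reads from an InputStream; the wrapper pulls the bytes from the DataInputView
    String decoded = coder.decode(new DataInputViewWrapper(inView), Coder.Context.OUTER);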
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SerializableFnAggregatorWrapper.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SerializableFnAggregatorWrapper.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SerializableFnAggregatorWrapper.java
new file mode 100644
index 0000000..70d97e3
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SerializableFnAggregatorWrapper.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import java.io.Serializable;
+import org.apache.beam.sdk.transforms.Aggregator;
+import org.apache.beam.sdk.transforms.Combine;
+import org.apache.flink.api.common.accumulators.Accumulator;
+
+/**
+ * Wraps a {@link org.apache.beam.sdk.transforms.Combine.CombineFn}
+ * in a Flink {@link org.apache.flink.api.common.accumulators.Accumulator} so that
+ * the function can be used as an aggregator in a {@link org.apache.beam.sdk.transforms.ParDo}
+ * operation.
+ */
+public class SerializableFnAggregatorWrapper<InputT, OutputT>
+ implements Aggregator<InputT, OutputT>, Accumulator<InputT, Serializable> {
+
+ private OutputT aa;
+ private Combine.CombineFn<InputT, ?, OutputT> combiner;
+
+ public SerializableFnAggregatorWrapper(Combine.CombineFn<InputT, ?, OutputT> combiner) {
+ this.combiner = combiner;
+ resetLocal();
+ }
+
+ @Override
+ @SuppressWarnings("unchecked")
+ public void add(InputT value) {
+ this.aa = combiner.apply(ImmutableList.of((InputT) aa, value));
+ }
+
+ @Override
+ public Serializable getLocalValue() {
+ return (Serializable) aa;
+ }
+
+ @Override
+ public void resetLocal() {
+ this.aa = combiner.apply(ImmutableList.<InputT>of());
+ }
+
+ @Override
+ @SuppressWarnings("unchecked")
+ public void merge(Accumulator<InputT, Serializable> other) {
+ this.aa = combiner.apply(ImmutableList.of((InputT) aa, (InputT) other.getLocalValue()));
+ }
+
+ @Override
+ public void addValue(InputT value) {
+ add(value);
+ }
+
+ @Override
+ public String getName() {
+ return "Aggregator :" + combiner.toString();
+ }
+
+ @Override
+ public Combine.CombineFn<InputT, ?, OutputT> getCombineFn() {
+ return combiner;
+ }
+
+ @Override
+ public Accumulator<InputT, Serializable> clone() {
+ try {
+ super.clone();
+ } catch (CloneNotSupportedException e) {
+ // Flink Accumulators cannot throw CloneNotSupportedException, work around that.
+ throw new RuntimeException(e);
+ }
+
+ // copy it by merging
+ OutputT resultCopy = combiner.apply(Lists.newArrayList((InputT) aa));
+ SerializableFnAggregatorWrapper<InputT, OutputT> result =
+ new SerializableFnAggregatorWrapper<>(combiner);
+
+ result.aa = resultCopy;
+ return result;
+ }
+}
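A small sketch of how such a wrapper behaves (not part of the diff; Sum.ofLongs() is only an assumed example of a summing CombineFn):

    SerializableFnAggregatorWrapper<Long, Long> agg =
        new SerializableFnAggregatorWrapper<>(Sum.ofLongs());
    agg.add(3L);
    agg.add(4L);
    // getLocalValue() now holds 7; Flink later combines per-subtask instances via merge()
    Serializable localSum = agg.getLocalValue();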
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SourceInputFormat.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SourceInputFormat.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SourceInputFormat.java
new file mode 100644
index 0000000..a87472b
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SourceInputFormat.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers;
+
+import java.io.IOException;
+import java.util.List;
+import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
+import org.apache.beam.sdk.io.BoundedSource;
+import org.apache.beam.sdk.io.Source;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
+import org.apache.beam.sdk.transforms.windowing.PaneInfo;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.flink.api.common.io.DefaultInputSplitAssigner;
+import org.apache.flink.api.common.io.InputFormat;
+import org.apache.flink.api.common.io.statistics.BaseStatistics;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.core.io.InputSplitAssigner;
+import org.joda.time.Instant;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * Wrapper for executing a {@link Source} as a Flink {@link InputFormat}.
+ */
+public class SourceInputFormat<T>
+ implements InputFormat<WindowedValue<T>, SourceInputSplit<T>> {
+ private static final Logger LOG = LoggerFactory.getLogger(SourceInputFormat.class);
+
+ private final BoundedSource<T> initialSource;
+
+ private transient PipelineOptions options;
+ private final SerializedPipelineOptions serializedOptions;
+
+ private transient BoundedSource.BoundedReader<T> reader;
+ private boolean inputAvailable = false;
+
+ public SourceInputFormat(BoundedSource<T> initialSource, PipelineOptions options) {
+ this.initialSource = initialSource;
+ this.serializedOptions = new SerializedPipelineOptions(options);
+ }
+
+ @Override
+ public void configure(Configuration configuration) {
+ options = serializedOptions.getPipelineOptions();
+ }
+
+ @Override
+ public void open(SourceInputSplit<T> sourceInputSplit) throws IOException {
+ reader = ((BoundedSource<T>) sourceInputSplit.getSource()).createReader(options);
+ inputAvailable = reader.start();
+ }
+
+ @Override
+ public BaseStatistics getStatistics(BaseStatistics baseStatistics) throws IOException {
+ try {
+ final long estimatedSize = initialSource.getEstimatedSizeBytes(options);
+
+ return new BaseStatistics() {
+ @Override
+ public long getTotalInputSize() {
+ return estimatedSize;
+ }
+
+ @Override
+ public long getNumberOfRecords() {
+ return BaseStatistics.NUM_RECORDS_UNKNOWN;
+ }
+
+ @Override
+ public float getAverageRecordWidth() {
+ return BaseStatistics.AVG_RECORD_BYTES_UNKNOWN;
+ }
+ };
+ } catch (Exception e) {
+ LOG.warn("Could not read Source statistics: {}", e);
+ }
+
+ return null;
+ }
+
+ @Override
+ @SuppressWarnings("unchecked")
+ public SourceInputSplit<T>[] createInputSplits(int numSplits) throws IOException {
+ try {
+ long desiredSizeBytes = initialSource.getEstimatedSizeBytes(options) / numSplits;
+ List<? extends Source<T>> shards =
+ initialSource.split(desiredSizeBytes, options);
+ int numShards = shards.size();
+ SourceInputSplit<T>[] sourceInputSplits = new SourceInputSplit[numShards];
+ for (int i = 0; i < numShards; i++) {
+ sourceInputSplits[i] = new SourceInputSplit<>(shards.get(i), i);
+ }
+ return sourceInputSplits;
+ } catch (Exception e) {
+ throw new IOException("Could not create input splits from Source.", e);
+ }
+ }
+
+ @Override
+ public InputSplitAssigner getInputSplitAssigner(final SourceInputSplit[] sourceInputSplits) {
+ return new DefaultInputSplitAssigner(sourceInputSplits);
+ }
+
+
+ @Override
+ public boolean reachedEnd() throws IOException {
+ return !inputAvailable;
+ }
+
+ @Override
+ public WindowedValue<T> nextRecord(WindowedValue<T> t) throws IOException {
+ if (inputAvailable) {
+ final T current = reader.getCurrent();
+ final Instant timestamp = reader.getCurrentTimestamp();
+ // advance reader to have a record ready next time
+ inputAvailable = reader.advance();
+ return WindowedValue.of(
+ current,
+ timestamp,
+ GlobalWindow.INSTANCE, PaneInfo.NO_FIRING);
+ }
+
+ return null;
+ }
+
+ @Override
+ public void close() throws IOException {
+ // TODO null check can be removed once FLINK-3796 is fixed
+ if (reader != null) {
+ reader.close();
+ }
+ }
+}
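As a rough sketch of the lifecycle Flink drives on this InputFormat (not part of this change; CountingSource.upTo is assumed as a convenient bounded source and exceptions are elided):

    PipelineOptions options = PipelineOptionsFactory.create();
    SourceInputFormat<Long> format =
        new SourceInputFormat<>(CountingSource.upTo(10), options);

    format.configure(new Configuration());
    for (SourceInputSplit<Long> split : format.createInputSplits(2)) {
      format.open(split);                         // creates the BoundedReader for this split
      while (!format.reachedEnd()) {
        WindowedValue<Long> record = format.nextRecord(null);
        // ... hand record off to downstream processing ...
      }
      format.close();
    }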
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SourceInputSplit.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SourceInputSplit.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SourceInputSplit.java
new file mode 100644
index 0000000..e4a7386
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/SourceInputSplit.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers;
+
+import org.apache.beam.sdk.io.Source;
+import org.apache.flink.core.io.InputSplit;
+
+/**
+ * {@link org.apache.flink.core.io.InputSplit} for
+ * {@link org.apache.beam.runners.flink.translation.wrappers.SourceInputFormat}. We pass
+ * the sharded Source around in the input split because a Beam Source simply splits into
+ * several sub-Sources for sharding. This differs from how Flink normally derives a separate
+ * InputSplit from an InputFormat.
+ */
+public class SourceInputSplit<T> implements InputSplit {
+
+ private Source<T> source;
+ private int splitNumber;
+
+ public SourceInputSplit() {
+ }
+
+ public SourceInputSplit(Source<T> source, int splitNumber) {
+ this.source = source;
+ this.splitNumber = splitNumber;
+ }
+
+ @Override
+ public int getSplitNumber() {
+ return splitNumber;
+ }
+
+ public Source<T> getSource() {
+ return source;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/package-info.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/package-info.java
new file mode 100644
index 0000000..72f7deb
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Internal implementation of the Beam runner for Apache Flink.
+ */
+package org.apache.beam.runners.flink.translation.wrappers;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java
new file mode 100644
index 0000000..8a09286
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java
@@ -0,0 +1,774 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming;
+
+import static org.apache.flink.util.Preconditions.checkArgument;
+
+import com.google.common.base.Optional;
+import com.google.common.collect.Iterables;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import javax.annotation.Nullable;
+import org.apache.beam.runners.core.AggregatorFactory;
+import org.apache.beam.runners.core.DoFnRunner;
+import org.apache.beam.runners.core.DoFnRunners;
+import org.apache.beam.runners.core.ExecutionContext;
+import org.apache.beam.runners.core.GroupAlsoByWindowViaWindowSetNewDoFn;
+import org.apache.beam.runners.core.PushbackSideInputDoFnRunner;
+import org.apache.beam.runners.core.SideInputHandler;
+import org.apache.beam.runners.core.SimplePushbackSideInputDoFnRunner;
+import org.apache.beam.runners.core.StateInternals;
+import org.apache.beam.runners.core.StateNamespace;
+import org.apache.beam.runners.core.StateNamespaces;
+import org.apache.beam.runners.core.StateNamespaces.WindowNamespace;
+import org.apache.beam.runners.core.StateTag;
+import org.apache.beam.runners.core.StateTags;
+import org.apache.beam.runners.core.StatefulDoFnRunner;
+import org.apache.beam.runners.core.TimerInternals;
+import org.apache.beam.runners.core.TimerInternals.TimerData;
+import org.apache.beam.runners.flink.translation.types.CoderTypeSerializer;
+import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
+import org.apache.beam.runners.flink.translation.wrappers.SerializableFnAggregatorWrapper;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkBroadcastStateInternals;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkKeyGroupStateInternals;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkSplitStateInternals;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkStateInternals;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.state.KeyGroupCheckpointedOperator;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.transforms.Aggregator;
+import org.apache.beam.sdk.transforms.Combine;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.join.RawUnionValue;
+import org.apache.beam.sdk.transforms.reflect.DoFnInvoker;
+import org.apache.beam.sdk.transforms.reflect.DoFnInvokers;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.util.NullSideInputReader;
+import org.apache.beam.sdk.util.SideInputReader;
+import org.apache.beam.sdk.util.TimeDomain;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
+import org.apache.beam.sdk.util.state.BagState;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.beam.sdk.values.TupleTag;
+import org.apache.flink.core.memory.DataInputViewStreamWrapper;
+import org.apache.flink.core.memory.DataOutputViewStreamWrapper;
+import org.apache.flink.runtime.state.KeyGroupStatePartitionStreamProvider;
+import org.apache.flink.runtime.state.KeyGroupsList;
+import org.apache.flink.runtime.state.KeyedStateBackend;
+import org.apache.flink.runtime.state.KeyedStateCheckpointOutputStream;
+import org.apache.flink.runtime.state.StateInitializationContext;
+import org.apache.flink.runtime.state.StateSnapshotContext;
+import org.apache.flink.streaming.api.operators.AbstractStreamOperator;
+import org.apache.flink.streaming.api.operators.ChainingStrategy;
+import org.apache.flink.streaming.api.operators.HeapInternalTimerService;
+import org.apache.flink.streaming.api.operators.InternalTimer;
+import org.apache.flink.streaming.api.operators.OneInputStreamOperator;
+import org.apache.flink.streaming.api.operators.Output;
+import org.apache.flink.streaming.api.operators.Triggerable;
+import org.apache.flink.streaming.api.operators.TwoInputStreamOperator;
+import org.apache.flink.streaming.api.watermark.Watermark;
+import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
+import org.joda.time.Instant;
+
+/**
+ * Flink operator for executing {@link DoFn DoFns}.
+ *
+ * @param <InputT> the input type of the {@link DoFn}
+ * @param <FnOutputT> the output type of the {@link DoFn}
+ * @param <OutputT> the output type of the operator, this can be different from the fn output
+ * type when we have additional tagged outputs
+ */
+public class DoFnOperator<InputT, FnOutputT, OutputT>
+ extends AbstractStreamOperator<OutputT>
+ implements OneInputStreamOperator<WindowedValue<InputT>, OutputT>,
+ TwoInputStreamOperator<WindowedValue<InputT>, RawUnionValue, OutputT>,
+ KeyGroupCheckpointedOperator, Triggerable<Object, TimerData> {
+
+ protected DoFn<InputT, FnOutputT> doFn;
+
+ protected final SerializedPipelineOptions serializedOptions;
+
+ protected final TupleTag<FnOutputT> mainOutputTag;
+ protected final List<TupleTag<?>> additionalOutputTags;
+
+ protected final Collection<PCollectionView<?>> sideInputs;
+ protected final Map<Integer, PCollectionView<?>> sideInputTagMapping;
+
+ protected final WindowingStrategy<?, ?> windowingStrategy;
+
+ protected final OutputManagerFactory<OutputT> outputManagerFactory;
+
+ protected transient DoFnRunner<InputT, FnOutputT> doFnRunner;
+ protected transient PushbackSideInputDoFnRunner<InputT, FnOutputT> pushbackDoFnRunner;
+
+ protected transient SideInputHandler sideInputHandler;
+
+ protected transient SideInputReader sideInputReader;
+
+ protected transient DoFnRunners.OutputManager outputManager;
+
+ private transient DoFnInvoker<InputT, FnOutputT> doFnInvoker;
+
+ protected transient long currentInputWatermark;
+
+ protected transient long currentOutputWatermark;
+
+ private transient StateTag<Object, BagState<WindowedValue<InputT>>> pushedBackTag;
+
+ protected transient FlinkStateInternals<?> stateInternals;
+
+ private Coder<WindowedValue<InputT>> inputCoder;
+
+ private final Coder<?> keyCoder;
+
+ private final TimerInternals.TimerDataCoder timerCoder;
+
+ protected transient HeapInternalTimerService<?, TimerInternals.TimerData> timerService;
+
+ protected transient FlinkTimerInternals timerInternals;
+
+ private transient StateInternals<?> pushbackStateInternals;
+
+ private transient Optional<Long> pushedBackWatermark;
+
+ public DoFnOperator(
+ DoFn<InputT, FnOutputT> doFn,
+ Coder<WindowedValue<InputT>> inputCoder,
+ TupleTag<FnOutputT> mainOutputTag,
+ List<TupleTag<?>> additionalOutputTags,
+ OutputManagerFactory<OutputT> outputManagerFactory,
+ WindowingStrategy<?, ?> windowingStrategy,
+ Map<Integer, PCollectionView<?>> sideInputTagMapping,
+ Collection<PCollectionView<?>> sideInputs,
+ PipelineOptions options,
+ Coder<?> keyCoder) {
+ this.doFn = doFn;
+ this.inputCoder = inputCoder;
+ this.mainOutputTag = mainOutputTag;
+ this.additionalOutputTags = additionalOutputTags;
+ this.sideInputTagMapping = sideInputTagMapping;
+ this.sideInputs = sideInputs;
+ this.serializedOptions = new SerializedPipelineOptions(options);
+ this.windowingStrategy = windowingStrategy;
+ this.outputManagerFactory = outputManagerFactory;
+
+ setChainingStrategy(ChainingStrategy.ALWAYS);
+
+ this.keyCoder = keyCoder;
+
+ this.timerCoder =
+ TimerInternals.TimerDataCoder.of(windowingStrategy.getWindowFn().windowCoder());
+ }
+
+ private ExecutionContext.StepContext createStepContext() {
+ return new StepContext();
+ }
+
+ // allow overriding this in WindowDoFnOperator because this one dynamically creates
+ // the DoFn
+ protected DoFn<InputT, FnOutputT> getDoFn() {
+ return doFn;
+ }
+
+ @Override
+ public void open() throws Exception {
+ super.open();
+
+ currentInputWatermark = Long.MIN_VALUE;
+ currentOutputWatermark = Long.MIN_VALUE;
+
+ AggregatorFactory aggregatorFactory = new AggregatorFactory() {
+ @Override
+ public <InputT, AccumT, OutputT> Aggregator<InputT, OutputT> createAggregatorForDoFn(
+ Class<?> fnClass,
+ ExecutionContext.StepContext stepContext,
+ String aggregatorName,
+ Combine.CombineFn<InputT, AccumT, OutputT> combine) {
+
+ @SuppressWarnings("unchecked")
+ SerializableFnAggregatorWrapper<InputT, OutputT> result =
+ (SerializableFnAggregatorWrapper<InputT, OutputT>)
+ getRuntimeContext().getAccumulator(aggregatorName);
+
+ if (result == null) {
+ result = new SerializableFnAggregatorWrapper<>(combine);
+ getRuntimeContext().addAccumulator(aggregatorName, result);
+ }
+ return result;
+ }
+ };
+
+ sideInputReader = NullSideInputReader.of(sideInputs);
+
+ if (!sideInputs.isEmpty()) {
+
+ pushedBackTag = StateTags.bag("pushed-back-values", inputCoder);
+
+ FlinkBroadcastStateInternals sideInputStateInternals =
+ new FlinkBroadcastStateInternals<>(
+ getContainingTask().getIndexInSubtaskGroup(), getOperatorStateBackend());
+
+ sideInputHandler = new SideInputHandler(sideInputs, sideInputStateInternals);
+ sideInputReader = sideInputHandler;
+
+ // may already have been initialized in initializeState()
+ if (pushbackStateInternals == null) {
+ if (keyCoder != null) {
+ pushbackStateInternals = new FlinkKeyGroupStateInternals<>(keyCoder,
+ getKeyedStateBackend());
+ } else {
+ pushbackStateInternals =
+ new FlinkSplitStateInternals<Object>(getOperatorStateBackend());
+ }
+ }
+
+ pushedBackWatermark = Optional.absent();
+
+ }
+
+ outputManager = outputManagerFactory.create(output);
+
+ // StatefulParDo or WindowDoFn
+ if (keyCoder != null) {
+ stateInternals = new FlinkStateInternals<>((KeyedStateBackend) getKeyedStateBackend(),
+ keyCoder);
+
+ timerService = (HeapInternalTimerService<?, TimerInternals.TimerData>)
+ getInternalTimerService("beam-timer", new CoderTypeSerializer<>(timerCoder), this);
+
+ timerInternals = new FlinkTimerInternals();
+
+ }
+
+ // WindowDoFnOperator needs the state and timer internals to create its DoFn,
+ // so we must wait until StateInternals and TimerInternals are ready.
+ this.doFn = getDoFn();
+ doFnInvoker = DoFnInvokers.invokerFor(doFn);
+
+ doFnInvoker.invokeSetup();
+
+ ExecutionContext.StepContext stepContext = createStepContext();
+
+ doFnRunner = DoFnRunners.simpleRunner(
+ serializedOptions.getPipelineOptions(),
+ doFn,
+ sideInputReader,
+ outputManager,
+ mainOutputTag,
+ additionalOutputTags,
+ stepContext,
+ aggregatorFactory,
+ windowingStrategy);
+
+ if (doFn instanceof GroupAlsoByWindowViaWindowSetNewDoFn) {
+ // When the DoFn is a GroupAlsoByWindowViaWindowSetNewDoFn, we know it came from
+ // WindowDoFnOperator and that
+ // InputT = KeyedWorkItem<K, V>
+ // OutputT = KV<K, V>
+ // for some K and V.
+
+
+ doFnRunner = DoFnRunners.lateDataDroppingRunner(
+ (DoFnRunner) doFnRunner,
+ stepContext,
+ windowingStrategy,
+ ((GroupAlsoByWindowViaWindowSetNewDoFn) doFn).getDroppedDueToLatenessAggregator());
+ } else if (keyCoder != null) {
+ // It is a stateful DoFn
+
+ StatefulDoFnRunner.CleanupTimer cleanupTimer =
+ new StatefulDoFnRunner.TimeInternalsCleanupTimer(
+ stepContext.timerInternals(), windowingStrategy);
+
+ // we don't know the window type
+ @SuppressWarnings({"unchecked", "rawtypes"})
+ Coder windowCoder = windowingStrategy.getWindowFn().windowCoder();
+
+ @SuppressWarnings({"unchecked", "rawtypes"})
+ StatefulDoFnRunner.StateCleaner<?> stateCleaner =
+ new StatefulDoFnRunner.StateInternalsStateCleaner<>(
+ doFn, stepContext.stateInternals(), windowCoder);
+
+ doFnRunner = DoFnRunners.defaultStatefulDoFnRunner(
+ doFn,
+ doFnRunner,
+ stepContext,
+ aggregatorFactory,
+ windowingStrategy,
+ cleanupTimer,
+ stateCleaner);
+ }
+
+ pushbackDoFnRunner =
+ SimplePushbackSideInputDoFnRunner.create(doFnRunner, sideInputs, sideInputHandler);
+ }
+
+ @Override
+ public void close() throws Exception {
+ super.close();
+ doFnInvoker.invokeTeardown();
+ }
+
+ protected final long getPushbackWatermarkHold() {
+ // if we don't have side inputs we never hold the watermark
+ if (sideInputs.isEmpty()) {
+ return BoundedWindow.TIMESTAMP_MAX_VALUE.getMillis();
+ }
+
+ try {
+ checkInitPushedBackWatermark();
+ return pushedBackWatermark.get();
+ } catch (Exception e) {
+ throw new RuntimeException("Error retrieving pushed back watermark state.", e);
+ }
+ }
+
+ private void checkInitPushedBackWatermark() {
+ // init and restore from pushedBack state.
+ // Not done in initializeState, because OperatorState is not ready.
+ if (!pushedBackWatermark.isPresent()) {
+
+ BagState<WindowedValue<InputT>> pushedBack =
+ pushbackStateInternals.state(StateNamespaces.global(), pushedBackTag);
+
+ long min = BoundedWindow.TIMESTAMP_MAX_VALUE.getMillis();
+ for (WindowedValue<InputT> value : pushedBack.read()) {
+ min = Math.min(min, value.getTimestamp().getMillis());
+ }
+ setPushedBackWatermark(min);
+ }
+ }
+
+ @Override
+ public final void processElement(
+ StreamRecord<WindowedValue<InputT>> streamRecord) throws Exception {
+ doFnRunner.startBundle();
+ doFnRunner.processElement(streamRecord.getValue());
+ doFnRunner.finishBundle();
+ }
+
+ private void setPushedBackWatermark(long watermark) {
+ pushedBackWatermark = Optional.fromNullable(watermark);
+ }
+
+ @Override
+ public final void processElement1(
+ StreamRecord<WindowedValue<InputT>> streamRecord) throws Exception {
+ pushbackDoFnRunner.startBundle();
+ Iterable<WindowedValue<InputT>> justPushedBack =
+ pushbackDoFnRunner.processElementInReadyWindows(streamRecord.getValue());
+
+ BagState<WindowedValue<InputT>> pushedBack =
+ pushbackStateInternals.state(StateNamespaces.global(), pushedBackTag);
+
+ checkInitPushedBackWatermark();
+
+ long min = pushedBackWatermark.get();
+ for (WindowedValue<InputT> pushedBackValue : justPushedBack) {
+ min = Math.min(min, pushedBackValue.getTimestamp().getMillis());
+ pushedBack.add(pushedBackValue);
+ }
+ setPushedBackWatermark(min);
+ pushbackDoFnRunner.finishBundle();
+ }
+
+ @Override
+ public final void processElement2(
+ StreamRecord<RawUnionValue> streamRecord) throws Exception {
+ pushbackDoFnRunner.startBundle();
+
+ @SuppressWarnings("unchecked")
+ WindowedValue<Iterable<?>> value =
+ (WindowedValue<Iterable<?>>) streamRecord.getValue().getValue();
+
+ PCollectionView<?> sideInput = sideInputTagMapping.get(streamRecord.getValue().getUnionTag());
+ sideInputHandler.addSideInputValue(sideInput, value);
+
+ BagState<WindowedValue<InputT>> pushedBack =
+ pushbackStateInternals.state(StateNamespaces.global(), pushedBackTag);
+
+ List<WindowedValue<InputT>> newPushedBack = new ArrayList<>();
+
+ Iterable<WindowedValue<InputT>> pushedBackContents = pushedBack.read();
+ if (pushedBackContents != null) {
+ for (WindowedValue<InputT> elem : pushedBackContents) {
+
+ // we need to set the correct key in case the operator is
+ // a (keyed) window operator
+ setKeyContextElement1(new StreamRecord<>(elem));
+
+ Iterable<WindowedValue<InputT>> justPushedBack =
+ pushbackDoFnRunner.processElementInReadyWindows(elem);
+ Iterables.addAll(newPushedBack, justPushedBack);
+ }
+ }
+
+ pushedBack.clear();
+ long min = BoundedWindow.TIMESTAMP_MAX_VALUE.getMillis();
+ for (WindowedValue<InputT> pushedBackValue : newPushedBack) {
+ min = Math.min(min, pushedBackValue.getTimestamp().getMillis());
+ pushedBack.add(pushedBackValue);
+ }
+ setPushedBackWatermark(min);
+
+ pushbackDoFnRunner.finishBundle();
+
+ // maybe output a new watermark
+ processWatermark1(new Watermark(currentInputWatermark));
+ }
+
+ @Override
+ public void processWatermark(Watermark mark) throws Exception {
+ processWatermark1(mark);
+ }
+
+ @Override
+ public void processWatermark1(Watermark mark) throws Exception {
+ if (keyCoder == null) {
+ this.currentInputWatermark = mark.getTimestamp();
+ long potentialOutputWatermark =
+ Math.min(getPushbackWatermarkHold(), currentInputWatermark);
+ if (potentialOutputWatermark > currentOutputWatermark) {
+ currentOutputWatermark = potentialOutputWatermark;
+ output.emitWatermark(new Watermark(currentOutputWatermark));
+ }
+ } else {
+ // firing timers may emit output, so we need startBundle/finishBundle around it.
+ pushbackDoFnRunner.startBundle();
+
+ this.currentInputWatermark = mark.getTimestamp();
+
+ // hold back by the pushed back values waiting for side inputs
+ long actualInputWatermark = Math.min(getPushbackWatermarkHold(), mark.getTimestamp());
+
+ timerService.advanceWatermark(actualInputWatermark);
+
+ Instant watermarkHold = stateInternals.watermarkHold();
+
+ long combinedWatermarkHold = Math.min(watermarkHold.getMillis(), getPushbackWatermarkHold());
+
+ long potentialOutputWatermark = Math.min(currentInputWatermark, combinedWatermarkHold);
+
+ if (potentialOutputWatermark > currentOutputWatermark) {
+ currentOutputWatermark = potentialOutputWatermark;
+ output.emitWatermark(new Watermark(currentOutputWatermark));
+ }
+ pushbackDoFnRunner.finishBundle();
+ }
+ }
+
+ @Override
+ public void processWatermark2(Watermark mark) throws Exception {
+ // ignore watermarks from the side-input input
+ }
+
+ @Override
+ public void snapshotState(StateSnapshotContext context) throws Exception {
+ // copy from AbstractStreamOperator
+ if (getKeyedStateBackend() != null) {
+ KeyedStateCheckpointOutputStream out;
+
+ try {
+ out = context.getRawKeyedOperatorStateOutput();
+ } catch (Exception exception) {
+ throw new Exception("Could not open raw keyed operator state stream for "
+ + getOperatorName() + '.', exception);
+ }
+
+ try {
+ KeyGroupsList allKeyGroups = out.getKeyGroupList();
+ for (int keyGroupIdx : allKeyGroups) {
+ out.startNewKeyGroup(keyGroupIdx);
+
+ DataOutputViewStreamWrapper dov = new DataOutputViewStreamWrapper(out);
+
+ // if (this instanceof KeyGroupCheckpointedOperator)
+ snapshotKeyGroupState(keyGroupIdx, dov);
+
+ // We can't access all timer services here, so we only snapshot our own timerService.
+ // This may also be a plain DoFn that has no timerService at all (keyCoder == null).
+ if (keyCoder != null) {
+ timerService.snapshotTimersForKeyGroup(dov, keyGroupIdx);
+ }
+
+ }
+ } catch (Exception exception) {
+ throw new Exception("Could not write timer service of " + getOperatorName()
+ + " to checkpoint state stream.", exception);
+ } finally {
+ try {
+ out.close();
+ } catch (Exception closeException) {
+ LOG.warn("Could not close raw keyed operator state stream for {}. This "
+ + "might have prevented deleting some state data.", getOperatorName(),
+ closeException);
+ }
+ }
+ }
+ }
+
+ @Override
+ public void snapshotKeyGroupState(int keyGroupIndex, DataOutputStream out) throws Exception {
+ if (!sideInputs.isEmpty() && keyCoder != null) {
+ ((FlinkKeyGroupStateInternals) pushbackStateInternals).snapshotKeyGroupState(
+ keyGroupIndex, out);
+ }
+ }
+
+ @Override
+ public void initializeState(StateInitializationContext context) throws Exception {
+ if (getKeyedStateBackend() != null) {
+ int totalKeyGroups = getKeyedStateBackend().getNumberOfKeyGroups();
+ KeyGroupsList localKeyGroupRange = getKeyedStateBackend().getKeyGroupRange();
+
+ for (KeyGroupStatePartitionStreamProvider streamProvider : context.getRawKeyedStateInputs()) {
+ DataInputViewStreamWrapper div = new DataInputViewStreamWrapper(streamProvider.getStream());
+
+ int keyGroupIdx = streamProvider.getKeyGroupId();
+ checkArgument(localKeyGroupRange.contains(keyGroupIdx),
+ "Key Group " + keyGroupIdx + " does not belong to the local range.");
+
+ // if (this instanceof KeyGroupRestoringOperator)
+ restoreKeyGroupState(keyGroupIdx, div);
+
+ // We just initialize our timerService
+ if (keyCoder != null) {
+ if (timerService == null) {
+ timerService = new HeapInternalTimerService<>(
+ totalKeyGroups,
+ localKeyGroupRange,
+ this,
+ getRuntimeContext().getProcessingTimeService());
+ }
+ timerService.restoreTimersForKeyGroup(div, keyGroupIdx, getUserCodeClassloader());
+ }
+ }
+ }
+ }
+
+ @Override
+ public void restoreKeyGroupState(int keyGroupIndex, DataInputStream in) throws Exception {
+ if (!sideInputs.isEmpty() && keyCoder != null) {
+ if (pushbackStateInternals == null) {
+ pushbackStateInternals = new FlinkKeyGroupStateInternals<>(keyCoder,
+ getKeyedStateBackend());
+ }
+ ((FlinkKeyGroupStateInternals) pushbackStateInternals)
+ .restoreKeyGroupState(keyGroupIndex, in, getUserCodeClassloader());
+ }
+ }
+
+ @Override
+ public void onEventTime(InternalTimer<Object, TimerData> timer) throws Exception {
+ fireTimer(timer);
+ }
+
+ @Override
+ public void onProcessingTime(InternalTimer<Object, TimerData> timer) throws Exception {
+ fireTimer(timer);
+ }
+
+ // allow overriding this in WindowDoFnOperator
+ public void fireTimer(InternalTimer<?, TimerData> timer) {
+ TimerInternals.TimerData timerData = timer.getNamespace();
+ StateNamespace namespace = timerData.getNamespace();
+ // This is a user timer, so namespace must be WindowNamespace
+ checkArgument(namespace instanceof WindowNamespace);
+ BoundedWindow window = ((WindowNamespace) namespace).getWindow();
+ pushbackDoFnRunner.onTimer(timerData.getTimerId(), window,
+ timerData.getTimestamp(), timerData.getDomain());
+ }
+
+ /**
+ * Factory for creating an {@link DoFnRunners.OutputManager} from
+ * a Flink {@link Output}.
+ */
+ interface OutputManagerFactory<OutputT> extends Serializable {
+ DoFnRunners.OutputManager create(Output<StreamRecord<OutputT>> output);
+ }
+
+ /**
+ * Default implementation of {@link OutputManagerFactory} that creates an
+ * {@link DoFnRunners.OutputManager} that only writes to
+ * a single logical output.
+ */
+ public static class DefaultOutputManagerFactory<OutputT>
+ implements OutputManagerFactory<OutputT> {
+ @Override
+ public DoFnRunners.OutputManager create(final Output<StreamRecord<OutputT>> output) {
+ return new DoFnRunners.OutputManager() {
+ @Override
+ public <T> void output(TupleTag<T> tag, WindowedValue<T> value) {
+ // with tagged outputs we can't get around this because we don't
+ // know our own output type...
+ @SuppressWarnings("unchecked")
+ OutputT castValue = (OutputT) value;
+ output.collect(new StreamRecord<>(castValue));
+ }
+ };
+ }
+ }
+
+ /**
+ * Implementation of {@link OutputManagerFactory} that creates an
+ * {@link DoFnRunners.OutputManager} that can write to multiple logical
+ * outputs by unioning them in a {@link RawUnionValue}.
+ */
+ public static class MultiOutputOutputManagerFactory
+ implements OutputManagerFactory<RawUnionValue> {
+
+ Map<TupleTag<?>, Integer> mapping;
+
+ public MultiOutputOutputManagerFactory(Map<TupleTag<?>, Integer> mapping) {
+ this.mapping = mapping;
+ }
+
+ @Override
+ public DoFnRunners.OutputManager create(final Output<StreamRecord<RawUnionValue>> output) {
+ return new DoFnRunners.OutputManager() {
+ @Override
+ public <T> void output(TupleTag<T> tag, WindowedValue<T> value) {
+ int intTag = mapping.get(tag);
+ output.collect(new StreamRecord<>(new RawUnionValue(intTag, value)));
+ }
+ };
+ }
+ }
+
+ /**
+ * {@link StepContext} for running {@link DoFn DoFns} on Flink. It exposes the operator's state
+ * and timer internals but does not support writing side-input data.
+ */
+ protected class StepContext implements ExecutionContext.StepContext {
+
+ @Override
+ public String getStepName() {
+ return null;
+ }
+
+ @Override
+ public String getTransformName() {
+ return null;
+ }
+
+ @Override
+ public void noteOutput(WindowedValue<?> output) {}
+
+ @Override
+ public void noteOutput(TupleTag<?> tag, WindowedValue<?> output) {}
+
+ @Override
+ public <T, W extends BoundedWindow> void writePCollectionViewData(
+ TupleTag<?> tag,
+ Iterable<WindowedValue<T>> data,
+ Coder<Iterable<WindowedValue<T>>> dataCoder,
+ W window,
+ Coder<W> windowCoder) throws IOException {
+ throw new UnsupportedOperationException("Writing side-input data is not supported.");
+ }
+
+ @Override
+ public StateInternals<?> stateInternals() {
+ return stateInternals;
+ }
+
+ @Override
+ public TimerInternals timerInternals() {
+ return timerInternals;
+ }
+ }
+
+ private class FlinkTimerInternals implements TimerInternals {
+
+ @Override
+ public void setTimer(
+ StateNamespace namespace, String timerId, Instant target, TimeDomain timeDomain) {
+ setTimer(TimerData.of(timerId, namespace, target, timeDomain));
+ }
+
+ @Deprecated
+ @Override
+ public void setTimer(TimerData timerKey) {
+ long time = timerKey.getTimestamp().getMillis();
+ if (timerKey.getDomain().equals(TimeDomain.EVENT_TIME)) {
+ timerService.registerEventTimeTimer(timerKey, time);
+ } else if (timerKey.getDomain().equals(TimeDomain.PROCESSING_TIME)) {
+ timerService.registerProcessingTimeTimer(timerKey, time);
+ } else {
+ throw new UnsupportedOperationException(
+ "Unsupported time domain: " + timerKey.getDomain());
+ }
+ }
+
+ @Deprecated
+ @Override
+ public void deleteTimer(StateNamespace namespace, String timerId) {
+ throw new UnsupportedOperationException(
+ "Canceling of a timer by ID is not yet supported.");
+ }
+
+ @Override
+ public void deleteTimer(StateNamespace namespace, String timerId, TimeDomain timeDomain) {
+ throw new UnsupportedOperationException(
+ "Canceling of a timer by ID is not yet supported.");
+ }
+
+ @Deprecated
+ @Override
+ public void deleteTimer(TimerData timerKey) {
+ long time = timerKey.getTimestamp().getMillis();
+ if (timerKey.getDomain().equals(TimeDomain.EVENT_TIME)) {
+ timerService.deleteEventTimeTimer(timerKey, time);
+ } else if (timerKey.getDomain().equals(TimeDomain.PROCESSING_TIME)) {
+ timerService.deleteProcessingTimeTimer(timerKey, time);
+ } else {
+ throw new UnsupportedOperationException(
+ "Unsupported time domain: " + timerKey.getDomain());
+ }
+ }
+
+ @Override
+ public Instant currentProcessingTime() {
+ return new Instant(timerService.currentProcessingTime());
+ }
+
+ @Nullable
+ @Override
+ public Instant currentSynchronizedProcessingTime() {
+ return new Instant(timerService.currentProcessingTime());
+ }
+
+ @Override
+ public Instant currentInputWatermarkTime() {
+ return new Instant(Math.min(currentInputWatermark, getPushbackWatermarkHold()));
+ }
+
+ @Nullable
+ @Override
+ public Instant currentOutputWatermarkTime() {
+ return new Instant(currentOutputWatermark);
+ }
+ }
+}
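To make the wiring concrete, a hedged sketch of how a translator might instantiate this operator for a simple, non-keyed ParDo with no side inputs (myDoFn and options are placeholders, not names from this commit):

    TupleTag<String> mainTag = new TupleTag<>("main-output");
    DoFnOperator<String, String, WindowedValue<String>> operator =
        new DoFnOperator<>(
            myDoFn,                                                  // some DoFn<String, String>
            WindowedValue.getValueOnlyCoder(StringUtf8Coder.of()),   // input coder
            mainTag,
            Collections.<TupleTag<?>>emptyList(),                    // no additional outputs
            new DoFnOperator.DefaultOutputManagerFactory<WindowedValue<String>>(),
            WindowingStrategy.globalDefault(),
            Collections.<Integer, PCollectionView<?>>emptyMap(),     // no side-input tag mapping
            Collections.<PCollectionView<?>>emptyList(),             // no side inputs
            options,
            null);                                                   // null key coder: not stateful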
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/KvToByteBufferKeySelector.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/KvToByteBufferKeySelector.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/KvToByteBufferKeySelector.java
new file mode 100644
index 0000000..dce2e68
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/KvToByteBufferKeySelector.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming;
+
+import java.nio.ByteBuffer;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.util.CoderUtils;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.values.KV;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.java.functions.KeySelector;
+import org.apache.flink.api.java.typeutils.GenericTypeInfo;
+import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
+
+/**
+ * {@link KeySelector} that retrieves a key from a {@link KV}. This will return
+ * the key as encoded by the provided {@link Coder} in a {@link ByteBuffer}. This ensures
+ * that all key comparisons/hashing happen on the encoded form.
+ */
+public class KvToByteBufferKeySelector<K, V>
+ implements KeySelector<WindowedValue<KV<K, V>>, ByteBuffer>,
+ ResultTypeQueryable<ByteBuffer> {
+
+ private final Coder<K> keyCoder;
+
+ public KvToByteBufferKeySelector(Coder<K> keyCoder) {
+ this.keyCoder = keyCoder;
+ }
+
+ @Override
+ public ByteBuffer getKey(WindowedValue<KV<K, V>> value) throws Exception {
+ K key = value.getValue().getKey();
+ byte[] keyBytes = CoderUtils.encodeToByteArray(keyCoder, key);
+ return ByteBuffer.wrap(keyBytes);
+ }
+
+ @Override
+ public TypeInformation<ByteBuffer> getProducedType() {
+ return new GenericTypeInfo<>(ByteBuffer.class);
+ }
+}
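A minimal sketch of how this selector would typically be used (not part of the diff); the stream of windowed KVs is assumed to come from an upstream operator:

    KeyedStream<WindowedValue<KV<String, Long>>, ByteBuffer> keyByEncodedKey(
        DataStream<WindowedValue<KV<String, Long>>> input) {
      // group by the ByteBuffer of encoded key bytes so hashing is done on the encoded form
      return input.keyBy(new KvToByteBufferKeySelector<String, Long>(StringUtf8Coder.of()));
    }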
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItem.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItem.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItem.java
new file mode 100644
index 0000000..e843660
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItem.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming;
+
+import java.util.Collections;
+import org.apache.beam.runners.core.KeyedWorkItem;
+import org.apache.beam.runners.core.TimerInternals;
+import org.apache.beam.sdk.util.WindowedValue;
+
+/**
+ * Singleton keyed work item.
+ */
+public class SingletonKeyedWorkItem<K, ElemT> implements KeyedWorkItem<K, ElemT> {
+
+ final K key;
+ final WindowedValue<ElemT> value;
+
+ public SingletonKeyedWorkItem(K key, WindowedValue<ElemT> value) {
+ this.key = key;
+ this.value = value;
+ }
+
+ @Override
+ public K key() {
+ return key;
+ }
+
+ public WindowedValue<ElemT> value() {
+ return value;
+ }
+
+ @Override
+ public Iterable<TimerInternals.TimerData> timersIterable() {
+ return Collections.<TimerInternals.TimerData>emptyList();
+ }
+
+ @Override
+ public Iterable<WindowedValue<ElemT>> elementsIterable() {
+ return Collections.singletonList(value);
+ }
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItemCoder.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItemCoder.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItemCoder.java
new file mode 100644
index 0000000..9a52330
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItemCoder.java
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.google.common.collect.ImmutableList;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.List;
+import org.apache.beam.runners.core.KeyedWorkItem;
+import org.apache.beam.runners.core.KeyedWorkItemCoder;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.CoderException;
+import org.apache.beam.sdk.coders.StandardCoder;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.util.PropertyNames;
+import org.apache.beam.sdk.util.WindowedValue;
+
+/**
+ * Singleton keyed work item coder.
+ */
+public class SingletonKeyedWorkItemCoder<K, ElemT>
+ extends StandardCoder<SingletonKeyedWorkItem<K, ElemT>> {
+ /**
+ * Create a new {@link SingletonKeyedWorkItemCoder} with the provided key coder, element coder,
+ * and window coder.
+ */
+ public static <K, ElemT> SingletonKeyedWorkItemCoder<K, ElemT> of(
+ Coder<K> keyCoder, Coder<ElemT> elemCoder, Coder<? extends BoundedWindow> windowCoder) {
+ return new SingletonKeyedWorkItemCoder<>(keyCoder, elemCoder, windowCoder);
+ }
+
+ @JsonCreator
+ public static <K, ElemT> SingletonKeyedWorkItemCoder<K, ElemT> of(
+ @JsonProperty(PropertyNames.COMPONENT_ENCODINGS) List<Coder<?>> components) {
+ checkArgument(components.size() == 3, "Expecting 3 components, got %s", components.size());
+ @SuppressWarnings("unchecked")
+ Coder<K> keyCoder = (Coder<K>) components.get(0);
+ @SuppressWarnings("unchecked")
+ Coder<ElemT> elemCoder = (Coder<ElemT>) components.get(1);
+ @SuppressWarnings("unchecked")
+ Coder<? extends BoundedWindow> windowCoder = (Coder<? extends BoundedWindow>) components.get(2);
+ return new SingletonKeyedWorkItemCoder<>(keyCoder, elemCoder, windowCoder);
+ }
+
+ private final Coder<K> keyCoder;
+ private final Coder<ElemT> elemCoder;
+ private final Coder<? extends BoundedWindow> windowCoder;
+ private final WindowedValue.FullWindowedValueCoder<ElemT> valueCoder;
+
+ private SingletonKeyedWorkItemCoder(
+ Coder<K> keyCoder, Coder<ElemT> elemCoder, Coder<? extends BoundedWindow> windowCoder) {
+ this.keyCoder = keyCoder;
+ this.elemCoder = elemCoder;
+ this.windowCoder = windowCoder;
+ valueCoder = WindowedValue.FullWindowedValueCoder.of(elemCoder, windowCoder);
+ }
+
+ public Coder<K> getKeyCoder() {
+ return keyCoder;
+ }
+
+ public Coder<ElemT> getElementCoder() {
+ return elemCoder;
+ }
+
+ @Override
+ public void encode(SingletonKeyedWorkItem<K, ElemT> value,
+ OutputStream outStream,
+ Context context)
+ throws CoderException, IOException {
+ keyCoder.encode(value.key(), outStream, context.nested());
+ valueCoder.encode(value.value, outStream, context);
+ }
+
+ @Override
+ public SingletonKeyedWorkItem<K, ElemT> decode(InputStream inStream, Context context)
+ throws CoderException, IOException {
+ K key = keyCoder.decode(inStream, context.nested());
+ WindowedValue<ElemT> value = valueCoder.decode(inStream, context);
+ return new SingletonKeyedWorkItem<>(key, value);
+ }
+
+ @Override
+ public List<? extends Coder<?>> getCoderArguments() {
+ return ImmutableList.of(keyCoder, elemCoder, windowCoder);
+ }
+
+ @Override
+ public void verifyDeterministic() throws NonDeterministicException {
+ keyCoder.verifyDeterministic();
+ elemCoder.verifyDeterministic();
+ windowCoder.verifyDeterministic();
+ }
+
+ /**
+ * {@inheritDoc}
+ *
+ * <p>{@link SingletonKeyedWorkItemCoder} is not consistent with equals as it can return a
+ * {@link KeyedWorkItem} of a type different from the originally encoded type.
+ */
+ @Override
+ public boolean consistentWithEquals() {
+ return false;
+ }
+
+}
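For reference, a minimal round trip through the new coder shows how the key, element, and window coders compose. This is only a sketch, not part of the commit; it assumes it lives in the same streaming wrappers package (so the SingletonKeyedWorkItem constructor used by decode() is accessible) and it picks StringUtf8Coder, VarLongCoder, and the global window coder purely for illustration.

package org.apache.beam.runners.flink.translation.wrappers.streaming;

import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarLongCoder;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.beam.sdk.util.WindowedValue;

public class SingletonKeyedWorkItemCoderRoundTrip {
  public static void main(String[] args) throws Exception {
    SingletonKeyedWorkItemCoder<String, Long> coder =
        SingletonKeyedWorkItemCoder.of(
            StringUtf8Coder.of(), VarLongCoder.of(), GlobalWindow.Coder.INSTANCE);

    // Pair a key with a value in the global window.
    SingletonKeyedWorkItem<String, Long> item =
        new SingletonKeyedWorkItem<>("user-1", WindowedValue.valueInGlobalWindow(42L));

    // Encode and decode; the decoded item carries the same key and windowed value.
    byte[] bytes = CoderUtils.encodeToByteArray(coder, item);
    SingletonKeyedWorkItem<String, Long> decoded = CoderUtils.decodeFromByteArray(coder, bytes);

    System.out.println(decoded.key());               // user-1
    System.out.println(decoded.value().getValue());  // 42
  }
}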
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SplittableDoFnOperator.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SplittableDoFnOperator.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SplittableDoFnOperator.java
new file mode 100644
index 0000000..40f70e4
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SplittableDoFnOperator.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming;
+
+import static com.google.common.base.Preconditions.checkState;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.Executors;
+import org.apache.beam.runners.core.ElementAndRestriction;
+import org.apache.beam.runners.core.KeyedWorkItem;
+import org.apache.beam.runners.core.KeyedWorkItems;
+import org.apache.beam.runners.core.OutputAndTimeBoundedSplittableProcessElementInvoker;
+import org.apache.beam.runners.core.OutputWindowedValue;
+import org.apache.beam.runners.core.SplittableParDo;
+import org.apache.beam.runners.core.StateInternals;
+import org.apache.beam.runners.core.StateInternalsFactory;
+import org.apache.beam.runners.core.TimerInternals;
+import org.apache.beam.runners.core.TimerInternalsFactory;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.splittabledofn.RestrictionTracker;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.PaneInfo;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.beam.sdk.values.TupleTag;
+import org.apache.flink.streaming.api.operators.InternalTimer;
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+
+/**
+ * Flink operator for executing splittable {@link DoFn DoFns}. Specifically, for executing
+ * the {@code @ProcessElement} method of a splittable {@link DoFn}.
+ */
+public class SplittableDoFnOperator<
+ InputT, FnOutputT, OutputT, RestrictionT, TrackerT extends RestrictionTracker<RestrictionT>>
+ extends DoFnOperator<
+ KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>, FnOutputT, OutputT> {
+
+ public SplittableDoFnOperator(
+ DoFn<KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>, FnOutputT> doFn,
+ Coder<
+ WindowedValue<
+ KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>>> inputCoder,
+ TupleTag<FnOutputT> mainOutputTag,
+ List<TupleTag<?>> additionalOutputTags,
+ OutputManagerFactory<OutputT> outputManagerFactory,
+ WindowingStrategy<?, ?> windowingStrategy,
+ Map<Integer, PCollectionView<?>> sideInputTagMapping,
+ Collection<PCollectionView<?>> sideInputs,
+ PipelineOptions options,
+ Coder<?> keyCoder) {
+ super(
+ doFn,
+ inputCoder,
+ mainOutputTag,
+ additionalOutputTags,
+ outputManagerFactory,
+ windowingStrategy,
+ sideInputTagMapping,
+ sideInputs,
+ options,
+ keyCoder);
+
+ }
+
+ @Override
+ public void open() throws Exception {
+ super.open();
+
+ checkState(doFn instanceof SplittableParDo.ProcessFn);
+
+ StateInternalsFactory<String> stateInternalsFactory = new StateInternalsFactory<String>() {
+ @Override
+ public StateInternals<String> stateInternalsForKey(String key) {
+ //this will implicitly be keyed by the key of the incoming
+ // element or by the key of a firing timer
+ return (StateInternals<String>) stateInternals;
+ }
+ };
+ TimerInternalsFactory<String> timerInternalsFactory = new TimerInternalsFactory<String>() {
+ @Override
+ public TimerInternals timerInternalsForKey(String key) {
+ //this will implicitly be keyed like the StateInternalsFactory
+ return timerInternals;
+ }
+ };
+
+ ((SplittableParDo.ProcessFn) doFn).setStateInternalsFactory(stateInternalsFactory);
+ ((SplittableParDo.ProcessFn) doFn).setTimerInternalsFactory(timerInternalsFactory);
+ ((SplittableParDo.ProcessFn) doFn).setProcessElementInvoker(
+ new OutputAndTimeBoundedSplittableProcessElementInvoker<>(
+ doFn,
+ serializedOptions.getPipelineOptions(),
+ new OutputWindowedValue<FnOutputT>() {
+ @Override
+ public void outputWindowedValue(
+ FnOutputT output,
+ Instant timestamp,
+ Collection<? extends BoundedWindow> windows,
+ PaneInfo pane) {
+ outputManager.output(
+ mainOutputTag,
+ WindowedValue.of(output, timestamp, windows, pane));
+ }
+
+ @Override
+ public <AdditionalOutputT> void outputWindowedValue(
+ TupleTag<AdditionalOutputT> tag,
+ AdditionalOutputT output,
+ Instant timestamp,
+ Collection<? extends BoundedWindow> windows,
+ PaneInfo pane) {
+ outputManager.output(tag, WindowedValue.of(output, timestamp, windows, pane));
+ }
+ },
+ sideInputReader,
+ Executors.newSingleThreadScheduledExecutor(Executors.defaultThreadFactory()),
+ 10000,
+ Duration.standardSeconds(10)));
+ }
+
+ @Override
+ public void fireTimer(InternalTimer<?, TimerInternals.TimerData> timer) {
+ doFnRunner.processElement(WindowedValue.valueInGlobalWindow(
+ KeyedWorkItems.<String, ElementAndRestriction<InputT, RestrictionT>>timersWorkItem(
+ (String) stateInternals.getKey(),
+ Collections.singletonList(timer.getNamespace()))));
+ }
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WindowDoFnOperator.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WindowDoFnOperator.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WindowDoFnOperator.java
new file mode 100644
index 0000000..9b2136c
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WindowDoFnOperator.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming;
+
+import static org.apache.beam.runners.core.TimerInternals.TimerData;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import org.apache.beam.runners.core.GroupAlsoByWindowViaWindowSetNewDoFn;
+import org.apache.beam.runners.core.KeyedWorkItem;
+import org.apache.beam.runners.core.KeyedWorkItems;
+import org.apache.beam.runners.core.StateInternals;
+import org.apache.beam.runners.core.StateInternalsFactory;
+import org.apache.beam.runners.core.SystemReduceFn;
+import org.apache.beam.runners.core.TimerInternals;
+import org.apache.beam.runners.core.TimerInternalsFactory;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.beam.sdk.values.TupleTag;
+import org.apache.flink.streaming.api.operators.InternalTimer;
+
+/**
+ * Flink operator for executing window {@link DoFn DoFns}, i.e. the GroupAlsoByWindow part of
+ * a {@code GroupByKey} or {@code Combine.PerKey}.
+ */
+public class WindowDoFnOperator<K, InputT, OutputT>
+ extends DoFnOperator<KeyedWorkItem<K, InputT>, KV<K, OutputT>, WindowedValue<KV<K, OutputT>>> {
+
+ private final SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow> systemReduceFn;
+
+ public WindowDoFnOperator(
+ SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow> systemReduceFn,
+ Coder<WindowedValue<KeyedWorkItem<K, InputT>>> inputCoder,
+ TupleTag<KV<K, OutputT>> mainOutputTag,
+ List<TupleTag<?>> additionalOutputTags,
+ OutputManagerFactory<WindowedValue<KV<K, OutputT>>> outputManagerFactory,
+ WindowingStrategy<?, ?> windowingStrategy,
+ Map<Integer, PCollectionView<?>> sideInputTagMapping,
+ Collection<PCollectionView<?>> sideInputs,
+ PipelineOptions options,
+ Coder<K> keyCoder) {
+ super(
+ null,
+ inputCoder,
+ mainOutputTag,
+ additionalOutputTags,
+ outputManagerFactory,
+ windowingStrategy,
+ sideInputTagMapping,
+ sideInputs,
+ options,
+ keyCoder);
+
+ this.systemReduceFn = systemReduceFn;
+
+ }
+
+ @Override
+ protected DoFn<KeyedWorkItem<K, InputT>, KV<K, OutputT>> getDoFn() {
+ StateInternalsFactory<K> stateInternalsFactory = new StateInternalsFactory<K>() {
+ @Override
+ public StateInternals<K> stateInternalsForKey(K key) {
+ //this will implicitly be keyed by the key of the incoming
+ // element or by the key of a firing timer
+ return (StateInternals<K>) stateInternals;
+ }
+ };
+ TimerInternalsFactory<K> timerInternalsFactory = new TimerInternalsFactory<K>() {
+ @Override
+ public TimerInternals timerInternalsForKey(K key) {
+ //this will implicitly be keyed like the StateInternalsFactory
+ return timerInternals;
+ }
+ };
+
+ // we have to do the unchecked cast because GroupAlsoByWindowViaWindowSetNewDoFn.create
+ // has the window type as a generic parameter while WindowingStrategy is almost always
+ // untyped.
+ @SuppressWarnings("unchecked")
+ DoFn<KeyedWorkItem<K, InputT>, KV<K, OutputT>> doFn =
+ GroupAlsoByWindowViaWindowSetNewDoFn.create(
+ windowingStrategy, stateInternalsFactory, timerInternalsFactory, sideInputReader,
+ (SystemReduceFn) systemReduceFn, outputManager, mainOutputTag);
+ return doFn;
+ }
+
+ @Override
+ public void fireTimer(InternalTimer<?, TimerData> timer) {
+ doFnRunner.processElement(WindowedValue.valueInGlobalWindow(
+ KeyedWorkItems.<K, InputT>timersWorkItem(
+ (K) stateInternals.getKey(),
+ Collections.singletonList(timer.getNamespace()))));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WorkItemKeySelector.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WorkItemKeySelector.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WorkItemKeySelector.java
new file mode 100644
index 0000000..1dff367
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WorkItemKeySelector.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming;
+
+import java.nio.ByteBuffer;
+import org.apache.beam.runners.core.KeyedWorkItem;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.util.CoderUtils;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.java.functions.KeySelector;
+import org.apache.flink.api.java.typeutils.GenericTypeInfo;
+import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
+
+/**
+ * {@link KeySelector} that retrieves a key from a {@link KeyedWorkItem}. This will return
+ * the key as encoded by the provided {@link Coder} in a {@link ByteBuffer}. This ensures
+ * that all key comparisons/hashing happen on the encoded form.
+ */
+public class WorkItemKeySelector<K, V>
+ implements KeySelector<WindowedValue<SingletonKeyedWorkItem<K, V>>, ByteBuffer>,
+ ResultTypeQueryable<ByteBuffer> {
+
+ private final Coder<K> keyCoder;
+
+ public WorkItemKeySelector(Coder<K> keyCoder) {
+ this.keyCoder = keyCoder;
+ }
+
+ @Override
+ public ByteBuffer getKey(WindowedValue<SingletonKeyedWorkItem<K, V>> value) throws Exception {
+ K key = value.getValue().key();
+ byte[] keyBytes = CoderUtils.encodeToByteArray(keyCoder, key);
+ return ByteBuffer.wrap(keyBytes);
+ }
+
+ @Override
+ public TypeInformation<ByteBuffer> getProducedType() {
+ return new GenericTypeInfo<>(ByteBuffer.class);
+ }
+}
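As a rough illustration (not part of the commit), the selector can be exercised directly to show that key extraction happens on the coder-encoded bytes; in a real job it would be passed to DataStream#keyBy. The demo assumes same-package access to the SingletonKeyedWorkItem constructor and uses StringUtf8Coder only as an example.

package org.apache.beam.runners.flink.translation.wrappers.streaming;

import java.nio.ByteBuffer;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.util.WindowedValue;

public class WorkItemKeySelectorDemo {
  public static void main(String[] args) throws Exception {
    WorkItemKeySelector<String, Long> selector =
        new WorkItemKeySelector<>(StringUtf8Coder.of());

    SingletonKeyedWorkItem<String, Long> item =
        new SingletonKeyedWorkItem<>("user-1", WindowedValue.valueInGlobalWindow(42L));

    // Keys extracted from logically equal work items are identical ByteBuffers,
    // so Flink partitions/hashes on the encoded form rather than on Java equality.
    ByteBuffer k1 = selector.getKey(WindowedValue.valueInGlobalWindow(item));
    ByteBuffer k2 = selector.getKey(WindowedValue.valueInGlobalWindow(item));
    System.out.println(k1.equals(k2));  // true
  }
}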
[29/50] [abbrv] beam git commit: [BEAM-1994] Remove Flink examples package
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingNonShuffleReduceFunction.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingNonShuffleReduceFunction.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingNonShuffleReduceFunction.java
new file mode 100644
index 0000000..26fd0b4
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingNonShuffleReduceFunction.java
@@ -0,0 +1,228 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.functions;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import org.apache.beam.runners.core.PerKeyCombineFnRunner;
+import org.apache.beam.runners.core.PerKeyCombineFnRunners;
+import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.transforms.CombineFnBase;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
+import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
+import org.apache.beam.sdk.transforms.windowing.PaneInfo;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.flink.api.common.functions.RichGroupReduceFunction;
+import org.apache.flink.util.Collector;
+import org.joda.time.Instant;
+
+/**
+ * Special version of {@link FlinkReduceFunction} that supports merging windows. This
+ * assumes that the windows are {@link IntervalWindow IntervalWindows} and exhibits the
+ * same behaviour as {@code MergeOverlappingIntervalWindows}.
+ *
+ * <p>This is different from the pair of functions for the non-merging windows case
+ * in that we cannot do combining before the shuffle because elements would not
+ * yet be in their correct windows for side-input access.
+ */
+public class FlinkMergingNonShuffleReduceFunction<
+ K, InputT, AccumT, OutputT, W extends IntervalWindow>
+ extends RichGroupReduceFunction<WindowedValue<KV<K, InputT>>, WindowedValue<KV<K, OutputT>>> {
+
+ private final CombineFnBase.PerKeyCombineFn<K, InputT, AccumT, OutputT> combineFn;
+
+ private final WindowingStrategy<?, W> windowingStrategy;
+
+ private final Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs;
+
+ private final SerializedPipelineOptions serializedOptions;
+
+ public FlinkMergingNonShuffleReduceFunction(
+ CombineFnBase.PerKeyCombineFn<K, InputT, AccumT, OutputT> keyedCombineFn,
+ WindowingStrategy<?, W> windowingStrategy,
+ Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs,
+ PipelineOptions pipelineOptions) {
+
+ this.combineFn = keyedCombineFn;
+
+ this.windowingStrategy = windowingStrategy;
+ this.sideInputs = sideInputs;
+
+ this.serializedOptions = new SerializedPipelineOptions(pipelineOptions);
+
+ }
+
+ @Override
+ public void reduce(
+ Iterable<WindowedValue<KV<K, InputT>>> elements,
+ Collector<WindowedValue<KV<K, OutputT>>> out) throws Exception {
+
+ PipelineOptions options = serializedOptions.getPipelineOptions();
+
+ FlinkSideInputReader sideInputReader =
+ new FlinkSideInputReader(sideInputs, getRuntimeContext());
+
+ PerKeyCombineFnRunner<K, InputT, AccumT, OutputT> combineFnRunner =
+ PerKeyCombineFnRunners.create(combineFn);
+
+ @SuppressWarnings("unchecked")
+ OutputTimeFn<? super BoundedWindow> outputTimeFn =
+ (OutputTimeFn<? super BoundedWindow>) windowingStrategy.getOutputTimeFn();
+
+ // get all elements so that we can sort them; they have to fit into
+ // memory. This seems very imprudent, but it is correct, for now.
+ List<WindowedValue<KV<K, InputT>>> sortedInput = Lists.newArrayList();
+ for (WindowedValue<KV<K, InputT>> inputValue : elements) {
+ for (WindowedValue<KV<K, InputT>> exploded : inputValue.explodeWindows()) {
+ sortedInput.add(exploded);
+ }
+ }
+ Collections.sort(sortedInput, new Comparator<WindowedValue<KV<K, InputT>>>() {
+ @Override
+ public int compare(
+ WindowedValue<KV<K, InputT>> o1,
+ WindowedValue<KV<K, InputT>> o2) {
+ return Iterables.getOnlyElement(o1.getWindows()).maxTimestamp()
+ .compareTo(Iterables.getOnlyElement(o2.getWindows()).maxTimestamp());
+ }
+ });
+
+ // merge windows, we have to do it in an extra pre-processing step and
+ // can't do it as we go since the window of early elements would not
+ // be correct when calling the CombineFn
+ mergeWindow(sortedInput);
+
+ // iterate over the elements that are sorted by window timestamp
+ final Iterator<WindowedValue<KV<K, InputT>>> iterator = sortedInput.iterator();
+
+ // create accumulator using the first element's key
+ WindowedValue<KV<K, InputT>> currentValue = iterator.next();
+ K key = currentValue.getValue().getKey();
+ IntervalWindow currentWindow =
+ (IntervalWindow) Iterables.getOnlyElement(currentValue.getWindows());
+ InputT firstValue = currentValue.getValue().getValue();
+ AccumT accumulator =
+ combineFnRunner.createAccumulator(key, options, sideInputReader, currentValue.getWindows());
+ accumulator = combineFnRunner.addInput(key, accumulator, firstValue,
+ options, sideInputReader, currentValue.getWindows());
+
+ // we use this to keep track of the timestamps assigned by the OutputTimeFn
+ Instant windowTimestamp =
+ outputTimeFn.assignOutputTime(currentValue.getTimestamp(), currentWindow);
+
+ while (iterator.hasNext()) {
+ WindowedValue<KV<K, InputT>> nextValue = iterator.next();
+ IntervalWindow nextWindow =
+ (IntervalWindow) Iterables.getOnlyElement(nextValue.getWindows());
+
+ if (currentWindow.equals(nextWindow)) {
+ // continue accumulating and merge windows
+
+ InputT value = nextValue.getValue().getValue();
+ accumulator = combineFnRunner.addInput(key, accumulator, value,
+ options, sideInputReader, currentValue.getWindows());
+
+ windowTimestamp = outputTimeFn.combine(
+ windowTimestamp,
+ outputTimeFn.assignOutputTime(nextValue.getTimestamp(), currentWindow));
+
+ } else {
+ // emit the value that we currently have
+ out.collect(
+ WindowedValue.of(
+ KV.of(key, combineFnRunner.extractOutput(key, accumulator,
+ options, sideInputReader, currentValue.getWindows())),
+ windowTimestamp,
+ currentWindow,
+ PaneInfo.NO_FIRING));
+
+ currentWindow = nextWindow;
+ currentValue = nextValue;
+ InputT value = nextValue.getValue().getValue();
+ accumulator = combineFnRunner.createAccumulator(key,
+ options, sideInputReader, currentValue.getWindows());
+ accumulator = combineFnRunner.addInput(key, accumulator, value,
+ options, sideInputReader, currentValue.getWindows());
+ windowTimestamp = outputTimeFn.assignOutputTime(nextValue.getTimestamp(), currentWindow);
+ }
+
+ }
+
+ // emit the final accumulator
+ out.collect(
+ WindowedValue.of(
+ KV.of(key, combineFnRunner.extractOutput(key, accumulator,
+ options, sideInputReader, currentValue.getWindows())),
+ windowTimestamp,
+ currentWindow,
+ PaneInfo.NO_FIRING));
+ }
+
+ /**
+ * Merge windows. This assumes that the list of elements is sorted by window-end timestamp.
+ * This replaces windows in the input list.
+ */
+ private void mergeWindow(List<WindowedValue<KV<K, InputT>>> elements) {
+ int currentStart = 0;
+ IntervalWindow currentWindow =
+ (IntervalWindow) Iterables.getOnlyElement(elements.get(0).getWindows());
+
+ for (int i = 1; i < elements.size(); i++) {
+ WindowedValue<KV<K, InputT>> nextValue = elements.get(i);
+ IntervalWindow nextWindow =
+ (IntervalWindow) Iterables.getOnlyElement(nextValue.getWindows());
+ if (currentWindow.intersects(nextWindow)) {
+ // we continue
+ currentWindow = currentWindow.span(nextWindow);
+ } else {
+ // retrofit the merged window to all windows up to "currentStart"
+ for (int j = i - 1; j >= currentStart; j--) {
+ WindowedValue<KV<K, InputT>> value = elements.get(j);
+ elements.set(
+ j,
+ WindowedValue.of(
+ value.getValue(), value.getTimestamp(), currentWindow, value.getPane()));
+ }
+ currentStart = i;
+ currentWindow = nextWindow;
+ }
+ }
+ if (currentStart < elements.size() - 1) {
+ // we have to retrofit the last batch
+ for (int j = elements.size() - 1; j >= currentStart; j--) {
+ WindowedValue<KV<K, InputT>> value = elements.get(j);
+ elements.set(
+ j,
+ WindowedValue.of(
+ value.getValue(), value.getTimestamp(), currentWindow, value.getPane()));
+ }
+ }
+ }
+
+}
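The mergeWindow pre-processing step above follows the same overlap-and-span idea as MergeOverlappingIntervalWindows. A standalone sketch of that idea (not part of the commit, using only the public IntervalWindow API) looks like this:

import com.google.common.collect.ImmutableList;
import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
import org.joda.time.Instant;

public class MergeSortedIntervalWindows {
  // Collapses IntervalWindows (sorted by end timestamp) into non-overlapping
  // spans, mirroring the merge that mergeWindow() retrofits onto the elements.
  static List<IntervalWindow> merge(List<IntervalWindow> sorted) {
    List<IntervalWindow> merged = new ArrayList<>();
    IntervalWindow current = sorted.get(0);
    for (int i = 1; i < sorted.size(); i++) {
      IntervalWindow next = sorted.get(i);
      if (current.intersects(next)) {
        current = current.span(next);   // grow the current merged window
      } else {
        merged.add(current);            // close it out and start a new span
        current = next;
      }
    }
    merged.add(current);
    return merged;
  }

  public static void main(String[] args) {
    IntervalWindow a = new IntervalWindow(new Instant(0), new Instant(10));
    IntervalWindow b = new IntervalWindow(new Instant(5), new Instant(15));
    IntervalWindow c = new IntervalWindow(new Instant(20), new Instant(30));
    // a and b overlap and merge into one window spanning [0, 15); c stays separate.
    System.out.println(merge(ImmutableList.of(a, b, c)));
  }
}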
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingPartialReduceFunction.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingPartialReduceFunction.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingPartialReduceFunction.java
new file mode 100644
index 0000000..c68f155
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingPartialReduceFunction.java
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.functions;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import org.apache.beam.runners.core.PerKeyCombineFnRunner;
+import org.apache.beam.runners.core.PerKeyCombineFnRunners;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.transforms.CombineFnBase;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
+import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
+import org.apache.beam.sdk.transforms.windowing.PaneInfo;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.flink.util.Collector;
+import org.joda.time.Instant;
+
+/**
+ * Special version of {@link FlinkPartialReduceFunction} that supports merging windows. This
+ * assumes that the windows are {@link IntervalWindow IntervalWindows} and exhibits the
+ * same behaviour as {@code MergeOverlappingIntervalWindows}.
+ */
+public class FlinkMergingPartialReduceFunction<K, InputT, AccumT, W extends IntervalWindow>
+ extends FlinkPartialReduceFunction<K, InputT, AccumT, W> {
+
+ public FlinkMergingPartialReduceFunction(
+ CombineFnBase.PerKeyCombineFn<K, InputT, AccumT, ?> combineFn,
+ WindowingStrategy<?, W> windowingStrategy,
+ Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs,
+ PipelineOptions pipelineOptions) {
+ super(combineFn, windowingStrategy, sideInputs, pipelineOptions);
+ }
+
+ @Override
+ public void combine(
+ Iterable<WindowedValue<KV<K, InputT>>> elements,
+ Collector<WindowedValue<KV<K, AccumT>>> out) throws Exception {
+
+ PipelineOptions options = serializedOptions.getPipelineOptions();
+
+ FlinkSideInputReader sideInputReader =
+ new FlinkSideInputReader(sideInputs, getRuntimeContext());
+
+ PerKeyCombineFnRunner<K, InputT, AccumT, ?> combineFnRunner =
+ PerKeyCombineFnRunners.create(combineFn);
+
+ @SuppressWarnings("unchecked")
+ OutputTimeFn<? super BoundedWindow> outputTimeFn =
+ (OutputTimeFn<? super BoundedWindow>) windowingStrategy.getOutputTimeFn();
+
+ // get all elements so that we can sort them; they have to fit into
+ // memory. This seems very imprudent, but it is correct, for now.
+ List<WindowedValue<KV<K, InputT>>> sortedInput = Lists.newArrayList();
+ for (WindowedValue<KV<K, InputT>> inputValue : elements) {
+ for (WindowedValue<KV<K, InputT>> exploded : inputValue.explodeWindows()) {
+ sortedInput.add(exploded);
+ }
+ }
+ Collections.sort(sortedInput, new Comparator<WindowedValue<KV<K, InputT>>>() {
+ @Override
+ public int compare(
+ WindowedValue<KV<K, InputT>> o1,
+ WindowedValue<KV<K, InputT>> o2) {
+ return Iterables.getOnlyElement(o1.getWindows()).maxTimestamp()
+ .compareTo(Iterables.getOnlyElement(o2.getWindows()).maxTimestamp());
+ }
+ });
+
+ // merge windows, we have to do it in an extra pre-processing step and
+ // can't do it as we go since the window of early elements would not
+ // be correct when calling the CombineFn
+ mergeWindow(sortedInput);
+
+ // iterate over the elements that are sorted by window timestamp
+ final Iterator<WindowedValue<KV<K, InputT>>> iterator = sortedInput.iterator();
+
+ // create accumulator using the first element's key
+ WindowedValue<KV<K, InputT>> currentValue = iterator.next();
+ K key = currentValue.getValue().getKey();
+ IntervalWindow currentWindow =
+ (IntervalWindow) Iterables.getOnlyElement(currentValue.getWindows());
+ InputT firstValue = currentValue.getValue().getValue();
+ AccumT accumulator = combineFnRunner.createAccumulator(key,
+ options, sideInputReader, currentValue.getWindows());
+ accumulator = combineFnRunner.addInput(key, accumulator, firstValue,
+ options, sideInputReader, currentValue.getWindows());
+
+ // we use this to keep track of the timestamps assigned by the OutputTimeFn
+ Instant windowTimestamp =
+ outputTimeFn.assignOutputTime(currentValue.getTimestamp(), currentWindow);
+
+ while (iterator.hasNext()) {
+ WindowedValue<KV<K, InputT>> nextValue = iterator.next();
+ IntervalWindow nextWindow = (IntervalWindow) Iterables.getOnlyElement(nextValue.getWindows());
+
+ if (currentWindow.equals(nextWindow)) {
+ // continue accumulating and merge windows
+
+ InputT value = nextValue.getValue().getValue();
+ accumulator = combineFnRunner.addInput(key, accumulator, value,
+ options, sideInputReader, currentValue.getWindows());
+
+ windowTimestamp = outputTimeFn.combine(
+ windowTimestamp,
+ outputTimeFn.assignOutputTime(nextValue.getTimestamp(), currentWindow));
+
+ } else {
+ // emit the value that we currently have
+ out.collect(
+ WindowedValue.of(
+ KV.of(key, accumulator),
+ windowTimestamp,
+ currentWindow,
+ PaneInfo.NO_FIRING));
+
+ currentWindow = nextWindow;
+ currentValue = nextValue;
+ InputT value = nextValue.getValue().getValue();
+ accumulator = combineFnRunner.createAccumulator(key,
+ options, sideInputReader, currentValue.getWindows());
+ accumulator = combineFnRunner.addInput(key, accumulator, value,
+ options, sideInputReader, currentValue.getWindows());
+ windowTimestamp = outputTimeFn.assignOutputTime(nextValue.getTimestamp(), currentWindow);
+ }
+ }
+
+ // emit the final accumulator
+ out.collect(
+ WindowedValue.of(
+ KV.of(key, accumulator),
+ windowTimestamp,
+ currentWindow,
+ PaneInfo.NO_FIRING));
+ }
+
+ /**
+ * Merge windows. This assumes that the list of elements is sorted by window-end timestamp.
+ * This replaces windows in the input list.
+ */
+ private void mergeWindow(List<WindowedValue<KV<K, InputT>>> elements) {
+ int currentStart = 0;
+ IntervalWindow currentWindow =
+ (IntervalWindow) Iterables.getOnlyElement(elements.get(0).getWindows());
+
+ for (int i = 1; i < elements.size(); i++) {
+ WindowedValue<KV<K, InputT>> nextValue = elements.get(i);
+ IntervalWindow nextWindow =
+ (IntervalWindow) Iterables.getOnlyElement(nextValue.getWindows());
+ if (currentWindow.intersects(nextWindow)) {
+ // we continue
+ currentWindow = currentWindow.span(nextWindow);
+ } else {
+ // retrofit the merged window to all windows up to "currentStart"
+ for (int j = i - 1; j >= currentStart; j--) {
+ WindowedValue<KV<K, InputT>> value = elements.get(j);
+ elements.set(
+ j,
+ WindowedValue.of(
+ value.getValue(), value.getTimestamp(), currentWindow, value.getPane()));
+ }
+ currentStart = i;
+ currentWindow = nextWindow;
+ }
+ }
+ if (currentStart < elements.size() - 1) {
+ // we have to retrofit the last batch
+ for (int j = elements.size() - 1; j >= currentStart; j--) {
+ WindowedValue<KV<K, InputT>> value = elements.get(j);
+ elements.set(
+ j,
+ WindowedValue.of(
+ value.getValue(), value.getTimestamp(), currentWindow, value.getPane()));
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingReduceFunction.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingReduceFunction.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingReduceFunction.java
new file mode 100644
index 0000000..84b3adc
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMergingReduceFunction.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.functions;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import org.apache.beam.runners.core.PerKeyCombineFnRunner;
+import org.apache.beam.runners.core.PerKeyCombineFnRunners;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.transforms.CombineFnBase;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
+import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
+import org.apache.beam.sdk.transforms.windowing.PaneInfo;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.flink.util.Collector;
+import org.joda.time.Instant;
+
+/**
+ * Special version of {@link FlinkReduceFunction} that supports merging windows. This
+ * assumes that the windows are {@link IntervalWindow IntervalWindows} and exhibits the
+ * same behaviour as {@code MergeOverlappingIntervalWindows}.
+ */
+public class FlinkMergingReduceFunction<K, AccumT, OutputT, W extends IntervalWindow>
+ extends FlinkReduceFunction<K, AccumT, OutputT, W> {
+
+ public FlinkMergingReduceFunction(
+ CombineFnBase.PerKeyCombineFn<K, ?, AccumT, OutputT> keyedCombineFn,
+ WindowingStrategy<?, W> windowingStrategy,
+ Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs,
+ PipelineOptions pipelineOptions) {
+ super(keyedCombineFn, windowingStrategy, sideInputs, pipelineOptions);
+ }
+
+ @Override
+ public void reduce(
+ Iterable<WindowedValue<KV<K, AccumT>>> elements,
+ Collector<WindowedValue<KV<K, OutputT>>> out) throws Exception {
+
+ PipelineOptions options = serializedOptions.getPipelineOptions();
+
+ FlinkSideInputReader sideInputReader =
+ new FlinkSideInputReader(sideInputs, getRuntimeContext());
+
+ PerKeyCombineFnRunner<K, ?, AccumT, OutputT> combineFnRunner =
+ PerKeyCombineFnRunners.create(combineFn);
+
+ @SuppressWarnings("unchecked")
+ OutputTimeFn<? super BoundedWindow> outputTimeFn =
+ (OutputTimeFn<? super BoundedWindow>) windowingStrategy.getOutputTimeFn();
+
+ // get all elements so that we can sort them; they have to fit into
+ // memory. This seems very imprudent, but it is correct, for now.
+ ArrayList<WindowedValue<KV<K, AccumT>>> sortedInput = Lists.newArrayList();
+ for (WindowedValue<KV<K, AccumT>> inputValue : elements) {
+ for (WindowedValue<KV<K, AccumT>> exploded : inputValue.explodeWindows()) {
+ sortedInput.add(exploded);
+ }
+ }
+ Collections.sort(sortedInput, new Comparator<WindowedValue<KV<K, AccumT>>>() {
+ @Override
+ public int compare(
+ WindowedValue<KV<K, AccumT>> o1,
+ WindowedValue<KV<K, AccumT>> o2) {
+ return Iterables.getOnlyElement(o1.getWindows()).maxTimestamp()
+ .compareTo(Iterables.getOnlyElement(o2.getWindows()).maxTimestamp());
+ }
+ });
+
+ // merge windows, we have to do it in an extra pre-processing step and
+ // can't do it as we go since the window of early elements would not
+ // be correct when calling the CombineFn
+ mergeWindow(sortedInput);
+
+ // iterate over the elements that are sorted by window timestamp
+ final Iterator<WindowedValue<KV<K, AccumT>>> iterator = sortedInput.iterator();
+
+ // get the first accumulator
+ WindowedValue<KV<K, AccumT>> currentValue = iterator.next();
+ K key = currentValue.getValue().getKey();
+ IntervalWindow currentWindow =
+ (IntervalWindow) Iterables.getOnlyElement(currentValue.getWindows());
+ AccumT accumulator = currentValue.getValue().getValue();
+
+ // we use this to keep track of the timestamps assigned by the OutputTimeFn;
+ // FlinkPartialReduceFunction already combined the timestamps assigned to
+ // individual elements, so here we only have to merge them per window
+ List<Instant> windowTimestamps = new ArrayList<>();
+ windowTimestamps.add(currentValue.getTimestamp());
+
+ while (iterator.hasNext()) {
+ WindowedValue<KV<K, AccumT>> nextValue = iterator.next();
+ IntervalWindow nextWindow =
+ (IntervalWindow) Iterables.getOnlyElement(nextValue.getWindows());
+
+ if (nextWindow.equals(currentWindow)) {
+ // continue accumulating and merge windows
+
+ accumulator = combineFnRunner.mergeAccumulators(
+ key, ImmutableList.of(accumulator, nextValue.getValue().getValue()),
+ options, sideInputReader, currentValue.getWindows());
+
+ windowTimestamps.add(nextValue.getTimestamp());
+ } else {
+ out.collect(
+ WindowedValue.of(
+ KV.of(key, combineFnRunner.extractOutput(key, accumulator,
+ options, sideInputReader, currentValue.getWindows())),
+ outputTimeFn.merge(currentWindow, windowTimestamps),
+ currentWindow,
+ PaneInfo.NO_FIRING));
+
+ windowTimestamps.clear();
+
+ currentWindow = nextWindow;
+ currentValue = nextValue;
+ accumulator = nextValue.getValue().getValue();
+ windowTimestamps.add(nextValue.getTimestamp());
+ }
+ }
+
+ // emit the final accumulator
+ out.collect(
+ WindowedValue.of(
+ KV.of(key, combineFnRunner.extractOutput(key, accumulator,
+ options, sideInputReader, currentValue.getWindows())),
+ outputTimeFn.merge(currentWindow, windowTimestamps),
+ currentWindow,
+ PaneInfo.NO_FIRING));
+ }
+
+ /**
+ * Merge windows. This assumes that the list of elements is sorted by window-end timestamp.
+ * This replaces windows in the input list.
+ */
+ private void mergeWindow(List<WindowedValue<KV<K, AccumT>>> elements) {
+ int currentStart = 0;
+ IntervalWindow currentWindow =
+ (IntervalWindow) Iterables.getOnlyElement(elements.get(0).getWindows());
+
+ for (int i = 1; i < elements.size(); i++) {
+ WindowedValue<KV<K, AccumT>> nextValue = elements.get(i);
+ IntervalWindow nextWindow =
+ (IntervalWindow) Iterables.getOnlyElement(nextValue.getWindows());
+ if (currentWindow.intersects(nextWindow)) {
+ // we continue
+ currentWindow = currentWindow.span(nextWindow);
+ } else {
+ // retrofit the merged window to all windows up to "currentStart"
+ for (int j = i - 1; j >= currentStart; j--) {
+ WindowedValue<KV<K, AccumT>> value = elements.get(j);
+ elements.set(
+ j,
+ WindowedValue.of(
+ value.getValue(), value.getTimestamp(), currentWindow, value.getPane()));
+ }
+ currentStart = i;
+ currentWindow = nextWindow;
+ }
+ }
+ if (currentStart < elements.size() - 1) {
+ // we have to retrofit the last batch
+ for (int j = elements.size() - 1; j >= currentStart; j--) {
+ WindowedValue<KV<K, AccumT>> value = elements.get(j);
+ elements.set(
+ j,
+ WindowedValue.of(
+ value.getValue(), value.getTimestamp(), currentWindow, value.getPane()));
+ }
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMultiOutputPruningFunction.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMultiOutputPruningFunction.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMultiOutputPruningFunction.java
new file mode 100644
index 0000000..9071cc5
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkMultiOutputPruningFunction.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.functions;
+
+import org.apache.beam.sdk.transforms.join.RawUnionValue;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.flink.api.common.functions.FlatMapFunction;
+import org.apache.flink.util.Collector;
+
+/**
+ * A {@link FlatMapFunction} that filters out those elements that don't belong in this
+ * output. We need this to implement MultiOutput ParDo functions in combination with
+ * {@link FlinkDoFnFunction}.
+ */
+public class FlinkMultiOutputPruningFunction<T>
+ implements FlatMapFunction<WindowedValue<RawUnionValue>, WindowedValue<T>> {
+
+ private final int ourOutputTag;
+
+ public FlinkMultiOutputPruningFunction(int ourOutputTag) {
+ this.ourOutputTag = ourOutputTag;
+ }
+
+ @Override
+ @SuppressWarnings("unchecked")
+ public void flatMap(
+ WindowedValue<RawUnionValue> windowedValue,
+ Collector<WindowedValue<T>> collector) throws Exception {
+ int unionTag = windowedValue.getValue().getUnionTag();
+ if (unionTag == ourOutputTag) {
+ collector.collect(
+ (WindowedValue<T>) windowedValue.withValue(windowedValue.getValue().getValue()));
+ }
+ }
+}
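A small usage sketch (not part of the commit): feeding tagged RawUnionValues through the function and collecting into a list via Flink's ListCollector helper, assumed to be on the classpath, shows that only the matching union tag survives.

import java.util.ArrayList;
import java.util.List;
import org.apache.beam.runners.flink.translation.functions.FlinkMultiOutputPruningFunction;
import org.apache.beam.sdk.transforms.join.RawUnionValue;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.flink.api.common.functions.util.ListCollector;

public class PruningDemo {
  public static void main(String[] args) throws Exception {
    // Keep only elements tagged with union tag 1.
    FlinkMultiOutputPruningFunction<String> prune = new FlinkMultiOutputPruningFunction<>(1);

    List<WindowedValue<String>> collected = new ArrayList<>();
    ListCollector<WindowedValue<String>> out = new ListCollector<>(collected);

    prune.flatMap(WindowedValue.valueInGlobalWindow(new RawUnionValue(1, "kept")), out);
    prune.flatMap(WindowedValue.valueInGlobalWindow(new RawUnionValue(2, "dropped")), out);

    System.out.println(collected.size());             // 1
    System.out.println(collected.get(0).getValue());  // kept
  }
}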
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkNoOpStepContext.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkNoOpStepContext.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkNoOpStepContext.java
new file mode 100644
index 0000000..847a00a
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkNoOpStepContext.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.functions;
+
+import java.io.IOException;
+import org.apache.beam.runners.core.ExecutionContext.StepContext;
+import org.apache.beam.runners.core.StateInternals;
+import org.apache.beam.runners.core.TimerInternals;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.values.TupleTag;
+
+/**
+ * A {@link StepContext} for Flink Batch Runner execution.
+ */
+public class FlinkNoOpStepContext implements StepContext {
+
+ @Override
+ public String getStepName() {
+ return null;
+ }
+
+ @Override
+ public String getTransformName() {
+ return null;
+ }
+
+ @Override
+ public void noteOutput(WindowedValue<?> output) {
+
+ }
+
+ @Override
+ public void noteOutput(TupleTag<?> tag, WindowedValue<?> output) {
+
+ }
+
+ @Override
+ public <T, W extends BoundedWindow> void writePCollectionViewData(
+ TupleTag<?> tag,
+ Iterable<WindowedValue<T>> data,
+ Coder<Iterable<WindowedValue<T>>> dataCoder,
+ W window,
+ Coder<W> windowCoder) throws IOException {
+ }
+
+ @Override
+ public StateInternals<?> stateInternals() {
+ return null;
+ }
+
+ @Override
+ public TimerInternals timerInternals() {
+ return null;
+ }
+}
+
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkPartialReduceFunction.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkPartialReduceFunction.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkPartialReduceFunction.java
new file mode 100644
index 0000000..1d1ff9f
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkPartialReduceFunction.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.functions;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.Map;
+import org.apache.beam.runners.core.PerKeyCombineFnRunner;
+import org.apache.beam.runners.core.PerKeyCombineFnRunners;
+import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.transforms.CombineFnBase;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
+import org.apache.beam.sdk.transforms.windowing.PaneInfo;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.flink.api.common.functions.RichGroupCombineFunction;
+import org.apache.flink.util.Collector;
+import org.joda.time.Instant;
+
+/**
+ * This is the first step for executing a {@link org.apache.beam.sdk.transforms.Combine.PerKey}
+ * on Flink. The second part is {@link FlinkReduceFunction}. This function performs a local
+ * combine step before shuffling while the latter does the final combination after a shuffle.
+ *
+ * <p>The inputs to {@link #combine(Iterable, Collector)} are elements of the same key but
+ * for different windows. We have to ensure that we only combine elements of matching
+ * windows.
+ */
+public class FlinkPartialReduceFunction<K, InputT, AccumT, W extends BoundedWindow>
+ extends RichGroupCombineFunction<WindowedValue<KV<K, InputT>>, WindowedValue<KV<K, AccumT>>> {
+
+ protected final CombineFnBase.PerKeyCombineFn<K, InputT, AccumT, ?> combineFn;
+
+ protected final WindowingStrategy<?, W> windowingStrategy;
+
+ protected final SerializedPipelineOptions serializedOptions;
+
+ protected final Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs;
+
+ public FlinkPartialReduceFunction(
+ CombineFnBase.PerKeyCombineFn<K, InputT, AccumT, ?> combineFn,
+ WindowingStrategy<?, W> windowingStrategy,
+ Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs,
+ PipelineOptions pipelineOptions) {
+
+ this.combineFn = combineFn;
+ this.windowingStrategy = windowingStrategy;
+ this.sideInputs = sideInputs;
+ this.serializedOptions = new SerializedPipelineOptions(pipelineOptions);
+
+ }
+
+ @Override
+ public void combine(
+ Iterable<WindowedValue<KV<K, InputT>>> elements,
+ Collector<WindowedValue<KV<K, AccumT>>> out) throws Exception {
+
+ PipelineOptions options = serializedOptions.getPipelineOptions();
+
+ FlinkSideInputReader sideInputReader =
+ new FlinkSideInputReader(sideInputs, getRuntimeContext());
+
+ PerKeyCombineFnRunner<K, InputT, AccumT, ?> combineFnRunner =
+ PerKeyCombineFnRunners.create(combineFn);
+
+ @SuppressWarnings("unchecked")
+ OutputTimeFn<? super BoundedWindow> outputTimeFn =
+ (OutputTimeFn<? super BoundedWindow>) windowingStrategy.getOutputTimeFn();
+
+ // get all elements so that we can sort them; they have to fit into
+ // memory. This seems very imprudent, but it is correct, for now.
+ ArrayList<WindowedValue<KV<K, InputT>>> sortedInput = Lists.newArrayList();
+ for (WindowedValue<KV<K, InputT>> inputValue : elements) {
+ for (WindowedValue<KV<K, InputT>> exploded : inputValue.explodeWindows()) {
+ sortedInput.add(exploded);
+ }
+ }
+ Collections.sort(sortedInput, new Comparator<WindowedValue<KV<K, InputT>>>() {
+ @Override
+ public int compare(
+ WindowedValue<KV<K, InputT>> o1,
+ WindowedValue<KV<K, InputT>> o2) {
+ return Iterables.getOnlyElement(o1.getWindows()).maxTimestamp()
+ .compareTo(Iterables.getOnlyElement(o2.getWindows()).maxTimestamp());
+ }
+ });
+
+ // iterate over the elements that are sorted by window timestamp
+ //
+ final Iterator<WindowedValue<KV<K, InputT>>> iterator = sortedInput.iterator();
+
+ // create accumulator using the first element's key
+ WindowedValue<KV<K, InputT>> currentValue = iterator.next();
+ K key = currentValue.getValue().getKey();
+ BoundedWindow currentWindow = Iterables.getFirst(currentValue.getWindows(), null);
+ InputT firstValue = currentValue.getValue().getValue();
+ AccumT accumulator = combineFnRunner.createAccumulator(key,
+ options, sideInputReader, currentValue.getWindows());
+ accumulator = combineFnRunner.addInput(key, accumulator, firstValue,
+ options, sideInputReader, currentValue.getWindows());
+
+ // we use this to keep track of the timestamps assigned by the OutputTimeFn
+ Instant windowTimestamp =
+ outputTimeFn.assignOutputTime(currentValue.getTimestamp(), currentWindow);
+
+ while (iterator.hasNext()) {
+ WindowedValue<KV<K, InputT>> nextValue = iterator.next();
+ BoundedWindow nextWindow = Iterables.getOnlyElement(nextValue.getWindows());
+
+ if (nextWindow.equals(currentWindow)) {
+ // continue accumulating
+ InputT value = nextValue.getValue().getValue();
+ accumulator = combineFnRunner.addInput(key, accumulator, value,
+ options, sideInputReader, currentValue.getWindows());
+
+ windowTimestamp = outputTimeFn.combine(
+ windowTimestamp,
+ outputTimeFn.assignOutputTime(nextValue.getTimestamp(), currentWindow));
+
+ } else {
+ // emit the value that we currently have
+ out.collect(
+ WindowedValue.of(
+ KV.of(key, accumulator),
+ windowTimestamp,
+ currentWindow,
+ PaneInfo.NO_FIRING));
+
+ currentWindow = nextWindow;
+ currentValue = nextValue;
+ InputT value = nextValue.getValue().getValue();
+ accumulator = combineFnRunner.createAccumulator(key,
+ options, sideInputReader, currentValue.getWindows());
+ accumulator = combineFnRunner.addInput(key, accumulator, value,
+ options, sideInputReader, currentValue.getWindows());
+ windowTimestamp = outputTimeFn.assignOutputTime(nextValue.getTimestamp(), currentWindow);
+ }
+ }
+
+ // emit the final accumulator
+ out.collect(
+ WindowedValue.of(
+ KV.of(key, accumulator),
+ windowTimestamp,
+ currentWindow,
+ PaneInfo.NO_FIRING));
+ }
+}
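To make the partial/final split concrete without the internal PerKeyCombineFnRunner plumbing, here is a hedged sketch with a hand-rolled CombineFn standing in for the user's combiner: the pre-shuffle side folds inputs into accumulators, the post-shuffle side merges accumulators and extracts the output.

import java.util.Arrays;
import org.apache.beam.sdk.transforms.Combine;

public class PartialVsFinalCombine {
  // A trivial sum CombineFn, standing in for the user's combiner.
  static class SumFn extends Combine.CombineFn<Long, Long, Long> {
    @Override public Long createAccumulator() { return 0L; }
    @Override public Long addInput(Long acc, Long in) { return acc + in; }
    @Override public Long mergeAccumulators(Iterable<Long> accs) {
      long sum = 0L;
      for (long a : accs) { sum += a; }
      return sum;
    }
    @Override public Long extractOutput(Long acc) { return acc; }
  }

  public static void main(String[] args) {
    SumFn fn = new SumFn();

    // Pre-shuffle side (FlinkPartialReduceFunction): fold raw inputs of one
    // key/window into an accumulator and emit the accumulator.
    long partialA = fn.addInput(fn.addInput(fn.createAccumulator(), 1L), 2L);  // 3
    long partialB = fn.addInput(fn.createAccumulator(), 4L);                   // 4

    // Post-shuffle side (FlinkReduceFunction): merge the shuffled accumulators
    // of that key/window and extract the final output.
    long result = fn.extractOutput(fn.mergeAccumulators(Arrays.asList(partialA, partialB)));
    System.out.println(result);  // 7
  }
}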
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkReduceFunction.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkReduceFunction.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkReduceFunction.java
new file mode 100644
index 0000000..3e4f742
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkReduceFunction.java
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.functions;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import org.apache.beam.runners.core.PerKeyCombineFnRunner;
+import org.apache.beam.runners.core.PerKeyCombineFnRunners;
+import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.transforms.CombineFnBase;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
+import org.apache.beam.sdk.transforms.windowing.PaneInfo;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.flink.api.common.functions.RichGroupReduceFunction;
+import org.apache.flink.util.Collector;
+import org.joda.time.Instant;
+
+/**
+ * This is the second part of executing a {@link org.apache.beam.sdk.transforms.Combine.PerKey}
+ * on Flink; the first part is {@link FlinkPartialReduceFunction}. This function performs the
+ * final combination of the pre-combined values after a shuffle.
+ *
+ * <p>The input to {@link #reduce(Iterable, Collector)} consists of elements of the same key but
+ * for different windows. We have to ensure that we only combine elements of matching
+ * windows.
+ */
+public class FlinkReduceFunction<K, AccumT, OutputT, W extends BoundedWindow>
+ extends RichGroupReduceFunction<WindowedValue<KV<K, AccumT>>, WindowedValue<KV<K, OutputT>>> {
+
+ protected final CombineFnBase.PerKeyCombineFn<K, ?, AccumT, OutputT> combineFn;
+
+ protected final WindowingStrategy<?, W> windowingStrategy;
+
+ protected final Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs;
+
+ protected final SerializedPipelineOptions serializedOptions;
+
+ public FlinkReduceFunction(
+ CombineFnBase.PerKeyCombineFn<K, ?, AccumT, OutputT> keyedCombineFn,
+ WindowingStrategy<?, W> windowingStrategy,
+ Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs,
+ PipelineOptions pipelineOptions) {
+
+ this.combineFn = keyedCombineFn;
+
+ this.windowingStrategy = windowingStrategy;
+ this.sideInputs = sideInputs;
+
+ this.serializedOptions = new SerializedPipelineOptions(pipelineOptions);
+
+ }
+
+ @Override
+ public void reduce(
+ Iterable<WindowedValue<KV<K, AccumT>>> elements,
+ Collector<WindowedValue<KV<K, OutputT>>> out) throws Exception {
+
+ PipelineOptions options = serializedOptions.getPipelineOptions();
+
+ FlinkSideInputReader sideInputReader =
+ new FlinkSideInputReader(sideInputs, getRuntimeContext());
+
+ PerKeyCombineFnRunner<K, ?, AccumT, OutputT> combineFnRunner =
+ PerKeyCombineFnRunners.create(combineFn);
+
+ @SuppressWarnings("unchecked")
+ OutputTimeFn<? super BoundedWindow> outputTimeFn =
+ (OutputTimeFn<? super BoundedWindow>) windowingStrategy.getOutputTimeFn();
+
+
+ // gather all elements so that we can sort them; this has to fit into
+ // memory
+ // buffering everything is imprudent, but it is correct for now
+ ArrayList<WindowedValue<KV<K, AccumT>>> sortedInput = Lists.newArrayList();
+ for (WindowedValue<KV<K, AccumT>> inputValue: elements) {
+ for (WindowedValue<KV<K, AccumT>> exploded: inputValue.explodeWindows()) {
+ sortedInput.add(exploded);
+ }
+ }
+ Collections.sort(sortedInput, new Comparator<WindowedValue<KV<K, AccumT>>>() {
+ @Override
+ public int compare(
+ WindowedValue<KV<K, AccumT>> o1,
+ WindowedValue<KV<K, AccumT>> o2) {
+ return Iterables.getOnlyElement(o1.getWindows()).maxTimestamp()
+ .compareTo(Iterables.getOnlyElement(o2.getWindows()).maxTimestamp());
+ }
+ });
+
+ // iterate over the elements that are sorted by window timestamp
+ //
+ final Iterator<WindowedValue<KV<K, AccumT>>> iterator = sortedInput.iterator();
+
+ // get the first accumulator
+ WindowedValue<KV<K, AccumT>> currentValue = iterator.next();
+ K key = currentValue.getValue().getKey();
+ BoundedWindow currentWindow = Iterables.getFirst(currentValue.getWindows(), null);
+ AccumT accumulator = currentValue.getValue().getValue();
+
+ // we use this to keep track of the timestamps assigned by the OutputTimeFn;
+ // FlinkPartialReduceFunction already combined the timestamps assigned to
+ // individual elements, so here we only merge the per-window timestamps
+ List<Instant> windowTimestamps = new ArrayList<>();
+ windowTimestamps.add(currentValue.getTimestamp());
+
+ while (iterator.hasNext()) {
+ WindowedValue<KV<K, AccumT>> nextValue = iterator.next();
+ BoundedWindow nextWindow = Iterables.getOnlyElement(nextValue.getWindows());
+
+ if (nextWindow.equals(currentWindow)) {
+ // continue accumulating
+ accumulator = combineFnRunner.mergeAccumulators(
+ key, ImmutableList.of(accumulator, nextValue.getValue().getValue()),
+ options, sideInputReader, currentValue.getWindows());
+
+ windowTimestamps.add(nextValue.getTimestamp());
+ } else {
+ // emit the value that we currently have
+ out.collect(
+ WindowedValue.of(
+ KV.of(key, combineFnRunner.extractOutput(key, accumulator,
+ options, sideInputReader, currentValue.getWindows())),
+ outputTimeFn.merge(currentWindow, windowTimestamps),
+ currentWindow,
+ PaneInfo.NO_FIRING));
+
+ windowTimestamps.clear();
+
+ currentWindow = nextWindow;
+ currentValue = nextValue;
+ accumulator = nextValue.getValue().getValue();
+ windowTimestamps.add(nextValue.getTimestamp());
+ }
+
+ }
+
+ // emit the final accumulator
+ out.collect(
+ WindowedValue.of(
+ KV.of(key, combineFnRunner.extractOutput(key, accumulator,
+ options, sideInputReader, currentValue.getWindows())),
+ outputTimeFn.merge(currentWindow, windowTimestamps),
+ currentWindow,
+ PaneInfo.NO_FIRING));
+ }
+}
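The final reduce differs from the pre-combine in that it receives pre-combined accumulators, so it merges accumulators rather than adding raw inputs, and only extracts the output once per window. A small sketch of that two-step shape with a toy mean combine (sum/count accumulators; all names are illustrative):

import java.util.Arrays;
import java.util.List;

public class FinalMergeSketch {

  // Toy accumulator: running sum and count, as a partial mean combine would produce.
  static class SumCount {
    final long sum;
    final long count;
    SumCount(long sum, long count) { this.sum = sum; this.count = count; }
  }

  static SumCount mergeAccumulators(SumCount a, SumCount b) {
    return new SumCount(a.sum + b.sum, a.count + b.count);
  }

  static double extractOutput(SumCount acc) {
    return acc.count == 0 ? 0.0 : (double) acc.sum / acc.count;
  }

  public static void main(String[] args) {
    // Pre-combined accumulators for one key and one window, as seen after the shuffle.
    List<SumCount> partials = Arrays.asList(new SumCount(10, 2), new SumCount(5, 1));

    SumCount merged = partials.get(0);
    for (int i = 1; i < partials.size(); i++) {
      merged = mergeAccumulators(merged, partials.get(i));   // merge, do not re-add raw inputs
    }
    System.out.println("mean = " + extractOutput(merged));   // prints mean = 5.0
  }
}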
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkSideInputReader.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkSideInputReader.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkSideInputReader.java
new file mode 100644
index 0000000..c317182
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkSideInputReader.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.functions;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import javax.annotation.Nullable;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.util.SideInputReader;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.beam.sdk.values.TupleTag;
+import org.apache.flink.api.common.functions.RuntimeContext;
+
+/**
+ * A {@link SideInputReader} for the Flink Batch Runner.
+ */
+public class FlinkSideInputReader implements SideInputReader {
+
+ private final Map<TupleTag<?>, WindowingStrategy<?, ?>> sideInputs;
+
+ private RuntimeContext runtimeContext;
+
+ public FlinkSideInputReader(Map<PCollectionView<?>, WindowingStrategy<?, ?>> indexByView,
+ RuntimeContext runtimeContext) {
+ sideInputs = new HashMap<>();
+ for (Map.Entry<PCollectionView<?>, WindowingStrategy<?, ?>> entry : indexByView.entrySet()) {
+ sideInputs.put(entry.getKey().getTagInternal(), entry.getValue());
+ }
+ this.runtimeContext = runtimeContext;
+ }
+
+ @Nullable
+ @Override
+ public <T> T get(PCollectionView<T> view, BoundedWindow window) {
+ checkNotNull(view, "View passed to sideInput cannot be null");
+ TupleTag<Iterable<WindowedValue<?>>> tag = view.getTagInternal();
+ checkNotNull(
+ sideInputs.get(tag),
+ "Side input for " + view + " not available.");
+
+ Map<BoundedWindow, T> sideInputs =
+ runtimeContext.getBroadcastVariableWithInitializer(
+ tag.getId(), new SideInputInitializer<>(view));
+ T result = sideInputs.get(window);
+ if (result == null) {
+ result = view.getViewFn().apply(Collections.<WindowedValue<?>>emptyList());
+ }
+ return result;
+ }
+
+ @Override
+ public <T> boolean contains(PCollectionView<T> view) {
+ return sideInputs.containsKey(view.getTagInternal());
+ }
+
+ @Override
+ public boolean isEmpty() {
+ return sideInputs.isEmpty();
+ }
+}
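The essential behavior of the reader's get() is a per-window lookup with a fallback to the view applied to an empty collection when nothing was broadcast for that window. A condensed standalone sketch of that lookup logic, using plain maps instead of the Flink broadcast-variable machinery (hypothetical names):

import java.util.HashMap;
import java.util.Map;
import java.util.function.Supplier;

public class SideInputLookupSketch {

  // Materialized side input: one value per window, keyed here by a window identifier.
  static <T> T lookup(Map<String, T> materialized, String window, Supplier<T> emptyDefault) {
    T result = materialized.get(window);
    // Fall back to the "view of an empty collection" when no element landed in this window.
    return result != null ? result : emptyDefault.get();
  }

  public static void main(String[] args) {
    Map<String, Long> sideInputByWindow = new HashMap<>();
    sideInputByWindow.put("window-A", 42L);

    System.out.println(lookup(sideInputByWindow, "window-A", () -> 0L)); // 42
    System.out.println(lookup(sideInputByWindow, "window-B", () -> 0L)); // 0 (default)
  }
}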
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkStatefulDoFnFunction.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkStatefulDoFnFunction.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkStatefulDoFnFunction.java
new file mode 100644
index 0000000..c8193d2
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkStatefulDoFnFunction.java
@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.functions;
+
+import static org.apache.flink.util.Preconditions.checkArgument;
+
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.Map;
+import org.apache.beam.runners.core.DoFnRunner;
+import org.apache.beam.runners.core.DoFnRunners;
+import org.apache.beam.runners.core.InMemoryStateInternals;
+import org.apache.beam.runners.core.InMemoryTimerInternals;
+import org.apache.beam.runners.core.StateInternals;
+import org.apache.beam.runners.core.StateNamespace;
+import org.apache.beam.runners.core.StateNamespaces;
+import org.apache.beam.runners.core.TimerInternals;
+import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.reflect.DoFnInvoker;
+import org.apache.beam.sdk.transforms.reflect.DoFnInvokers;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.beam.sdk.values.TupleTag;
+import org.apache.flink.api.common.functions.RichGroupReduceFunction;
+import org.apache.flink.api.common.functions.RuntimeContext;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.util.Collector;
+import org.joda.time.Instant;
+
+/**
+ * A {@link RichGroupReduceFunction} for stateful {@link ParDo} in Flink Batch Runner.
+ */
+public class FlinkStatefulDoFnFunction<K, V, OutputT>
+ extends RichGroupReduceFunction<WindowedValue<KV<K, V>>, WindowedValue<OutputT>> {
+
+ private final DoFn<KV<K, V>, OutputT> dofn;
+ private final WindowingStrategy<?, ?> windowingStrategy;
+ private final Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs;
+ private final SerializedPipelineOptions serializedOptions;
+ private final Map<TupleTag<?>, Integer> outputMap;
+ private final TupleTag<OutputT> mainOutputTag;
+ private transient DoFnInvoker doFnInvoker;
+
+ public FlinkStatefulDoFnFunction(
+ DoFn<KV<K, V>, OutputT> dofn,
+ WindowingStrategy<?, ?> windowingStrategy,
+ Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs,
+ PipelineOptions pipelineOptions,
+ Map<TupleTag<?>, Integer> outputMap,
+ TupleTag<OutputT> mainOutputTag) {
+
+ this.dofn = dofn;
+ this.windowingStrategy = windowingStrategy;
+ this.sideInputs = sideInputs;
+ this.serializedOptions = new SerializedPipelineOptions(pipelineOptions);
+ this.outputMap = outputMap;
+ this.mainOutputTag = mainOutputTag;
+ }
+
+ @Override
+ public void reduce(
+ Iterable<WindowedValue<KV<K, V>>> values,
+ Collector<WindowedValue<OutputT>> out) throws Exception {
+ RuntimeContext runtimeContext = getRuntimeContext();
+
+ DoFnRunners.OutputManager outputManager;
+ if (outputMap == null) {
+ outputManager = new FlinkDoFnFunction.DoFnOutputManager(out);
+ } else {
+ // it has some additional outputs
+ outputManager =
+ new FlinkDoFnFunction.MultiDoFnOutputManager((Collector) out, outputMap);
+ }
+
+ final Iterator<WindowedValue<KV<K, V>>> iterator = values.iterator();
+
+ // get the first value, we need this for initializing the state internals with the key.
+ // we are guaranteed to have a first value, otherwise reduce() would not have been called.
+ WindowedValue<KV<K, V>> currentValue = iterator.next();
+ final K key = currentValue.getValue().getKey();
+
+ final InMemoryStateInternals<K> stateInternals = InMemoryStateInternals.forKey(key);
+
+ // In batch mode we know that all the data is available for this key. We can't use the
+ // timer manager from the context because it doesn't exist, so we create one and advance
+ // time to the end after processing all elements.
+ final InMemoryTimerInternals timerInternals = new InMemoryTimerInternals();
+ timerInternals.advanceProcessingTime(Instant.now());
+ timerInternals.advanceSynchronizedProcessingTime(Instant.now());
+
+ DoFnRunner<KV<K, V>, OutputT> doFnRunner = DoFnRunners.simpleRunner(
+ serializedOptions.getPipelineOptions(), dofn,
+ new FlinkSideInputReader(sideInputs, runtimeContext),
+ outputManager,
+ mainOutputTag,
+ // see SimpleDoFnRunner, just use it to limit number of additional outputs
+ Collections.<TupleTag<?>>emptyList(),
+ new FlinkNoOpStepContext() {
+ @Override
+ public StateInternals<?> stateInternals() {
+ return stateInternals;
+ }
+ @Override
+ public TimerInternals timerInternals() {
+ return timerInternals;
+ }
+ },
+ new FlinkAggregatorFactory(runtimeContext),
+ windowingStrategy);
+
+ doFnRunner.startBundle();
+
+ doFnRunner.processElement(currentValue);
+ while (iterator.hasNext()) {
+ currentValue = iterator.next();
+ doFnRunner.processElement(currentValue);
+ }
+
+ // Finish any pending windows by advancing the input watermark to infinity.
+ timerInternals.advanceInputWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE);
+
+ // Finally, advance the processing time to infinity to fire any timers.
+ timerInternals.advanceProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
+ timerInternals.advanceSynchronizedProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
+
+ fireEligibleTimers(timerInternals, doFnRunner);
+
+ doFnRunner.finishBundle();
+ }
+
+ private void fireEligibleTimers(
+ InMemoryTimerInternals timerInternals, DoFnRunner<KV<K, V>, OutputT> runner)
+ throws Exception {
+
+ while (true) {
+
+ TimerInternals.TimerData timer;
+ boolean hasFired = false;
+
+ while ((timer = timerInternals.removeNextEventTimer()) != null) {
+ hasFired = true;
+ fireTimer(timer, runner);
+ }
+ while ((timer = timerInternals.removeNextProcessingTimer()) != null) {
+ hasFired = true;
+ fireTimer(timer, runner);
+ }
+ while ((timer = timerInternals.removeNextSynchronizedProcessingTimer()) != null) {
+ hasFired = true;
+ fireTimer(timer, runner);
+ }
+ if (!hasFired) {
+ break;
+ }
+ }
+ }
+
+ private void fireTimer(
+ TimerInternals.TimerData timer, DoFnRunner<KV<K, V>, OutputT> doFnRunner) {
+ StateNamespace namespace = timer.getNamespace();
+ checkArgument(namespace instanceof StateNamespaces.WindowNamespace);
+ BoundedWindow window = ((StateNamespaces.WindowNamespace) namespace).getWindow();
+ doFnRunner.onTimer(timer.getTimerId(), window, timer.getTimestamp(), timer.getDomain());
+ }
+
+ @Override
+ public void open(Configuration parameters) throws Exception {
+ doFnInvoker = DoFnInvokers.invokerFor(dofn);
+ doFnInvoker.invokeSetup();
+ }
+
+ @Override
+ public void close() throws Exception {
+ doFnInvoker.invokeTeardown();
+ }
+
+}
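The batch pattern in reduce() is: feed every element for the key to the runner, push the input watermark and processing time to the end of time, and then drain whatever timers become eligible, looping until a full pass fires nothing (since firing a timer may set new timers). A stripped-down sketch of that drain loop with a toy timer queue (illustrative names only, not the Beam timer API):

import java.util.ArrayDeque;
import java.util.Deque;

public class DrainTimersSketch {

  // Toy timer queue: timers become eligible once "time" has advanced past them.
  static class TimerQueue {
    private final Deque<Long> timers = new ArrayDeque<>();
    private long now = 0;

    void set(long timestamp) { timers.add(timestamp); }
    void advanceTo(long t) { now = t; }

    Long removeNextEligible() {
      Long head = timers.peek();
      return (head != null && head <= now) ? timers.poll() : null;
    }
  }

  public static void main(String[] args) {
    TimerQueue queue = new TimerQueue();
    queue.set(10);
    queue.set(25);

    // After processing all elements for the key, advance time to "the end".
    queue.advanceTo(Long.MAX_VALUE);

    // Keep draining until a full pass fires nothing; firing a timer may set new ones.
    while (true) {
      boolean hasFired = false;
      Long timer;
      while ((timer = queue.removeNextEligible()) != null) {
        hasFired = true;
        System.out.println("firing timer at " + timer);  // stand-in for doFnRunner.onTimer(...)
      }
      if (!hasFired) {
        break;
      }
    }
  }
}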
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/SideInputInitializer.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/SideInputInitializer.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/SideInputInitializer.java
new file mode 100644
index 0000000..12222b4
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/SideInputInitializer.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.functions;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.flink.api.common.functions.BroadcastVariableInitializer;
+
+/**
+ * {@link BroadcastVariableInitializer} that initializes the broadcast input as a {@code Map}
+ * from window to side input.
+ */
+public class SideInputInitializer<ElemT, ViewT, W extends BoundedWindow>
+ implements BroadcastVariableInitializer<WindowedValue<ElemT>, Map<BoundedWindow, ViewT>> {
+
+ PCollectionView<ViewT> view;
+
+ public SideInputInitializer(PCollectionView<ViewT> view) {
+ this.view = view;
+ }
+
+ @Override
+ public Map<BoundedWindow, ViewT> initializeBroadcastVariable(
+ Iterable<WindowedValue<ElemT>> inputValues) {
+
+ // first partition into windows
+ Map<BoundedWindow, List<WindowedValue<ElemT>>> partitionedElements = new HashMap<>();
+ for (WindowedValue<ElemT> value: inputValues) {
+ for (BoundedWindow window: value.getWindows()) {
+ List<WindowedValue<ElemT>> windowedValues = partitionedElements.get(window);
+ if (windowedValues == null) {
+ windowedValues = new ArrayList<>();
+ partitionedElements.put(window, windowedValues);
+ }
+ windowedValues.add(value);
+ }
+ }
+
+ Map<BoundedWindow, ViewT> resultMap = new HashMap<>();
+
+ for (Map.Entry<BoundedWindow, List<WindowedValue<ElemT>>> elements:
+ partitionedElements.entrySet()) {
+
+ @SuppressWarnings("unchecked")
+ Iterable<WindowedValue<?>> elementsIterable =
+ (List<WindowedValue<?>>) (List<?>) elements.getValue();
+
+ resultMap.put(elements.getKey(), view.getViewFn().apply(elementsIterable));
+ }
+
+ return resultMap;
+ }
+}
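Conceptually the initializer groups the broadcast elements by window and then materializes each group with the view's ViewFn. A toy version of that grouping, using a sum as the stand-in view function (names here are made up for the example):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class GroupByWindowSketch {

  public static void main(String[] args) {
    // (windowId, value) pairs as they arrive from the broadcast input.
    String[][] elements = { {"w1", "3"}, {"w2", "5"}, {"w1", "4"} };

    // First partition into windows.
    Map<String, List<Long>> partitioned = new HashMap<>();
    for (String[] e : elements) {
      partitioned.computeIfAbsent(e[0], k -> new ArrayList<>()).add(Long.parseLong(e[1]));
    }

    // Then apply the "view function" (here: a sum) once per window.
    Map<String, Long> materialized = new HashMap<>();
    for (Map.Entry<String, List<Long>> entry : partitioned.entrySet()) {
      materialized.put(entry.getKey(), entry.getValue().stream().mapToLong(Long::longValue).sum());
    }
    System.out.println(materialized);  // e.g. {w1=7, w2=5}
  }
}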
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/package-info.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/package-info.java
new file mode 100644
index 0000000..9f11212
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Internal implementation of the Beam runner for Apache Flink.
+ */
+package org.apache.beam.runners.flink.translation.functions;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/package-info.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/package-info.java
new file mode 100644
index 0000000..af4b354
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Internal implementation of the Beam runner for Apache Flink.
+ */
+package org.apache.beam.runners.flink.translation;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/CoderTypeInformation.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/CoderTypeInformation.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/CoderTypeInformation.java
new file mode 100644
index 0000000..9b449aa
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/CoderTypeInformation.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.types;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.flink.api.common.ExecutionConfig;
+import org.apache.flink.api.common.typeinfo.AtomicType;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.common.typeutils.TypeComparator;
+import org.apache.flink.api.common.typeutils.TypeSerializer;
+
+/**
+ * Flink {@link org.apache.flink.api.common.typeinfo.TypeInformation} for
+ * Beam {@link org.apache.beam.sdk.coders.Coder}s.
+ */
+public class CoderTypeInformation<T> extends TypeInformation<T> implements AtomicType<T> {
+
+ private final Coder<T> coder;
+
+ public CoderTypeInformation(Coder<T> coder) {
+ checkNotNull(coder);
+ this.coder = coder;
+ }
+
+ public Coder<T> getCoder() {
+ return coder;
+ }
+
+ @Override
+ public boolean isBasicType() {
+ return false;
+ }
+
+ @Override
+ public boolean isTupleType() {
+ return false;
+ }
+
+ @Override
+ public int getArity() {
+ return 1;
+ }
+
+ @Override
+ @SuppressWarnings("unchecked")
+ public Class<T> getTypeClass() {
+ // We don't have access to the actual type's Class, so we fall back to Object.class here.
+ return (Class<T>) Object.class;
+ }
+
+ @Override
+ public boolean isKeyType() {
+ return true;
+ }
+
+ @Override
+ @SuppressWarnings("unchecked")
+ public TypeSerializer<T> createSerializer(ExecutionConfig config) {
+ return new CoderTypeSerializer<>(coder);
+ }
+
+ @Override
+ public int getTotalFields() {
+ return 2;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ CoderTypeInformation that = (CoderTypeInformation) o;
+
+ return coder.equals(that.coder);
+
+ }
+
+ @Override
+ public int hashCode() {
+ return coder.hashCode();
+ }
+
+ @Override
+ public boolean canEqual(Object obj) {
+ return obj instanceof CoderTypeInformation;
+ }
+
+ @Override
+ public String toString() {
+ return "CoderTypeInformation{coder=" + coder + '}';
+ }
+
+ @Override
+ public TypeComparator<T> createComparator(boolean sortOrderAscending, ExecutionConfig
+ executionConfig) {
+ throw new UnsupportedOperationException(
+ "Non-encoded values cannot be compared directly.");
+ }
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/CoderTypeSerializer.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/CoderTypeSerializer.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/CoderTypeSerializer.java
new file mode 100644
index 0000000..e210ed9
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/CoderTypeSerializer.java
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.types;
+
+import java.io.EOFException;
+import java.io.IOException;
+import org.apache.beam.runners.flink.translation.wrappers.DataInputViewWrapper;
+import org.apache.beam.runners.flink.translation.wrappers.DataOutputViewWrapper;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.CoderException;
+import org.apache.beam.sdk.util.CoderUtils;
+import org.apache.flink.api.common.typeutils.TypeSerializer;
+import org.apache.flink.core.memory.DataInputView;
+import org.apache.flink.core.memory.DataOutputView;
+
+/**
+ * Flink {@link org.apache.flink.api.common.typeutils.TypeSerializer} for
+ * Beam {@link org.apache.beam.sdk.coders.Coder Coders}.
+ */
+public class CoderTypeSerializer<T> extends TypeSerializer<T> {
+
+ private Coder<T> coder;
+
+ public CoderTypeSerializer(Coder<T> coder) {
+ this.coder = coder;
+ }
+
+ @Override
+ public boolean isImmutableType() {
+ return false;
+ }
+
+ @Override
+ public CoderTypeSerializer<T> duplicate() {
+ return new CoderTypeSerializer<>(coder);
+ }
+
+ @Override
+ public T createInstance() {
+ return null;
+ }
+
+ @Override
+ public T copy(T t) {
+ try {
+ return CoderUtils.clone(coder, t);
+ } catch (CoderException e) {
+ throw new RuntimeException("Could not clone.", e);
+ }
+ }
+
+ @Override
+ public T copy(T t, T reuse) {
+ return copy(t);
+ }
+
+ @Override
+ public int getLength() {
+ return -1;
+ }
+
+ @Override
+ public void serialize(T t, DataOutputView dataOutputView) throws IOException {
+ DataOutputViewWrapper outputWrapper = new DataOutputViewWrapper(dataOutputView);
+ coder.encode(t, outputWrapper, Coder.Context.NESTED);
+ }
+
+ @Override
+ public T deserialize(DataInputView dataInputView) throws IOException {
+ try {
+ DataInputViewWrapper inputWrapper = new DataInputViewWrapper(dataInputView);
+ return coder.decode(inputWrapper, Coder.Context.NESTED);
+ } catch (CoderException e) {
+ Throwable cause = e.getCause();
+ if (cause instanceof EOFException) {
+ throw (EOFException) cause;
+ } else {
+ throw e;
+ }
+ }
+ }
+
+ @Override
+ public T deserialize(T t, DataInputView dataInputView) throws IOException {
+ return deserialize(dataInputView);
+ }
+
+ @Override
+ public void copy(
+ DataInputView dataInputView,
+ DataOutputView dataOutputView) throws IOException {
+ serialize(deserialize(dataInputView), dataOutputView);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ CoderTypeSerializer that = (CoderTypeSerializer) o;
+ return coder.equals(that.coder);
+ }
+
+ @Override
+ public boolean canEqual(Object obj) {
+ return obj instanceof CoderTypeSerializer;
+ }
+
+ @Override
+ public int hashCode() {
+ return coder.hashCode();
+ }
+}
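What the serializer delegates to is a plain coder round trip: encode a value to bytes with the Beam Coder and decode the same bytes back. A short sketch of that round trip using the SDK's StringUtf8Coder and the CoderUtils helpers that this commit also imports (the value is arbitrary):

import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.util.CoderUtils;

public class CoderRoundTripSketch {
  public static void main(String[] args) throws Exception {
    StringUtf8Coder coder = StringUtf8Coder.of();

    // Encode to a byte[], much like serialize() writes the coder's output to a DataOutputView...
    byte[] bytes = CoderUtils.encodeToByteArray(coder, "hello beam");

    // ...and decode it back, as deserialize() reads it from a DataInputView.
    String decoded = CoderUtils.decodeFromByteArray(coder, bytes);
    System.out.println(decoded);  // hello beam
  }
}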
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueComparator.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueComparator.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueComparator.java
new file mode 100644
index 0000000..667ef45
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueComparator.java
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.types;
+
+import java.io.IOException;
+import java.util.Arrays;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.flink.api.common.typeutils.TypeComparator;
+import org.apache.flink.core.memory.DataInputView;
+import org.apache.flink.core.memory.DataOutputView;
+import org.apache.flink.core.memory.MemorySegment;
+
+/**
+ * Flink {@link org.apache.flink.api.common.typeutils.TypeComparator} for Beam values that have
+ * been encoded to byte data by a {@link Coder}.
+ */
+public class EncodedValueComparator extends TypeComparator<byte[]> {
+
+ /** For storing the Reference in encoded form. */
+ private transient byte[] encodedReferenceKey;
+
+ private final boolean ascending;
+
+ public EncodedValueComparator(boolean ascending) {
+ this.ascending = ascending;
+ }
+
+ @Override
+ public int hash(byte[] record) {
+ return Arrays.hashCode(record);
+ }
+
+ @Override
+ public void setReference(byte[] toCompare) {
+ this.encodedReferenceKey = toCompare;
+ }
+
+ @Override
+ public boolean equalToReference(byte[] candidate) {
+ if (encodedReferenceKey.length != candidate.length) {
+ return false;
+ }
+ int len = candidate.length;
+ for (int i = 0; i < len; i++) {
+ if (encodedReferenceKey[i] != candidate[i]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ @Override
+ public int compareToReference(TypeComparator<byte[]> other) {
+ // VERY IMPORTANT: compareToReference does not behave like Comparable.compare
+ // the meaning of the return value is inverted.
+
+ EncodedValueComparator otherEncodedValueComparator = (EncodedValueComparator) other;
+
+ int len = Math.min(
+ encodedReferenceKey.length,
+ otherEncodedValueComparator.encodedReferenceKey.length);
+
+ for (int i = 0; i < len; i++) {
+ byte b1 = encodedReferenceKey[i];
+ byte b2 = otherEncodedValueComparator.encodedReferenceKey[i];
+ int result = (b1 < b2 ? -1 : (b1 == b2 ? 0 : 1));
+ if (result != 0) {
+ return ascending ? -result : result;
+ }
+ }
+ int result =
+ encodedReferenceKey.length - otherEncodedValueComparator.encodedReferenceKey.length;
+ return ascending ? -result : result;
+ }
+
+
+ @Override
+ public int compare(byte[] first, byte[] second) {
+ int len = Math.min(first.length, second.length);
+ for (int i = 0; i < len; i++) {
+ byte b1 = first[i];
+ byte b2 = second[i];
+ int result = (b1 < b2 ? -1 : (b1 == b2 ? 0 : 1));
+ if (result != 0) {
+ return ascending ? result : -result;
+ }
+ }
+ int result = first.length - second.length;
+ return ascending ? result : -result;
+ }
+
+ @Override
+ public int compareSerialized(
+ DataInputView firstSource,
+ DataInputView secondSource) throws IOException {
+ int lengthFirst = firstSource.readInt();
+ int lengthSecond = secondSource.readInt();
+
+ int len = Math.min(lengthFirst, lengthSecond);
+ for (int i = 0; i < len; i++) {
+ byte b1 = firstSource.readByte();
+ byte b2 = secondSource.readByte();
+ int result = (b1 < b2 ? -1 : (b1 == b2 ? 0 : 1));
+ if (result != 0) {
+ return ascending ? result : -result;
+ }
+ }
+
+ int result = lengthFirst - lengthSecond;
+ return ascending ? result : -result;
+ }
+
+
+
+ @Override
+ public boolean supportsNormalizedKey() {
+ // disabled because this seems to not work with some coders,
+ // such as the AvroCoder
+ return false;
+ }
+
+ @Override
+ public boolean supportsSerializationWithKeyNormalization() {
+ return false;
+ }
+
+ @Override
+ public int getNormalizeKeyLen() {
+ return Integer.MAX_VALUE;
+ }
+
+ @Override
+ public boolean isNormalizedKeyPrefixOnly(int keyBytes) {
+ return true;
+ }
+
+ @Override
+ public void putNormalizedKey(byte[] record, MemorySegment target, int offset, int numBytes) {
+ final int limit = offset + numBytes;
+
+ target.put(offset, record, 0, Math.min(numBytes, record.length));
+
+ offset += record.length;
+
+ while (offset < limit) {
+ target.put(offset++, (byte) 0);
+ }
+ }
+
+ @Override
+ public void writeWithKeyNormalization(byte[] record, DataOutputView target) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public byte[] readWithKeyDenormalization(byte[] reuse, DataInputView source) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public boolean invertNormalizedKey() {
+ return !ascending;
+ }
+
+ @Override
+ public TypeComparator<byte[]> duplicate() {
+ return new EncodedValueComparator(ascending);
+ }
+
+ @Override
+ public int extractKeys(Object record, Object[] target, int index) {
+ target[index] = record;
+ return 1;
+ }
+
+ @Override
+ public TypeComparator[] getFlatComparators() {
+ return new TypeComparator[] { this.duplicate() };
+ }
+}
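The comparison itself is a byte-wise (signed) lexicographic order with array length as the tiebreaker, optionally inverted for descending sorts. A tiny standalone version of compare() that makes the semantics concrete:

public class LexicographicCompareSketch {

  // Same shape as the comparator's compare(): byte-wise, then by length, sign-flipped if descending.
  static int compare(byte[] first, byte[] second, boolean ascending) {
    int len = Math.min(first.length, second.length);
    for (int i = 0; i < len; i++) {
      int result = Byte.compare(first[i], second[i]);
      if (result != 0) {
        return ascending ? result : -result;
      }
    }
    int result = first.length - second.length;
    return ascending ? result : -result;
  }

  public static void main(String[] args) {
    byte[] a = {1, 2};
    byte[] b = {1, 2, 3};
    System.out.println(compare(a, b, true));   // negative: a sorts before b (shorter prefix wins)
    System.out.println(compare(a, b, false));  // positive: the order is inverted when descending
  }
}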
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueSerializer.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueSerializer.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueSerializer.java
new file mode 100644
index 0000000..41db61e
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/types/EncodedValueSerializer.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.types;
+
+import java.io.IOException;
+
+import org.apache.beam.sdk.coders.Coder;
+
+import org.apache.flink.api.common.typeutils.TypeSerializer;
+import org.apache.flink.core.memory.DataInputView;
+import org.apache.flink.core.memory.DataOutputView;
+
+/**
+ * {@link TypeSerializer} for values that were encoded using a {@link Coder}.
+ */
+public final class EncodedValueSerializer extends TypeSerializer<byte[]> {
+
+ private static final long serialVersionUID = 1L;
+
+ private static final byte[] EMPTY = new byte[0];
+
+ @Override
+ public boolean isImmutableType() {
+ return true;
+ }
+
+ @Override
+ public byte[] createInstance() {
+ return EMPTY;
+ }
+
+ @Override
+ public byte[] copy(byte[] from) {
+ return from;
+ }
+
+ @Override
+ public byte[] copy(byte[] from, byte[] reuse) {
+ return copy(from);
+ }
+
+ @Override
+ public int getLength() {
+ return -1;
+ }
+
+
+ @Override
+ public void serialize(byte[] record, DataOutputView target) throws IOException {
+ if (record == null) {
+ throw new IllegalArgumentException("The record must not be null.");
+ }
+
+ final int len = record.length;
+ target.writeInt(len);
+ target.write(record);
+ }
+
+ @Override
+ public byte[] deserialize(DataInputView source) throws IOException {
+ final int len = source.readInt();
+ byte[] result = new byte[len];
+ source.readFully(result);
+ return result;
+ }
+
+ @Override
+ public byte[] deserialize(byte[] reuse, DataInputView source) throws IOException {
+ return deserialize(source);
+ }
+
+ @Override
+ public void copy(DataInputView source, DataOutputView target) throws IOException {
+ final int len = source.readInt();
+ target.writeInt(len);
+ target.write(source, len);
+ }
+
+ @Override
+ public boolean canEqual(Object obj) {
+ return obj instanceof EncodedValueSerializer;
+ }
+
+ @Override
+ public int hashCode() {
+ return this.getClass().hashCode();
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ return obj instanceof EncodedValueSerializer;
+ }
+
+ @Override
+ public TypeSerializer<byte[]> duplicate() {
+ return this;
+ }
+}
[45/50] [abbrv] beam git commit: This closes #2587
Posted by dh...@apache.org.
This closes #2587
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/29e054a8
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/29e054a8
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/29e054a8
Branch: refs/heads/DSL_SQL
Commit: 29e054a8d7ffe6a061dbbe9a1885185b02f4e8ec
Parents: 714fdd2 418c304
Author: Thomas Groh <tg...@google.com>
Authored: Wed Apr 19 10:53:30 2017 -0700
Committer: Thomas Groh <tg...@google.com>
Committed: Wed Apr 19 10:53:30 2017 -0700
----------------------------------------------------------------------
.../core/construction/UnconsumedReads.java | 72 +++++++++++++
.../core/construction/UnconsumedReadsTest.java | 105 +++++++++++++++++++
.../beam/runners/dataflow/DataflowRunner.java | 4 +
.../runners/dataflow/DataflowRunnerTest.java | 24 +++++
4 files changed, 205 insertions(+)
----------------------------------------------------------------------
[08/50] [abbrv] beam git commit: Creates ProcessFnRunner and wires it
through ParDoEvaluator
Posted by dh...@apache.org.
Creates ProcessFnRunner and wires it through ParDoEvaluator
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/b93de58f
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/b93de58f
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/b93de58f
Branch: refs/heads/DSL_SQL
Commit: b93de58f5a3a10877997815a793725cb0e53cc2d
Parents: 7e1a267
Author: Eugene Kirpichov <ki...@google.com>
Authored: Mon Apr 17 14:52:23 2017 -0700
Committer: Eugene Kirpichov <ki...@google.com>
Committed: Tue Apr 18 18:02:07 2017 -0700
----------------------------------------------------------------------
.../apache/beam/runners/core/DoFnRunners.java | 32 +++++
.../beam/runners/core/ProcessFnRunner.java | 127 +++++++++++++++++++
.../beam/runners/direct/ParDoEvaluator.java | 114 +++++++++++++----
.../runners/direct/ParDoEvaluatorFactory.java | 11 +-
...littableProcessElementsEvaluatorFactory.java | 106 ++++++++++++----
.../direct/StatefulParDoEvaluatorFactory.java | 4 +-
.../direct/TransformEvaluatorRegistry.java | 4 +-
.../beam/runners/direct/ParDoEvaluatorTest.java | 3 +-
8 files changed, 341 insertions(+), 60 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/b93de58f/runners/core-java/src/main/java/org/apache/beam/runners/core/DoFnRunners.java
----------------------------------------------------------------------
diff --git a/runners/core-java/src/main/java/org/apache/beam/runners/core/DoFnRunners.java b/runners/core-java/src/main/java/org/apache/beam/runners/core/DoFnRunners.java
index b09ee08..8501e72 100644
--- a/runners/core-java/src/main/java/org/apache/beam/runners/core/DoFnRunners.java
+++ b/runners/core-java/src/main/java/org/apache/beam/runners/core/DoFnRunners.java
@@ -17,8 +17,10 @@
*/
package org.apache.beam.runners.core;
+import java.util.Collection;
import java.util.List;
import org.apache.beam.runners.core.ExecutionContext.StepContext;
+import org.apache.beam.runners.core.SplittableParDo.ProcessFn;
import org.apache.beam.runners.core.StatefulDoFnRunner.CleanupTimer;
import org.apache.beam.runners.core.StatefulDoFnRunner.StateCleaner;
import org.apache.beam.sdk.options.PipelineOptions;
@@ -26,10 +28,12 @@ import org.apache.beam.sdk.transforms.Aggregator;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.util.ReadyCheckingSideInputReader;
import org.apache.beam.sdk.util.SideInputReader;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.util.WindowingStrategy;
import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.TupleTag;
/**
@@ -146,4 +150,32 @@ public class DoFnRunners {
stateCleaner,
droppedDueToLateness);
}
+
+ public static <InputT, OutputT, RestrictionT>
+ ProcessFnRunner<InputT, OutputT, RestrictionT>
+ newProcessFnRunner(
+ ProcessFn<InputT, OutputT, RestrictionT, ?> fn,
+ PipelineOptions options,
+ Collection<PCollectionView<?>> views,
+ ReadyCheckingSideInputReader sideInputReader,
+ OutputManager outputManager,
+ TupleTag<OutputT> mainOutputTag,
+ List<TupleTag<?>> additionalOutputTags,
+ StepContext stepContext,
+ AggregatorFactory aggregatorFactory,
+ WindowingStrategy<?, ?> windowingStrategy) {
+ return new ProcessFnRunner<>(
+ simpleRunner(
+ options,
+ fn,
+ sideInputReader,
+ outputManager,
+ mainOutputTag,
+ additionalOutputTags,
+ stepContext,
+ aggregatorFactory,
+ windowingStrategy),
+ views,
+ sideInputReader);
+ }
}
http://git-wip-us.apache.org/repos/asf/beam/blob/b93de58f/runners/core-java/src/main/java/org/apache/beam/runners/core/ProcessFnRunner.java
----------------------------------------------------------------------
diff --git a/runners/core-java/src/main/java/org/apache/beam/runners/core/ProcessFnRunner.java b/runners/core-java/src/main/java/org/apache/beam/runners/core/ProcessFnRunner.java
new file mode 100644
index 0000000..3ae3f50
--- /dev/null
+++ b/runners/core-java/src/main/java/org/apache/beam/runners/core/ProcessFnRunner.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.core;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static org.apache.beam.runners.core.SplittableParDo.ProcessFn;
+
+import com.google.common.collect.Iterables;
+import java.util.Collection;
+import java.util.Collections;
+import org.apache.beam.runners.core.StateNamespaces.WindowNamespace;
+import org.apache.beam.runners.core.TimerInternals.TimerData;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
+import org.apache.beam.sdk.util.ReadyCheckingSideInputReader;
+import org.apache.beam.sdk.util.TimeDomain;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.joda.time.Instant;
+
+/** Runs a {@link ProcessFn} by constructing the appropriate contexts and passing them in. */
+public class ProcessFnRunner<InputT, OutputT, RestrictionT>
+ implements PushbackSideInputDoFnRunner<
+ KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>, OutputT> {
+ private final DoFnRunner<
+ KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>, OutputT>
+ underlying;
+ private final Collection<PCollectionView<?>> views;
+ private final ReadyCheckingSideInputReader sideInputReader;
+
+ ProcessFnRunner(
+ DoFnRunner<KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>, OutputT>
+ underlying,
+ Collection<PCollectionView<?>> views,
+ ReadyCheckingSideInputReader sideInputReader) {
+ this.underlying = underlying;
+ this.views = views;
+ this.sideInputReader = sideInputReader;
+ }
+
+ @Override
+ public void startBundle() {
+ underlying.startBundle();
+ }
+
+ @Override
+ public Iterable<WindowedValue<KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>>>
+ processElementInReadyWindows(
+ WindowedValue<KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>>
+ windowedKWI) {
+ checkTrivialOuterWindows(windowedKWI);
+ BoundedWindow window = getUnderlyingWindow(windowedKWI.getValue());
+ if (!isReady(window)) {
+ return Collections.singletonList(windowedKWI);
+ }
+ underlying.processElement(windowedKWI);
+ return Collections.emptyList();
+ }
+
+ @Override
+ public void finishBundle() {
+ underlying.finishBundle();
+ }
+
+ @Override
+ public void onTimer(
+ String timerId, BoundedWindow window, Instant timestamp, TimeDomain timeDomain) {
+ throw new UnsupportedOperationException("User timers unsupported in ProcessFn");
+ }
+
+ private static <T> void checkTrivialOuterWindows(
+ WindowedValue<KeyedWorkItem<String, T>> windowedKWI) {
+ // In practice it will be in 0 or 1 windows (ValueInEmptyWindows or ValueInGlobalWindow)
+ Collection<? extends BoundedWindow> outerWindows = windowedKWI.getWindows();
+ if (!outerWindows.isEmpty()) {
+ checkArgument(
+ outerWindows.size() == 1,
+ "The KeyedWorkItem itself must not be in multiple windows, but was in: %s",
+ outerWindows);
+ BoundedWindow onlyWindow = Iterables.getOnlyElement(outerWindows);
+ checkArgument(
+ onlyWindow instanceof GlobalWindow,
+ "KeyedWorkItem must be in the Global window, but was in: %s",
+ onlyWindow);
+ }
+ }
+
+ private static <T> BoundedWindow getUnderlyingWindow(KeyedWorkItem<String, T> kwi) {
+ if (Iterables.isEmpty(kwi.elementsIterable())) {
+ // ProcessFn sets only a single timer.
+ TimerData timer = Iterables.getOnlyElement(kwi.timersIterable());
+ return ((WindowNamespace) timer.getNamespace()).getWindow();
+ } else {
+ // KWI must have a single element in elementsIterable, because it follows a GBK by a
+ // uniquely generated key.
+ // Additionally, windows must be exploded before GBKIntoKeyedWorkItems, so there's also
+ // only a single window.
+ WindowedValue<T> value = Iterables.getOnlyElement(kwi.elementsIterable());
+ return Iterables.getOnlyElement(value.getWindows());
+ }
+ }
+
+ private boolean isReady(BoundedWindow mainInputWindow) {
+ for (PCollectionView<?> view : views) {
+ BoundedWindow sideInputWindow = view.getWindowMappingFn().getSideInputWindow(mainInputWindow);
+ if (!sideInputReader.isReady(view, sideInputWindow)) {
+ return false;
+ }
+ }
+ return true;
+ }
+}
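The pushback contract implemented here is: if every side-input window the element needs is ready, hand the element to the underlying runner and return nothing; otherwise return the element itself so the caller can retry it later. A minimal sketch of that contract with hypothetical names (not the actual runner API):

import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.function.Consumer;

public class PushbackSketch {

  // Process the element if its window's side inputs are ready; otherwise push it back.
  static <T> List<T> processOrPushBack(
      T element, String window, Set<String> readyWindows, Consumer<T> process) {
    if (!readyWindows.contains(window)) {
      return Collections.singletonList(element);  // pushed back, to be retried later
    }
    process.accept(element);
    return Collections.emptyList();               // fully processed, nothing to retry
  }

  public static void main(String[] args) {
    Set<String> ready = Collections.singleton("w1");
    System.out.println(
        processOrPushBack("a", "w1", ready, e -> System.out.println("processed " + e)));  // []
    System.out.println(processOrPushBack("b", "w2", ready, e -> {}));                     // [b]
  }
}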
http://git-wip-us.apache.org/repos/asf/beam/blob/b93de58f/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluator.java
----------------------------------------------------------------------
diff --git a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluator.java b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluator.java
index bab7b2c..cab11db 100644
--- a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluator.java
+++ b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluator.java
@@ -30,6 +30,7 @@ import org.apache.beam.runners.core.SimplePushbackSideInputDoFnRunner;
import org.apache.beam.runners.core.TimerInternals.TimerData;
import org.apache.beam.runners.direct.DirectExecutionContext.DirectStepContext;
import org.apache.beam.runners.direct.DirectRunner.UncommittedBundle;
+import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.transforms.AppliedPTransform;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
@@ -43,6 +44,50 @@ import org.apache.beam.sdk.values.TupleTag;
class ParDoEvaluator<InputT> implements TransformEvaluator<InputT> {
+ public interface DoFnRunnerFactory<InputT, OutputT> {
+ PushbackSideInputDoFnRunner<InputT, OutputT> createRunner(
+ PipelineOptions options,
+ DoFn<InputT, OutputT> fn,
+ List<PCollectionView<?>> sideInputs,
+ ReadyCheckingSideInputReader sideInputReader,
+ OutputManager outputManager,
+ TupleTag<OutputT> mainOutputTag,
+ List<TupleTag<?>> additionalOutputTags,
+ DirectStepContext stepContext,
+ AggregatorContainer.Mutator aggregatorChanges,
+ WindowingStrategy<?, ? extends BoundedWindow> windowingStrategy);
+ }
+
+ public static <InputT, OutputT> DoFnRunnerFactory<InputT, OutputT> defaultRunnerFactory() {
+ return new DoFnRunnerFactory<InputT, OutputT>() {
+ @Override
+ public PushbackSideInputDoFnRunner<InputT, OutputT> createRunner(
+ PipelineOptions options,
+ DoFn<InputT, OutputT> fn,
+ List<PCollectionView<?>> sideInputs,
+ ReadyCheckingSideInputReader sideInputReader,
+ OutputManager outputManager,
+ TupleTag<OutputT> mainOutputTag,
+ List<TupleTag<?>> additionalOutputTags,
+ DirectStepContext stepContext,
+ AggregatorContainer.Mutator aggregatorChanges,
+ WindowingStrategy<?, ? extends BoundedWindow> windowingStrategy) {
+ DoFnRunner<InputT, OutputT> underlying =
+ DoFnRunners.simpleRunner(
+ options,
+ fn,
+ sideInputReader,
+ outputManager,
+ mainOutputTag,
+ additionalOutputTags,
+ stepContext,
+ aggregatorChanges,
+ windowingStrategy);
+ return SimplePushbackSideInputDoFnRunner.create(underlying, sideInputs, sideInputReader);
+ }
+ };
+ }
+
public static <InputT, OutputT> ParDoEvaluator<InputT> create(
EvaluationContext evaluationContext,
DirectStepContext stepContext,
@@ -53,9 +98,43 @@ class ParDoEvaluator<InputT> implements TransformEvaluator<InputT> {
List<PCollectionView<?>> sideInputs,
TupleTag<OutputT> mainOutputTag,
List<TupleTag<?>> additionalOutputTags,
- Map<TupleTag<?>, PCollection<?>> outputs) {
+ Map<TupleTag<?>, PCollection<?>> outputs,
+ DoFnRunnerFactory<InputT, OutputT> runnerFactory) {
AggregatorContainer.Mutator aggregatorChanges = evaluationContext.getAggregatorMutator();
+ BundleOutputManager outputManager = createOutputManager(evaluationContext, key, outputs);
+
+ ReadyCheckingSideInputReader sideInputReader =
+ evaluationContext.createSideInputReader(sideInputs);
+
+ PushbackSideInputDoFnRunner<InputT, OutputT> runner = runnerFactory.createRunner(
+ evaluationContext.getPipelineOptions(),
+ fn,
+ sideInputs,
+ sideInputReader,
+ outputManager,
+ mainOutputTag,
+ additionalOutputTags,
+ stepContext,
+ aggregatorChanges,
+ windowingStrategy);
+
+ return create(runner, stepContext, application, aggregatorChanges, outputManager);
+ }
+
+ public static <InputT, OutputT> ParDoEvaluator<InputT> create(
+ PushbackSideInputDoFnRunner<InputT, OutputT> runner,
+ DirectStepContext stepContext,
+ AppliedPTransform<?, ?, ?> application,
+ AggregatorContainer.Mutator aggregatorChanges,
+ BundleOutputManager outputManager) {
+ return new ParDoEvaluator<>(runner, application, aggregatorChanges, outputManager, stepContext);
+ }
+
+ static BundleOutputManager createOutputManager(
+ EvaluationContext evaluationContext,
+ StructuralKey<?> key,
+ Map<TupleTag<?>, PCollection<?>> outputs) {
Map<TupleTag<?>, UncommittedBundle<?>> outputBundles = new HashMap<>();
for (Map.Entry<TupleTag<?>, PCollection<?>> outputEntry : outputs.entrySet()) {
// Just trust the context's decision as to whether the output should be keyed.
@@ -69,32 +148,7 @@ class ParDoEvaluator<InputT> implements TransformEvaluator<InputT> {
outputEntry.getKey(), evaluationContext.createBundle(outputEntry.getValue()));
}
}
- BundleOutputManager outputManager = BundleOutputManager.create(outputBundles);
-
- ReadyCheckingSideInputReader sideInputReader =
- evaluationContext.createSideInputReader(sideInputs);
-
- DoFnRunner<InputT, OutputT> underlying =
- DoFnRunners.simpleRunner(
- evaluationContext.getPipelineOptions(),
- fn,
- sideInputReader,
- outputManager,
- mainOutputTag,
- additionalOutputTags,
- stepContext,
- aggregatorChanges,
- windowingStrategy);
- PushbackSideInputDoFnRunner<InputT, OutputT> runner =
- SimplePushbackSideInputDoFnRunner.create(underlying, sideInputs, sideInputReader);
-
- try {
- runner.startBundle();
- } catch (Exception e) {
- throw UserCodeException.wrap(e);
- }
-
- return new ParDoEvaluator<>(runner, application, aggregatorChanges, outputManager, stepContext);
+ return BundleOutputManager.create(outputBundles);
}
////////////////////////////////////////////////////////////////////////////////////////////////
@@ -119,6 +173,12 @@ class ParDoEvaluator<InputT> implements TransformEvaluator<InputT> {
this.stepContext = stepContext;
this.aggregatorChanges = aggregatorChanges;
this.unprocessedElements = ImmutableList.builder();
+
+ try {
+ fnRunner.startBundle();
+ } catch (Exception e) {
+ throw UserCodeException.wrap(e);
+ }
}
public BundleOutputManager getOutputManager() {
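
The DoFnRunnerFactory hook introduced above makes runner construction in ParDoEvaluator pluggable: callers pass a factory instead of relying on the SimplePushbackSideInputDoFnRunner wiring that used to be hard-coded inside create(). As a rough sketch of what a caller-supplied factory can look like (the class name DelegatingDoFnRunnerFactory is illustrative and not part of this commit; it assumes the org.apache.beam.runners.direct package so the package-private types resolve), a factory may simply delegate to defaultRunnerFactory():

package org.apache.beam.runners.direct;

import java.util.List;
import org.apache.beam.runners.core.DoFnRunners.OutputManager;
import org.apache.beam.runners.core.PushbackSideInputDoFnRunner;
import org.apache.beam.runners.direct.DirectExecutionContext.DirectStepContext;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.util.ReadyCheckingSideInputReader;
import org.apache.beam.sdk.util.WindowingStrategy;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.TupleTag;

/** Illustrative sketch only: a factory that reuses the default runner wiring. */
class DelegatingDoFnRunnerFactory<InputT, OutputT>
    implements ParDoEvaluator.DoFnRunnerFactory<InputT, OutputT> {
  @Override
  public PushbackSideInputDoFnRunner<InputT, OutputT> createRunner(
      PipelineOptions options,
      DoFn<InputT, OutputT> fn,
      List<PCollectionView<?>> sideInputs,
      ReadyCheckingSideInputReader sideInputReader,
      OutputManager outputManager,
      TupleTag<OutputT> mainOutputTag,
      List<TupleTag<?>> additionalOutputTags,
      DirectStepContext stepContext,
      AggregatorContainer.Mutator aggregatorChanges,
      WindowingStrategy<?, ? extends BoundedWindow> windowingStrategy) {
    // A real implementation could build or wrap the runner differently (the
    // splittable-DoFn factory further below is one such example); here we just
    // fall back to the default SimplePushbackSideInputDoFnRunner wiring.
    return ParDoEvaluator.<InputT, OutputT>defaultRunnerFactory()
        .createRunner(
            options, fn, sideInputs, sideInputReader, outputManager, mainOutputTag,
            additionalOutputTags, stepContext, aggregatorChanges, windowingStrategy);
  }
}

Such a factory would then be handed to new ParDoEvaluatorFactory<>(ctxt, factory), mirroring how TransformEvaluatorRegistry passes ParDoEvaluator.defaultRunnerFactory() in the diff below.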
http://git-wip-us.apache.org/repos/asf/beam/blob/b93de58f/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluatorFactory.java
----------------------------------------------------------------------
diff --git a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluatorFactory.java b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluatorFactory.java
index 93f204a..b00c2b6 100644
--- a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluatorFactory.java
+++ b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ParDoEvaluatorFactory.java
@@ -43,9 +43,13 @@ final class ParDoEvaluatorFactory<InputT, OutputT> implements TransformEvaluator
private static final Logger LOG = LoggerFactory.getLogger(ParDoEvaluatorFactory.class);
private final LoadingCache<DoFn<?, ?>, DoFnLifecycleManager> fnClones;
private final EvaluationContext evaluationContext;
+ private final ParDoEvaluator.DoFnRunnerFactory<InputT, OutputT> runnerFactory;
- ParDoEvaluatorFactory(EvaluationContext evaluationContext) {
+ ParDoEvaluatorFactory(
+ EvaluationContext evaluationContext,
+ ParDoEvaluator.DoFnRunnerFactory<InputT, OutputT> runnerFactory) {
this.evaluationContext = evaluationContext;
+ this.runnerFactory = runnerFactory;
fnClones =
CacheBuilder.newBuilder()
.build(
@@ -148,7 +152,8 @@ final class ParDoEvaluatorFactory<InputT, OutputT> implements TransformEvaluator
sideInputs,
mainOutputTag,
additionalOutputTags,
- pcollections(application.getOutputs()));
+ pcollections(application.getOutputs()),
+ runnerFactory);
} catch (Exception e) {
try {
fnManager.remove();
@@ -162,7 +167,7 @@ final class ParDoEvaluatorFactory<InputT, OutputT> implements TransformEvaluator
}
}
- private Map<TupleTag<?>, PCollection<?>> pcollections(Map<TupleTag<?>, PValue> outputs) {
+ static Map<TupleTag<?>, PCollection<?>> pcollections(Map<TupleTag<?>, PValue> outputs) {
Map<TupleTag<?>, PCollection<?>> pcs = new HashMap<>();
for (Map.Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
pcs.put(output.getKey(), (PCollection<?>) output.getValue());
http://git-wip-us.apache.org/repos/asf/beam/blob/b93de58f/runners/direct-java/src/main/java/org/apache/beam/runners/direct/SplittableProcessElementsEvaluatorFactory.java
----------------------------------------------------------------------
diff --git a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/SplittableProcessElementsEvaluatorFactory.java b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/SplittableProcessElementsEvaluatorFactory.java
index 00b16dd..7efdb52 100644
--- a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/SplittableProcessElementsEvaluatorFactory.java
+++ b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/SplittableProcessElementsEvaluatorFactory.java
@@ -18,25 +18,34 @@
package org.apache.beam.runners.direct;
import java.util.Collection;
+import java.util.List;
import java.util.concurrent.Executors;
+import org.apache.beam.runners.core.DoFnRunners;
import org.apache.beam.runners.core.DoFnRunners.OutputManager;
import org.apache.beam.runners.core.ElementAndRestriction;
import org.apache.beam.runners.core.KeyedWorkItem;
import org.apache.beam.runners.core.OutputAndTimeBoundedSplittableProcessElementInvoker;
import org.apache.beam.runners.core.OutputWindowedValue;
+import org.apache.beam.runners.core.PushbackSideInputDoFnRunner;
import org.apache.beam.runners.core.SplittableParDo;
+import org.apache.beam.runners.core.SplittableParDo.ProcessFn;
import org.apache.beam.runners.core.StateInternals;
import org.apache.beam.runners.core.StateInternalsFactory;
import org.apache.beam.runners.core.TimerInternals;
import org.apache.beam.runners.core.TimerInternalsFactory;
import org.apache.beam.runners.direct.DirectRunner.CommittedBundle;
+import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.transforms.AppliedPTransform;
+import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.splittabledofn.RestrictionTracker;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.transforms.windowing.PaneInfo;
+import org.apache.beam.sdk.util.ReadyCheckingSideInputReader;
import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
+import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.TupleTag;
import org.joda.time.Duration;
import org.joda.time.Instant;
@@ -51,7 +60,11 @@ class SplittableProcessElementsEvaluatorFactory<
SplittableProcessElementsEvaluatorFactory(EvaluationContext evaluationContext) {
this.evaluationContext = evaluationContext;
- this.delegateFactory = new ParDoEvaluatorFactory<>(evaluationContext);
+ this.delegateFactory =
+ new ParDoEvaluatorFactory<>(
+ evaluationContext,
+ SplittableProcessElementsEvaluatorFactory
+ .<InputT, OutputT, RestrictionT>processFnRunnerFactory());
}
@Override
@@ -82,12 +95,12 @@ class SplittableProcessElementsEvaluatorFactory<
final SplittableParDo.ProcessElements<InputT, OutputT, RestrictionT, TrackerT> transform =
application.getTransform();
- SplittableParDo.ProcessFn<InputT, OutputT, RestrictionT, TrackerT> processFn =
+ ProcessFn<InputT, OutputT, RestrictionT, TrackerT> processFn =
transform.newProcessFn(transform.getFn());
DoFnLifecycleManager fnManager = DoFnLifecycleManager.of(processFn);
processFn =
- ((SplittableParDo.ProcessFn<InputT, OutputT, RestrictionT, TrackerT>)
+ ((ProcessFn<InputT, OutputT, RestrictionT, TrackerT>)
fnManager
.<KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>, OutputT>
get());
@@ -98,7 +111,7 @@ class SplittableProcessElementsEvaluatorFactory<
.getExecutionContext(application, inputBundle.getKey())
.getOrCreateStepContext(stepName, stepName);
- ParDoEvaluator<KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>>
+ final ParDoEvaluator<KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>>
parDoEvaluator =
delegateFactory.createParDoEvaluator(
application,
@@ -127,34 +140,36 @@ class SplittableProcessElementsEvaluatorFactory<
}
});
- final OutputManager outputManager = parDoEvaluator.getOutputManager();
+ OutputWindowedValue<OutputT> outputWindowedValue =
+ new OutputWindowedValue<OutputT>() {
+ private final OutputManager outputManager = parDoEvaluator.getOutputManager();
+
+ @Override
+ public void outputWindowedValue(
+ OutputT output,
+ Instant timestamp,
+ Collection<? extends BoundedWindow> windows,
+ PaneInfo pane) {
+ outputManager.output(
+ transform.getMainOutputTag(), WindowedValue.of(output, timestamp, windows, pane));
+ }
+
+ @Override
+ public <AdditionalOutputT> void outputWindowedValue(
+ TupleTag<AdditionalOutputT> tag,
+ AdditionalOutputT output,
+ Instant timestamp,
+ Collection<? extends BoundedWindow> windows,
+ PaneInfo pane) {
+ outputManager.output(tag, WindowedValue.of(output, timestamp, windows, pane));
+ }
+ };
processFn.setProcessElementInvoker(
new OutputAndTimeBoundedSplittableProcessElementInvoker<
InputT, OutputT, RestrictionT, TrackerT>(
transform.getFn(),
evaluationContext.getPipelineOptions(),
- new OutputWindowedValue<OutputT>() {
- @Override
- public void outputWindowedValue(
- OutputT output,
- Instant timestamp,
- Collection<? extends BoundedWindow> windows,
- PaneInfo pane) {
- outputManager.output(
- transform.getMainOutputTag(),
- WindowedValue.of(output, timestamp, windows, pane));
- }
-
- @Override
- public <AdditionalOutputT> void outputWindowedValue(
- TupleTag<AdditionalOutputT> tag,
- AdditionalOutputT output,
- Instant timestamp,
- Collection<? extends BoundedWindow> windows,
- PaneInfo pane) {
- outputManager.output(tag, WindowedValue.of(output, timestamp, windows, pane));
- }
- },
+ outputWindowedValue,
evaluationContext.createSideInputReader(transform.getSideInputs()),
// TODO: For better performance, use a higher-level executor?
Executors.newSingleThreadScheduledExecutor(Executors.defaultThreadFactory()),
@@ -163,4 +178,41 @@ class SplittableProcessElementsEvaluatorFactory<
return DoFnLifecycleManagerRemovingTransformEvaluator.wrapping(parDoEvaluator, fnManager);
}
+
+ private static <InputT, OutputT, RestrictionT>
+ ParDoEvaluator.DoFnRunnerFactory<
+ KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>, OutputT>
+ processFnRunnerFactory() {
+ return new ParDoEvaluator.DoFnRunnerFactory<
+ KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>, OutputT>() {
+ @Override
+ public PushbackSideInputDoFnRunner<
+ KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>, OutputT>
+ createRunner(
+ PipelineOptions options,
+ DoFn<KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>, OutputT> fn,
+ List<PCollectionView<?>> sideInputs,
+ ReadyCheckingSideInputReader sideInputReader,
+ OutputManager outputManager,
+ TupleTag<OutputT> mainOutputTag,
+ List<TupleTag<?>> additionalOutputTags,
+ DirectExecutionContext.DirectStepContext stepContext,
+ AggregatorContainer.Mutator aggregatorChanges,
+ WindowingStrategy<?, ? extends BoundedWindow> windowingStrategy) {
+ ProcessFn<InputT, OutputT, RestrictionT, ?> processFn =
+ (ProcessFn) fn;
+ return DoFnRunners.newProcessFnRunner(
+ processFn,
+ options,
+ sideInputs,
+ sideInputReader,
+ outputManager,
+ mainOutputTag,
+ additionalOutputTags,
+ stepContext,
+ aggregatorChanges,
+ windowingStrategy);
+ }
+ };
+ }
}
http://git-wip-us.apache.org/repos/asf/beam/blob/b93de58f/runners/direct-java/src/main/java/org/apache/beam/runners/direct/StatefulParDoEvaluatorFactory.java
----------------------------------------------------------------------
diff --git a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/StatefulParDoEvaluatorFactory.java b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/StatefulParDoEvaluatorFactory.java
index be77ea1..8793ae8 100644
--- a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/StatefulParDoEvaluatorFactory.java
+++ b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/StatefulParDoEvaluatorFactory.java
@@ -65,7 +65,9 @@ final class StatefulParDoEvaluatorFactory<K, InputT, OutputT> implements Transfo
private final ParDoEvaluatorFactory<KV<K, InputT>, OutputT> delegateFactory;
StatefulParDoEvaluatorFactory(EvaluationContext evaluationContext) {
- this.delegateFactory = new ParDoEvaluatorFactory<>(evaluationContext);
+ this.delegateFactory =
+ new ParDoEvaluatorFactory<>(
+ evaluationContext, ParDoEvaluator.<KV<K, InputT>, OutputT>defaultRunnerFactory());
this.cleanupRegistry =
CacheBuilder.newBuilder()
.weakValues()
http://git-wip-us.apache.org/repos/asf/beam/blob/b93de58f/runners/direct-java/src/main/java/org/apache/beam/runners/direct/TransformEvaluatorRegistry.java
----------------------------------------------------------------------
diff --git a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/TransformEvaluatorRegistry.java b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/TransformEvaluatorRegistry.java
index ae7ad93..d06c460 100644
--- a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/TransformEvaluatorRegistry.java
+++ b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/TransformEvaluatorRegistry.java
@@ -52,7 +52,9 @@ class TransformEvaluatorRegistry implements TransformEvaluatorFactory {
ImmutableMap.<Class<? extends PTransform>, TransformEvaluatorFactory>builder()
.put(Read.Bounded.class, new BoundedReadEvaluatorFactory(ctxt))
.put(Read.Unbounded.class, new UnboundedReadEvaluatorFactory(ctxt))
- .put(ParDo.MultiOutput.class, new ParDoEvaluatorFactory<>(ctxt))
+ .put(
+ ParDo.MultiOutput.class,
+ new ParDoEvaluatorFactory<>(ctxt, ParDoEvaluator.defaultRunnerFactory()))
.put(StatefulParDo.class, new StatefulParDoEvaluatorFactory<>(ctxt))
.put(PCollections.class, new FlattenEvaluatorFactory(ctxt))
.put(WriteView.class, new ViewEvaluatorFactory(ctxt))
http://git-wip-us.apache.org/repos/asf/beam/blob/b93de58f/runners/direct-java/src/test/java/org/apache/beam/runners/direct/ParDoEvaluatorTest.java
----------------------------------------------------------------------
diff --git a/runners/direct-java/src/test/java/org/apache/beam/runners/direct/ParDoEvaluatorTest.java b/runners/direct-java/src/test/java/org/apache/beam/runners/direct/ParDoEvaluatorTest.java
index 2be0f9d..e99e4bf 100644
--- a/runners/direct-java/src/test/java/org/apache/beam/runners/direct/ParDoEvaluatorTest.java
+++ b/runners/direct-java/src/test/java/org/apache/beam/runners/direct/ParDoEvaluatorTest.java
@@ -169,7 +169,8 @@ public class ParDoEvaluatorTest {
ImmutableList.<PCollectionView<?>>of(singletonView),
mainOutputTag,
additionalOutputTags,
- ImmutableMap.<TupleTag<?>, PCollection<?>>of(mainOutputTag, output));
+ ImmutableMap.<TupleTag<?>, PCollection<?>>of(mainOutputTag, output),
+ ParDoEvaluator.<Integer, Integer>defaultRunnerFactory());
}
private static class RecorderFn extends DoFn<Integer, Integer> {
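
The refactor also adds a ParDoEvaluator.create overload that accepts a pre-built PushbackSideInputDoFnRunner, which is what the test above ultimately reaches through the default factory. A hedged sketch of driving that overload directly (the helper class and method names are made up for illustration; it assumes the org.apache.beam.runners.direct package and the nested ParDoEvaluator.BundleOutputManager type, with all dependencies supplied by the caller):

package org.apache.beam.runners.direct;

import java.util.List;
import org.apache.beam.runners.core.DoFnRunner;
import org.apache.beam.runners.core.PushbackSideInputDoFnRunner;
import org.apache.beam.runners.core.SimplePushbackSideInputDoFnRunner;
import org.apache.beam.runners.direct.DirectExecutionContext.DirectStepContext;
import org.apache.beam.sdk.transforms.AppliedPTransform;
import org.apache.beam.sdk.util.ReadyCheckingSideInputReader;
import org.apache.beam.sdk.values.PCollectionView;

/** Illustrative sketch only: assemble a ParDoEvaluator from an already-wired runner. */
class PrebuiltRunnerEvaluators {
  static <InputT, OutputT> ParDoEvaluator<InputT> fromUnderlyingRunner(
      DoFnRunner<InputT, OutputT> underlying,
      List<PCollectionView<?>> sideInputs,
      ReadyCheckingSideInputReader sideInputReader,
      DirectStepContext stepContext,
      AppliedPTransform<?, ?, ?> application,
      AggregatorContainer.Mutator aggregatorChanges,
      ParDoEvaluator.BundleOutputManager outputManager) {
    // Wrap the plain DoFnRunner so elements whose side inputs are not yet
    // ready get pushed back, exactly as the default factory does.
    PushbackSideInputDoFnRunner<InputT, OutputT> runner =
        SimplePushbackSideInputDoFnRunner.create(underlying, sideInputs, sideInputReader);
    // The new overload skips output-manager and side-input wiring and only
    // assembles the evaluator from the supplied pieces.
    return ParDoEvaluator.create(runner, stepContext, application, aggregatorChanges, outputManager);
  }
}

Note that startBundle() is now invoked in the ParDoEvaluator constructor rather than in create(), so callers of this overload no longer wrap it in UserCodeException handling themselves.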
[47/50] [abbrv] beam git commit: This closes #2594
Posted by dh...@apache.org.
This closes #2594
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/391fb77c
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/391fb77c
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/391fb77c
Branch: refs/heads/DSL_SQL
Commit: 391fb77c379d271494527c6f78ef8ada6f40dc23
Parents: 29e054a 1533e2b
Author: Eugene Kirpichov <ki...@google.com>
Authored: Wed Apr 19 11:39:36 2017 -0700
Committer: Eugene Kirpichov <ki...@google.com>
Committed: Wed Apr 19 11:39:36 2017 -0700
----------------------------------------------------------------------
.../sdk/io/gcp/bigquery/BigQuerySourceBase.java | 31 +++++++++++++-------
.../sdk/io/gcp/bigquery/BigQueryIOTest.java | 18 +++++-------
.../sdk/io/gcp/bigquery/FakeJobService.java | 9 ++++++
3 files changed, 37 insertions(+), 21 deletions(-)
----------------------------------------------------------------------
[35/50] [abbrv] beam git commit: [BEAM-1994] Remove Flink examples package
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItemCoder.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItemCoder.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItemCoder.java
deleted file mode 100644
index 9a52330..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItemCoder.java
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-import com.google.common.collect.ImmutableList;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.List;
-import org.apache.beam.runners.core.KeyedWorkItem;
-import org.apache.beam.runners.core.KeyedWorkItemCoder;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.coders.CoderException;
-import org.apache.beam.sdk.coders.StandardCoder;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.util.PropertyNames;
-import org.apache.beam.sdk.util.WindowedValue;
-
-/**
- * Singleton keyed work item coder.
- */
-public class SingletonKeyedWorkItemCoder<K, ElemT>
- extends StandardCoder<SingletonKeyedWorkItem<K, ElemT>> {
- /**
- * Create a new {@link KeyedWorkItemCoder} with the provided key coder, element coder, and window
- * coder.
- */
- public static <K, ElemT> SingletonKeyedWorkItemCoder<K, ElemT> of(
- Coder<K> keyCoder, Coder<ElemT> elemCoder, Coder<? extends BoundedWindow> windowCoder) {
- return new SingletonKeyedWorkItemCoder<>(keyCoder, elemCoder, windowCoder);
- }
-
- @JsonCreator
- public static <K, ElemT> SingletonKeyedWorkItemCoder<K, ElemT> of(
- @JsonProperty(PropertyNames.COMPONENT_ENCODINGS) List<Coder<?>> components) {
- checkArgument(components.size() == 3, "Expecting 3 components, got %s", components.size());
- @SuppressWarnings("unchecked")
- Coder<K> keyCoder = (Coder<K>) components.get(0);
- @SuppressWarnings("unchecked")
- Coder<ElemT> elemCoder = (Coder<ElemT>) components.get(1);
- @SuppressWarnings("unchecked")
- Coder<? extends BoundedWindow> windowCoder = (Coder<? extends BoundedWindow>) components.get(2);
- return new SingletonKeyedWorkItemCoder<>(keyCoder, elemCoder, windowCoder);
- }
-
- private final Coder<K> keyCoder;
- private final Coder<ElemT> elemCoder;
- private final Coder<? extends BoundedWindow> windowCoder;
- private final WindowedValue.FullWindowedValueCoder<ElemT> valueCoder;
-
- private SingletonKeyedWorkItemCoder(
- Coder<K> keyCoder, Coder<ElemT> elemCoder, Coder<? extends BoundedWindow> windowCoder) {
- this.keyCoder = keyCoder;
- this.elemCoder = elemCoder;
- this.windowCoder = windowCoder;
- valueCoder = WindowedValue.FullWindowedValueCoder.of(elemCoder, windowCoder);
- }
-
- public Coder<K> getKeyCoder() {
- return keyCoder;
- }
-
- public Coder<ElemT> getElementCoder() {
- return elemCoder;
- }
-
- @Override
- public void encode(SingletonKeyedWorkItem<K, ElemT> value,
- OutputStream outStream,
- Context context)
- throws CoderException, IOException {
- keyCoder.encode(value.key(), outStream, context.nested());
- valueCoder.encode(value.value, outStream, context);
- }
-
- @Override
- public SingletonKeyedWorkItem<K, ElemT> decode(InputStream inStream, Context context)
- throws CoderException, IOException {
- K key = keyCoder.decode(inStream, context.nested());
- WindowedValue<ElemT> value = valueCoder.decode(inStream, context);
- return new SingletonKeyedWorkItem<>(key, value);
- }
-
- @Override
- public List<? extends Coder<?>> getCoderArguments() {
- return ImmutableList.of(keyCoder, elemCoder, windowCoder);
- }
-
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- keyCoder.verifyDeterministic();
- elemCoder.verifyDeterministic();
- windowCoder.verifyDeterministic();
- }
-
- /**
- * {@inheritDoc}.
- *
- * {@link KeyedWorkItemCoder} is not consistent with equals as it can return a
- * {@link KeyedWorkItem} of a type different from the originally encoded type.
- */
- @Override
- public boolean consistentWithEquals() {
- return false;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SplittableDoFnOperator.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SplittableDoFnOperator.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SplittableDoFnOperator.java
deleted file mode 100644
index 40f70e4..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SplittableDoFnOperator.java
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming;
-
-import static com.google.common.base.Preconditions.checkState;
-
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.Executors;
-import org.apache.beam.runners.core.ElementAndRestriction;
-import org.apache.beam.runners.core.KeyedWorkItem;
-import org.apache.beam.runners.core.KeyedWorkItems;
-import org.apache.beam.runners.core.OutputAndTimeBoundedSplittableProcessElementInvoker;
-import org.apache.beam.runners.core.OutputWindowedValue;
-import org.apache.beam.runners.core.SplittableParDo;
-import org.apache.beam.runners.core.StateInternals;
-import org.apache.beam.runners.core.StateInternalsFactory;
-import org.apache.beam.runners.core.TimerInternals;
-import org.apache.beam.runners.core.TimerInternalsFactory;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.splittabledofn.RestrictionTracker;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.PaneInfo;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.util.WindowingStrategy;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.beam.sdk.values.TupleTag;
-import org.apache.flink.streaming.api.operators.InternalTimer;
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-
-/**
- * Flink operator for executing splittable {@link DoFn DoFns}. Specifically, for executing
- * the {@code @ProcessElement} method of a splittable {@link DoFn}.
- */
-public class SplittableDoFnOperator<
- InputT, FnOutputT, OutputT, RestrictionT, TrackerT extends RestrictionTracker<RestrictionT>>
- extends DoFnOperator<
- KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>, FnOutputT, OutputT> {
-
- public SplittableDoFnOperator(
- DoFn<KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>, FnOutputT> doFn,
- Coder<
- WindowedValue<
- KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>>> inputCoder,
- TupleTag<FnOutputT> mainOutputTag,
- List<TupleTag<?>> additionalOutputTags,
- OutputManagerFactory<OutputT> outputManagerFactory,
- WindowingStrategy<?, ?> windowingStrategy,
- Map<Integer, PCollectionView<?>> sideInputTagMapping,
- Collection<PCollectionView<?>> sideInputs,
- PipelineOptions options,
- Coder<?> keyCoder) {
- super(
- doFn,
- inputCoder,
- mainOutputTag,
- additionalOutputTags,
- outputManagerFactory,
- windowingStrategy,
- sideInputTagMapping,
- sideInputs,
- options,
- keyCoder);
-
- }
-
- @Override
- public void open() throws Exception {
- super.open();
-
- checkState(doFn instanceof SplittableParDo.ProcessFn);
-
- StateInternalsFactory<String> stateInternalsFactory = new StateInternalsFactory<String>() {
- @Override
- public StateInternals<String> stateInternalsForKey(String key) {
- //this will implicitly be keyed by the key of the incoming
- // element or by the key of a firing timer
- return (StateInternals<String>) stateInternals;
- }
- };
- TimerInternalsFactory<String> timerInternalsFactory = new TimerInternalsFactory<String>() {
- @Override
- public TimerInternals timerInternalsForKey(String key) {
- //this will implicitly be keyed like the StateInternalsFactory
- return timerInternals;
- }
- };
-
- ((SplittableParDo.ProcessFn) doFn).setStateInternalsFactory(stateInternalsFactory);
- ((SplittableParDo.ProcessFn) doFn).setTimerInternalsFactory(timerInternalsFactory);
- ((SplittableParDo.ProcessFn) doFn).setProcessElementInvoker(
- new OutputAndTimeBoundedSplittableProcessElementInvoker<>(
- doFn,
- serializedOptions.getPipelineOptions(),
- new OutputWindowedValue<FnOutputT>() {
- @Override
- public void outputWindowedValue(
- FnOutputT output,
- Instant timestamp,
- Collection<? extends BoundedWindow> windows,
- PaneInfo pane) {
- outputManager.output(
- mainOutputTag,
- WindowedValue.of(output, timestamp, windows, pane));
- }
-
- @Override
- public <AdditionalOutputT> void outputWindowedValue(
- TupleTag<AdditionalOutputT> tag,
- AdditionalOutputT output,
- Instant timestamp,
- Collection<? extends BoundedWindow> windows,
- PaneInfo pane) {
- outputManager.output(tag, WindowedValue.of(output, timestamp, windows, pane));
- }
- },
- sideInputReader,
- Executors.newSingleThreadScheduledExecutor(Executors.defaultThreadFactory()),
- 10000,
- Duration.standardSeconds(10)));
- }
-
- @Override
- public void fireTimer(InternalTimer<?, TimerInternals.TimerData> timer) {
- doFnRunner.processElement(WindowedValue.valueInGlobalWindow(
- KeyedWorkItems.<String, ElementAndRestriction<InputT, RestrictionT>>timersWorkItem(
- (String) stateInternals.getKey(),
- Collections.singletonList(timer.getNamespace()))));
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WindowDoFnOperator.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WindowDoFnOperator.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WindowDoFnOperator.java
deleted file mode 100644
index 9b2136c..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WindowDoFnOperator.java
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming;
-
-import static org.apache.beam.runners.core.TimerInternals.TimerData;
-
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-import org.apache.beam.runners.core.GroupAlsoByWindowViaWindowSetNewDoFn;
-import org.apache.beam.runners.core.KeyedWorkItem;
-import org.apache.beam.runners.core.KeyedWorkItems;
-import org.apache.beam.runners.core.StateInternals;
-import org.apache.beam.runners.core.StateInternalsFactory;
-import org.apache.beam.runners.core.SystemReduceFn;
-import org.apache.beam.runners.core.TimerInternals;
-import org.apache.beam.runners.core.TimerInternalsFactory;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.util.WindowingStrategy;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.beam.sdk.values.TupleTag;
-import org.apache.flink.streaming.api.operators.InternalTimer;
-
-/**
- * Flink operator for executing window {@link DoFn DoFns}.
- */
-public class WindowDoFnOperator<K, InputT, OutputT>
- extends DoFnOperator<KeyedWorkItem<K, InputT>, KV<K, OutputT>, WindowedValue<KV<K, OutputT>>> {
-
- private final SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow> systemReduceFn;
-
- public WindowDoFnOperator(
- SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow> systemReduceFn,
- Coder<WindowedValue<KeyedWorkItem<K, InputT>>> inputCoder,
- TupleTag<KV<K, OutputT>> mainOutputTag,
- List<TupleTag<?>> additionalOutputTags,
- OutputManagerFactory<WindowedValue<KV<K, OutputT>>> outputManagerFactory,
- WindowingStrategy<?, ?> windowingStrategy,
- Map<Integer, PCollectionView<?>> sideInputTagMapping,
- Collection<PCollectionView<?>> sideInputs,
- PipelineOptions options,
- Coder<K> keyCoder) {
- super(
- null,
- inputCoder,
- mainOutputTag,
- additionalOutputTags,
- outputManagerFactory,
- windowingStrategy,
- sideInputTagMapping,
- sideInputs,
- options,
- keyCoder);
-
- this.systemReduceFn = systemReduceFn;
-
- }
-
- @Override
- protected DoFn<KeyedWorkItem<K, InputT>, KV<K, OutputT>> getDoFn() {
- StateInternalsFactory<K> stateInternalsFactory = new StateInternalsFactory<K>() {
- @Override
- public StateInternals<K> stateInternalsForKey(K key) {
- //this will implicitly be keyed by the key of the incoming
- // element or by the key of a firing timer
- return (StateInternals<K>) stateInternals;
- }
- };
- TimerInternalsFactory<K> timerInternalsFactory = new TimerInternalsFactory<K>() {
- @Override
- public TimerInternals timerInternalsForKey(K key) {
- //this will implicitly be keyed like the StateInternalsFactory
- return timerInternals;
- }
- };
-
- // we have to do the unchecked cast because GroupAlsoByWindowViaWindowSetDoFn.create
- // has the window type as generic parameter while WindowingStrategy is almost always
- // untyped.
- @SuppressWarnings("unchecked")
- DoFn<KeyedWorkItem<K, InputT>, KV<K, OutputT>> doFn =
- GroupAlsoByWindowViaWindowSetNewDoFn.create(
- windowingStrategy, stateInternalsFactory, timerInternalsFactory, sideInputReader,
- (SystemReduceFn) systemReduceFn, outputManager, mainOutputTag);
- return doFn;
- }
-
- @Override
- public void fireTimer(InternalTimer<?, TimerData> timer) {
- doFnRunner.processElement(WindowedValue.valueInGlobalWindow(
- KeyedWorkItems.<K, InputT>timersWorkItem(
- (K) stateInternals.getKey(),
- Collections.singletonList(timer.getNamespace()))));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WorkItemKeySelector.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WorkItemKeySelector.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WorkItemKeySelector.java
deleted file mode 100644
index 1dff367..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/WorkItemKeySelector.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming;
-
-import java.nio.ByteBuffer;
-import org.apache.beam.runners.core.KeyedWorkItem;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.util.CoderUtils;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.flink.api.common.typeinfo.TypeInformation;
-import org.apache.flink.api.java.functions.KeySelector;
-import org.apache.flink.api.java.typeutils.GenericTypeInfo;
-import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
-
-/**
- * {@link KeySelector} that retrieves a key from a {@link KeyedWorkItem}. This will return
- * the key as encoded by the provided {@link Coder} in a {@link ByteBuffer}. This ensures
- * that all key comparisons/hashing happen on the encoded form.
- */
-public class WorkItemKeySelector<K, V>
- implements KeySelector<WindowedValue<SingletonKeyedWorkItem<K, V>>, ByteBuffer>,
- ResultTypeQueryable<ByteBuffer> {
-
- private final Coder<K> keyCoder;
-
- public WorkItemKeySelector(Coder<K> keyCoder) {
- this.keyCoder = keyCoder;
- }
-
- @Override
- public ByteBuffer getKey(WindowedValue<SingletonKeyedWorkItem<K, V>> value) throws Exception {
- K key = value.getValue().key();
- byte[] keyBytes = CoderUtils.encodeToByteArray(keyCoder, key);
- return ByteBuffer.wrap(keyBytes);
- }
-
- @Override
- public TypeInformation<ByteBuffer> getProducedType() {
- return new GenericTypeInfo<>(ByteBuffer.class);
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/BoundedSourceWrapper.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/BoundedSourceWrapper.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/BoundedSourceWrapper.java
deleted file mode 100644
index 2ed5024..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/BoundedSourceWrapper.java
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming.io;
-
-import com.google.common.annotations.VisibleForTesting;
-import java.util.ArrayList;
-import java.util.List;
-import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
-import org.apache.beam.sdk.io.BoundedSource;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
-import org.apache.beam.sdk.transforms.windowing.PaneInfo;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.flink.api.common.functions.StoppableFunction;
-import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
-import org.apache.flink.streaming.api.watermark.Watermark;
-import org.joda.time.Instant;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Wrapper for executing {@link BoundedSource BoundedSources} as a Flink Source.
- */
-public class BoundedSourceWrapper<OutputT>
- extends RichParallelSourceFunction<WindowedValue<OutputT>>
- implements StoppableFunction {
-
- private static final Logger LOG = LoggerFactory.getLogger(BoundedSourceWrapper.class);
-
- /**
- * Keep the options so that we can initialize the readers.
- */
- private final SerializedPipelineOptions serializedOptions;
-
- /**
- * The split sources. We split them in the constructor to ensure that all parallel
- * sources are consistent about the split sources.
- */
- private List<? extends BoundedSource<OutputT>> splitSources;
-
- /**
- * Make it a field so that we can access it in {@link #close()}.
- */
- private transient List<BoundedSource.BoundedReader<OutputT>> readers;
-
- /**
- * Initialize here and not in run() to prevent races where we cancel a job before run() is
- * ever called or run() is called after cancel().
- */
- private volatile boolean isRunning = true;
-
- @SuppressWarnings("unchecked")
- public BoundedSourceWrapper(
- PipelineOptions pipelineOptions,
- BoundedSource<OutputT> source,
- int parallelism) throws Exception {
- this.serializedOptions = new SerializedPipelineOptions(pipelineOptions);
-
- long desiredBundleSize = source.getEstimatedSizeBytes(pipelineOptions) / parallelism;
-
- // get the splits early. we assume that the generated splits are stable,
- // this is necessary so that the mapping of state to source is correct
- // when restoring
- splitSources = source.split(desiredBundleSize, pipelineOptions);
- }
-
- @Override
- public void run(SourceContext<WindowedValue<OutputT>> ctx) throws Exception {
-
- // figure out which split sources we're responsible for
- int subtaskIndex = getRuntimeContext().getIndexOfThisSubtask();
- int numSubtasks = getRuntimeContext().getNumberOfParallelSubtasks();
-
- List<BoundedSource<OutputT>> localSources = new ArrayList<>();
-
- for (int i = 0; i < splitSources.size(); i++) {
- if (i % numSubtasks == subtaskIndex) {
- localSources.add(splitSources.get(i));
- }
- }
-
- LOG.info("Bounded Flink Source {}/{} is reading from sources: {}",
- subtaskIndex,
- numSubtasks,
- localSources);
-
- readers = new ArrayList<>();
- // initialize readers from scratch
- for (BoundedSource<OutputT> source : localSources) {
- readers.add(source.createReader(serializedOptions.getPipelineOptions()));
- }
-
- if (readers.size() == 1) {
- // the easy case, we just read from one reader
- BoundedSource.BoundedReader<OutputT> reader = readers.get(0);
-
- boolean dataAvailable = reader.start();
- if (dataAvailable) {
- emitElement(ctx, reader);
- }
-
- while (isRunning) {
- dataAvailable = reader.advance();
-
- if (dataAvailable) {
- emitElement(ctx, reader);
- } else {
- break;
- }
- }
- } else {
- // a bit more complicated, we are responsible for several readers
- // loop through them and sleep if none of them had any data
-
- int currentReader = 0;
-
- // start each reader and emit data if immediately available
- for (BoundedSource.BoundedReader<OutputT> reader : readers) {
- boolean dataAvailable = reader.start();
- if (dataAvailable) {
- emitElement(ctx, reader);
- }
- }
-
- // a flag telling us whether any of the readers had data
- // if no reader had data, sleep for bit
- boolean hadData = false;
- while (isRunning && !readers.isEmpty()) {
- BoundedSource.BoundedReader<OutputT> reader = readers.get(currentReader);
- boolean dataAvailable = reader.advance();
-
- if (dataAvailable) {
- emitElement(ctx, reader);
- hadData = true;
- } else {
- readers.remove(currentReader);
- currentReader--;
- if (readers.isEmpty()) {
- break;
- }
- }
-
- currentReader = (currentReader + 1) % readers.size();
- if (currentReader == 0 && !hadData) {
- Thread.sleep(50);
- } else if (currentReader == 0) {
- hadData = false;
- }
- }
-
- }
-
- // emit final Long.MAX_VALUE watermark, just to be sure
- ctx.emitWatermark(new Watermark(Long.MAX_VALUE));
- }
-
- /**
- * Emit the current element from the given Reader. The reader is guaranteed to have data.
- */
- private void emitElement(
- SourceContext<WindowedValue<OutputT>> ctx,
- BoundedSource.BoundedReader<OutputT> reader) {
- // make sure that reader state update and element emission are atomic
- // with respect to snapshots
- synchronized (ctx.getCheckpointLock()) {
-
- OutputT item = reader.getCurrent();
- Instant timestamp = reader.getCurrentTimestamp();
-
- WindowedValue<OutputT> windowedValue =
- WindowedValue.of(item, timestamp, GlobalWindow.INSTANCE, PaneInfo.NO_FIRING);
- ctx.collectWithTimestamp(windowedValue, timestamp.getMillis());
- }
- }
-
- @Override
- public void close() throws Exception {
- super.close();
- if (readers != null) {
- for (BoundedSource.BoundedReader<OutputT> reader: readers) {
- reader.close();
- }
- }
- }
-
- @Override
- public void cancel() {
- isRunning = false;
- }
-
- @Override
- public void stop() {
- this.isRunning = false;
- }
-
- /**
- * Visible so that we can check this in tests. Must not be used for anything else.
- */
- @VisibleForTesting
- public List<? extends BoundedSource<OutputT>> getSplitSources() {
- return splitSources;
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSocketSource.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSocketSource.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSocketSource.java
deleted file mode 100644
index 910a33f..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSocketSource.java
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming.io;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.net.InetSocketAddress;
-import java.net.Socket;
-import java.util.Collections;
-import java.util.List;
-import java.util.NoSuchElementException;
-import javax.annotation.Nullable;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.coders.StringUtf8Coder;
-import org.apache.beam.sdk.io.UnboundedSource;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.joda.time.Instant;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * An example unbounded Beam source that reads input from a socket.
- * This is used mainly for testing and debugging.
- * */
-public class UnboundedSocketSource<CheckpointMarkT extends UnboundedSource.CheckpointMark>
- extends UnboundedSource<String, CheckpointMarkT> {
-
- private static final Coder<String> DEFAULT_SOCKET_CODER = StringUtf8Coder.of();
-
- private static final long serialVersionUID = 1L;
-
- private static final int DEFAULT_CONNECTION_RETRY_SLEEP = 500;
-
- private static final int CONNECTION_TIMEOUT_TIME = 0;
-
- private final String hostname;
- private final int port;
- private final char delimiter;
- private final long maxNumRetries;
- private final long delayBetweenRetries;
-
- public UnboundedSocketSource(String hostname, int port, char delimiter, long maxNumRetries) {
- this(hostname, port, delimiter, maxNumRetries, DEFAULT_CONNECTION_RETRY_SLEEP);
- }
-
- public UnboundedSocketSource(String hostname,
- int port,
- char delimiter,
- long maxNumRetries,
- long delayBetweenRetries) {
- this.hostname = hostname;
- this.port = port;
- this.delimiter = delimiter;
- this.maxNumRetries = maxNumRetries;
- this.delayBetweenRetries = delayBetweenRetries;
- }
-
- public String getHostname() {
- return this.hostname;
- }
-
- public int getPort() {
- return this.port;
- }
-
- public char getDelimiter() {
- return this.delimiter;
- }
-
- public long getMaxNumRetries() {
- return this.maxNumRetries;
- }
-
- public long getDelayBetweenRetries() {
- return this.delayBetweenRetries;
- }
-
- @Override
- public List<? extends UnboundedSource<String, CheckpointMarkT>> split(
- int desiredNumSplits,
- PipelineOptions options) throws Exception {
- return Collections.<UnboundedSource<String, CheckpointMarkT>>singletonList(this);
- }
-
- @Override
- public UnboundedReader<String> createReader(PipelineOptions options,
- @Nullable CheckpointMarkT checkpointMark) {
- return new UnboundedSocketReader(this);
- }
-
- @Nullable
- @Override
- public Coder getCheckpointMarkCoder() {
- // Flink and Dataflow have different checkpointing mechanisms.
- // In our case we do not need a coder.
- return null;
- }
-
- @Override
- public void validate() {
- checkArgument(port > 0 && port < 65536, "port is out of range");
- checkArgument(maxNumRetries >= -1, "maxNumRetries must be zero or larger (num retries), "
- + "or -1 (infinite retries)");
- checkArgument(delayBetweenRetries >= 0, "delayBetweenRetries must be zero or positive");
- }
-
- @Override
- public Coder getDefaultOutputCoder() {
- return DEFAULT_SOCKET_CODER;
- }
-
- /**
- * Unbounded socket reader.
- */
- public static class UnboundedSocketReader extends UnboundedSource.UnboundedReader<String> {
-
- private static final Logger LOG = LoggerFactory.getLogger(UnboundedSocketReader.class);
-
- private final UnboundedSocketSource source;
-
- private Socket socket;
- private BufferedReader reader;
-
- private boolean isRunning;
-
- private String currentRecord;
-
- public UnboundedSocketReader(UnboundedSocketSource source) {
- this.source = source;
- }
-
- private void openConnection() throws IOException {
- this.socket = new Socket();
- this.socket.connect(new InetSocketAddress(this.source.getHostname(), this.source.getPort()),
- CONNECTION_TIMEOUT_TIME);
- this.reader = new BufferedReader(new InputStreamReader(this.socket.getInputStream()));
- this.isRunning = true;
- }
-
- @Override
- public boolean start() throws IOException {
- int attempt = 0;
- while (!isRunning) {
- try {
- openConnection();
- LOG.info("Connected to server socket " + this.source.getHostname() + ':'
- + this.source.getPort());
-
- return advance();
- } catch (IOException e) {
- LOG.info("Lost connection to server socket " + this.source.getHostname() + ':'
- + this.source.getPort() + ". Retrying in "
- + this.source.getDelayBetweenRetries() + " msecs...");
-
- if (this.source.getMaxNumRetries() == -1 || attempt++ < this.source.getMaxNumRetries()) {
- try {
- Thread.sleep(this.source.getDelayBetweenRetries());
- } catch (InterruptedException e1) {
- e1.printStackTrace();
- }
- } else {
- this.isRunning = false;
- break;
- }
- }
- }
- LOG.error("Unable to connect to host " + this.source.getHostname()
- + " : " + this.source.getPort());
- return false;
- }
-
- @Override
- public boolean advance() throws IOException {
- final StringBuilder buffer = new StringBuilder();
- int data;
- while (isRunning && (data = reader.read()) != -1) {
- // check if the string is complete
- if (data != this.source.getDelimiter()) {
- buffer.append((char) data);
- } else {
- if (buffer.length() > 0 && buffer.charAt(buffer.length() - 1) == '\r') {
- buffer.setLength(buffer.length() - 1);
- }
- this.currentRecord = buffer.toString();
- buffer.setLength(0);
- return true;
- }
- }
- return false;
- }
-
- @Override
- public byte[] getCurrentRecordId() throws NoSuchElementException {
- return new byte[0];
- }
-
- @Override
- public String getCurrent() throws NoSuchElementException {
- return this.currentRecord;
- }
-
- @Override
- public Instant getCurrentTimestamp() throws NoSuchElementException {
- return Instant.now();
- }
-
- @Override
- public void close() throws IOException {
- this.reader.close();
- this.socket.close();
- this.isRunning = false;
- LOG.info("Closed connection to server socket at " + this.source.getHostname() + ":"
- + this.source.getPort() + ".");
- }
-
- @Override
- public Instant getWatermark() {
- return Instant.now();
- }
-
- @Override
- public CheckpointMark getCheckpointMark() {
- return null;
- }
-
- @Override
- public UnboundedSource<String, ?> getCurrentSource() {
- return this.source;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSourceWrapper.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSourceWrapper.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSourceWrapper.java
deleted file mode 100644
index bb9b58a..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSourceWrapper.java
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming.io;
-
-import com.google.common.annotations.VisibleForTesting;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.List;
-import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
-import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.coders.KvCoder;
-import org.apache.beam.sdk.coders.SerializableCoder;
-import org.apache.beam.sdk.io.UnboundedSource;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
-import org.apache.beam.sdk.transforms.windowing.PaneInfo;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.TypeDescriptor;
-import org.apache.flink.api.common.ExecutionConfig;
-import org.apache.flink.api.common.functions.StoppableFunction;
-import org.apache.flink.api.common.state.ListState;
-import org.apache.flink.api.common.state.ListStateDescriptor;
-import org.apache.flink.api.common.state.OperatorStateStore;
-import org.apache.flink.configuration.Configuration;
-import org.apache.flink.runtime.state.CheckpointListener;
-import org.apache.flink.runtime.state.DefaultOperatorStateBackend;
-import org.apache.flink.runtime.state.FunctionInitializationContext;
-import org.apache.flink.runtime.state.FunctionSnapshotContext;
-import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
-import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
-import org.apache.flink.streaming.api.operators.StreamingRuntimeContext;
-import org.apache.flink.streaming.api.watermark.Watermark;
-import org.apache.flink.streaming.runtime.tasks.ProcessingTimeCallback;
-import org.joda.time.Instant;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Wrapper for executing {@link UnboundedSource UnboundedSources} as a Flink Source.
- */
-public class UnboundedSourceWrapper<
- OutputT, CheckpointMarkT extends UnboundedSource.CheckpointMark>
- extends RichParallelSourceFunction<WindowedValue<OutputT>>
- implements ProcessingTimeCallback, StoppableFunction,
- CheckpointListener, CheckpointedFunction {
-
- private static final Logger LOG = LoggerFactory.getLogger(UnboundedSourceWrapper.class);
-
- /**
- * Keep the options so that we can initialize the localReaders.
- */
- private final SerializedPipelineOptions serializedOptions;
-
- /**
- * For snapshot and restore.
- */
- private final KvCoder<
- ? extends UnboundedSource<OutputT, CheckpointMarkT>, CheckpointMarkT> checkpointCoder;
-
- /**
- * The split sources. We split them in the constructor to ensure that all parallel
- * sources are consistent about the split sources.
- */
- private final List<? extends UnboundedSource<OutputT, CheckpointMarkT>> splitSources;
-
- /**
- * The local split sources. Assigned at runtime when the wrapper is executed in parallel.
- */
- private transient List<UnboundedSource<OutputT, CheckpointMarkT>> localSplitSources;
-
- /**
- * The local split readers. Assigned at runtime when the wrapper is executed in parallel.
- * Make it a field so that we can access it in {@link #onProcessingTime(long)} for
- * emitting watermarks.
- */
- private transient List<UnboundedSource.UnboundedReader<OutputT>> localReaders;
-
- /**
- * Flag to indicate whether the source is running.
- * Initialize here and not in run() to prevent races where we cancel a job before run() is
- * ever called or run() is called after cancel().
- */
- private volatile boolean isRunning = true;
-
- /**
- * Make it a field so that we can access it in {@link #onProcessingTime(long)} for registering new
- * triggers.
- */
- private transient StreamingRuntimeContext runtimeContext;
-
- /**
- * Make it a field so that we can access it in {@link #onProcessingTime(long)} for emitting
- * watermarks.
- */
- private transient SourceContext<WindowedValue<OutputT>> context;
-
- /**
- * Pending checkpoints which have not been acknowledged yet.
- */
- private transient LinkedHashMap<Long, List<CheckpointMarkT>> pendingCheckpoints;
- /**
- * Keep a maximum of 32 checkpoints for {@code CheckpointMark.finalizeCheckpoint()}.
- */
- private static final int MAX_NUMBER_PENDING_CHECKPOINTS = 32;
-
- private transient ListState<KV<? extends
- UnboundedSource<OutputT, CheckpointMarkT>, CheckpointMarkT>> stateForCheckpoint;
-
- /**
- * False if checkpointCoder is null or if there is no restore state on startup.
- */
- private transient boolean isRestored = false;
-
- @SuppressWarnings("unchecked")
- public UnboundedSourceWrapper(
- PipelineOptions pipelineOptions,
- UnboundedSource<OutputT, CheckpointMarkT> source,
- int parallelism) throws Exception {
- this.serializedOptions = new SerializedPipelineOptions(pipelineOptions);
-
- if (source.requiresDeduping()) {
- LOG.warn("Source {} requires deduping but Flink runner doesn't support this yet.", source);
- }
-
- Coder<CheckpointMarkT> checkpointMarkCoder = source.getCheckpointMarkCoder();
- if (checkpointMarkCoder == null) {
- LOG.info("No CheckpointMarkCoder specified for this source. Won't create snapshots.");
- checkpointCoder = null;
- } else {
-
- Coder<? extends UnboundedSource<OutputT, CheckpointMarkT>> sourceCoder =
- (Coder) SerializableCoder.of(new TypeDescriptor<UnboundedSource>() {
- });
-
- checkpointCoder = KvCoder.of(sourceCoder, checkpointMarkCoder);
- }
-
- // get the splits early. we assume that the generated splits are stable,
- // this is necessary so that the mapping of state to source is correct
- // when restoring
- splitSources = source.split(parallelism, pipelineOptions);
- }
-
-
- /**
- * Initialize and restore state before starting execution of the source.
- */
- @Override
- public void open(Configuration parameters) throws Exception {
- runtimeContext = (StreamingRuntimeContext) getRuntimeContext();
-
- // figure out which split sources we're responsible for
- int subtaskIndex = runtimeContext.getIndexOfThisSubtask();
- int numSubtasks = runtimeContext.getNumberOfParallelSubtasks();
-
- localSplitSources = new ArrayList<>();
- localReaders = new ArrayList<>();
-
- pendingCheckpoints = new LinkedHashMap<>();
-
- if (isRestored) {
- // restore the splitSources from the checkpoint to ensure consistent ordering
- for (KV<? extends UnboundedSource<OutputT, CheckpointMarkT>, CheckpointMarkT> restored:
- stateForCheckpoint.get()) {
- localSplitSources.add(restored.getKey());
- localReaders.add(restored.getKey().createReader(
- serializedOptions.getPipelineOptions(), restored.getValue()));
- }
- } else {
- // initialize localReaders and localSources from scratch
- for (int i = 0; i < splitSources.size(); i++) {
- if (i % numSubtasks == subtaskIndex) {
- UnboundedSource<OutputT, CheckpointMarkT> source =
- splitSources.get(i);
- UnboundedSource.UnboundedReader<OutputT> reader =
- source.createReader(serializedOptions.getPipelineOptions(), null);
- localSplitSources.add(source);
- localReaders.add(reader);
- }
- }
- }
-
- LOG.info("Unbounded Flink Source {}/{} is reading from sources: {}",
- subtaskIndex,
- numSubtasks,
- localSplitSources);
- }
-
- @Override
- public void run(SourceContext<WindowedValue<OutputT>> ctx) throws Exception {
-
- context = ctx;
-
- if (localReaders.size() == 0) {
- // do nothing, but still look busy ...
- // also, output a Long.MAX_VALUE watermark since we know that we're not
- // going to emit anything
- // we can't return here since Flink requires that all operators stay up,
- // otherwise checkpointing would not work correctly anymore
- ctx.emitWatermark(new Watermark(Long.MAX_VALUE));
-
- // wait until this is canceled
- final Object waitLock = new Object();
- while (isRunning) {
- try {
- // Flink will interrupt us at some point
- //noinspection SynchronizationOnLocalVariableOrMethodParameter
- synchronized (waitLock) {
- // don't wait indefinitely, in case something goes horribly wrong
- waitLock.wait(1000);
- }
- } catch (InterruptedException e) {
- if (!isRunning) {
- // restore the interrupted state, and fall through the loop
- Thread.currentThread().interrupt();
- }
- }
- }
- } else if (localReaders.size() == 1) {
- // the easy case, we just read from one reader
- UnboundedSource.UnboundedReader<OutputT> reader = localReaders.get(0);
-
- boolean dataAvailable = reader.start();
- if (dataAvailable) {
- emitElement(ctx, reader);
- }
-
- setNextWatermarkTimer(this.runtimeContext);
-
- while (isRunning) {
- dataAvailable = reader.advance();
-
- if (dataAvailable) {
- emitElement(ctx, reader);
- } else {
- Thread.sleep(50);
- }
- }
- } else {
- // a bit more complicated, we are responsible for several localReaders
- // loop through them and sleep if none of them had any data
-
- int numReaders = localReaders.size();
- int currentReader = 0;
-
- // start each reader and emit data if immediately available
- for (UnboundedSource.UnboundedReader<OutputT> reader : localReaders) {
- boolean dataAvailable = reader.start();
- if (dataAvailable) {
- emitElement(ctx, reader);
- }
- }
-
- // a flag telling us whether any of the localReaders had data
- // if no reader had data, sleep for a bit
- boolean hadData = false;
- while (isRunning) {
- UnboundedSource.UnboundedReader<OutputT> reader = localReaders.get(currentReader);
- boolean dataAvailable = reader.advance();
-
- if (dataAvailable) {
- emitElement(ctx, reader);
- hadData = true;
- }
-
- currentReader = (currentReader + 1) % numReaders;
- if (currentReader == 0 && !hadData) {
- Thread.sleep(50);
- } else if (currentReader == 0) {
- hadData = false;
- }
- }
-
- }
- }
-
- /**
- * Emit the current element from the given Reader. The reader is guaranteed to have data.
- */
- private void emitElement(
- SourceContext<WindowedValue<OutputT>> ctx,
- UnboundedSource.UnboundedReader<OutputT> reader) {
- // make sure that reader state update and element emission are atomic
- // with respect to snapshots
- synchronized (ctx.getCheckpointLock()) {
-
- OutputT item = reader.getCurrent();
- Instant timestamp = reader.getCurrentTimestamp();
-
- WindowedValue<OutputT> windowedValue =
- WindowedValue.of(item, timestamp, GlobalWindow.INSTANCE, PaneInfo.NO_FIRING);
- ctx.collectWithTimestamp(windowedValue, timestamp.getMillis());
- }
- }
-
- @Override
- public void close() throws Exception {
- super.close();
- if (localReaders != null) {
- for (UnboundedSource.UnboundedReader<OutputT> reader: localReaders) {
- reader.close();
- }
- }
- }
-
- @Override
- public void cancel() {
- isRunning = false;
- }
-
- @Override
- public void stop() {
- isRunning = false;
- }
-
- // ------------------------------------------------------------------------
- // Checkpoint and restore
- // ------------------------------------------------------------------------
-
- @Override
- public void snapshotState(FunctionSnapshotContext functionSnapshotContext) throws Exception {
- if (!isRunning) {
- LOG.debug("snapshotState() called on closed source");
- } else {
-
- if (checkpointCoder == null) {
- // no checkpoint coder available in this source
- return;
- }
-
- stateForCheckpoint.clear();
-
- long checkpointId = functionSnapshotContext.getCheckpointId();
-
- // we checkpoint the sources along with the CheckpointMarkT to ensure
- // that we have a correct mapping of checkpoints to sources when
- // restoring
- List<CheckpointMarkT> checkpointMarks = new ArrayList<>(localSplitSources.size());
-
- for (int i = 0; i < localSplitSources.size(); i++) {
- UnboundedSource<OutputT, CheckpointMarkT> source = localSplitSources.get(i);
- UnboundedSource.UnboundedReader<OutputT> reader = localReaders.get(i);
-
- @SuppressWarnings("unchecked")
- CheckpointMarkT mark = (CheckpointMarkT) reader.getCheckpointMark();
- checkpointMarks.add(mark);
- KV<UnboundedSource<OutputT, CheckpointMarkT>, CheckpointMarkT> kv =
- KV.of(source, mark);
- stateForCheckpoint.add(kv);
- }
-
- // cleanup old pending checkpoints and add new checkpoint
- int diff = pendingCheckpoints.size() - MAX_NUMBER_PENDING_CHECKPOINTS;
- if (diff >= 0) {
- for (Iterator<Long> iterator = pendingCheckpoints.keySet().iterator();
- diff >= 0;
- diff--) {
- iterator.next();
- iterator.remove();
- }
- }
- pendingCheckpoints.put(checkpointId, checkpointMarks);
-
- }
- }
-
- @Override
- public void initializeState(FunctionInitializationContext context) throws Exception {
- if (checkpointCoder == null) {
- // no checkpoint coder available in this source
- return;
- }
-
- OperatorStateStore stateStore = context.getOperatorStateStore();
- CoderTypeInformation<
- KV<? extends UnboundedSource<OutputT, CheckpointMarkT>, CheckpointMarkT>>
- typeInformation = (CoderTypeInformation) new CoderTypeInformation<>(checkpointCoder);
- stateForCheckpoint = stateStore.getOperatorState(
- new ListStateDescriptor<>(DefaultOperatorStateBackend.DEFAULT_OPERATOR_STATE_NAME,
- typeInformation.createSerializer(new ExecutionConfig())));
-
- if (context.isRestored()) {
- isRestored = true;
- LOG.info("Restore state found for the UnboundedSourceWrapper.");
- } else {
- LOG.info("No restore state for UnboundedSourceWrapper.");
- }
- }
-
- @Override
- public void onProcessingTime(long timestamp) throws Exception {
- if (this.isRunning) {
- synchronized (context.getCheckpointLock()) {
- // find minimum watermark over all localReaders
- long watermarkMillis = Long.MAX_VALUE;
- for (UnboundedSource.UnboundedReader<OutputT> reader: localReaders) {
- Instant watermark = reader.getWatermark();
- if (watermark != null) {
- watermarkMillis = Math.min(watermark.getMillis(), watermarkMillis);
- }
- }
- context.emitWatermark(new Watermark(watermarkMillis));
- }
- setNextWatermarkTimer(this.runtimeContext);
- }
- }
-
- private void setNextWatermarkTimer(StreamingRuntimeContext runtime) {
- if (this.isRunning) {
- long watermarkInterval = runtime.getExecutionConfig().getAutoWatermarkInterval();
- long timeToNextWatermark = getTimeToNextWatermark(watermarkInterval);
- runtime.getProcessingTimeService().registerTimer(timeToNextWatermark, this);
- }
- }
-
- private long getTimeToNextWatermark(long watermarkInterval) {
- return System.currentTimeMillis() + watermarkInterval;
- }
-
- /**
- * Visible so that we can check this in tests. Must not be used for anything else.
- */
- @VisibleForTesting
- public List<? extends UnboundedSource<OutputT, CheckpointMarkT>> getSplitSources() {
- return splitSources;
- }
-
- /**
- * Visible so that we can check this in tests. Must not be used for anything else.
- */
- @VisibleForTesting
- public List<? extends UnboundedSource<OutputT, CheckpointMarkT>> getLocalSplitSources() {
- return localSplitSources;
- }
-
- @Override
- public void notifyCheckpointComplete(long checkpointId) throws Exception {
-
- List<CheckpointMarkT> checkpointMarks = pendingCheckpoints.get(checkpointId);
-
- if (checkpointMarks != null) {
-
- // remove old checkpoints including the current one
- Iterator<Long> iterator = pendingCheckpoints.keySet().iterator();
- long currentId;
- do {
- currentId = iterator.next();
- iterator.remove();
- } while (currentId != checkpointId);
-
- // confirm all marks
- for (CheckpointMarkT mark : checkpointMarks) {
- mark.finalizeCheckpoint();
- }
-
- }
- }
-}
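The snapshot/acknowledge bookkeeping in the wrapper above follows a simple pattern: checkpoint marks are stored per checkpoint id in insertion order, capped at a maximum, and everything up to an acknowledged id is dropped so the caller can finalize it. Below is a minimal, self-contained sketch of just that pattern; PendingCheckpointTracker and its MarkT parameter are illustrative stand-ins, not Beam or Flink APIs.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Minimal sketch of the pending-checkpoint bookkeeping pattern; MarkT stands in
// for a CheckpointMark-like type and is not a real Beam/Flink API.
public class PendingCheckpointTracker<MarkT> {

  // Upper bound on retained snapshots, mirroring MAX_NUMBER_PENDING_CHECKPOINTS above.
  private static final int MAX_PENDING = 32;

  // Insertion-ordered so the oldest checkpoints are evicted first.
  private final LinkedHashMap<Long, List<MarkT>> pending = new LinkedHashMap<>();

  // Called when a snapshot is taken: remember the marks under the checkpoint id.
  public void onSnapshot(long checkpointId, List<MarkT> marks) {
    // Evict the oldest entries if the cap would be exceeded.
    int overflow = pending.size() - MAX_PENDING;
    Iterator<Long> it = pending.keySet().iterator();
    while (overflow >= 0 && it.hasNext()) {
      it.next();
      it.remove();
      overflow--;
    }
    pending.put(checkpointId, new ArrayList<>(marks));
  }

  // Called when a checkpoint is acknowledged: drop everything up to and including it
  // and return the marks so the caller can finalize them.
  public List<MarkT> onCheckpointComplete(long checkpointId) {
    List<MarkT> marks = pending.get(checkpointId);
    if (marks == null) {
      return null;
    }
    Iterator<Map.Entry<Long, List<MarkT>>> it = pending.entrySet().iterator();
    long current;
    do {
      current = it.next().getKey();
      it.remove();
    } while (current != checkpointId);
    return marks;
  }
}

Keeping the entries in a LinkedHashMap lets both eviction and acknowledgement walk the map in checkpoint order without a separate index.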
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/package-info.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/package-info.java
deleted file mode 100644
index b431ce7..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Internal implementation of the Beam runner for Apache Flink.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming.io;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/package-info.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/package-info.java
deleted file mode 100644
index 0674871..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Internal implementation of the Beam runner for Apache Flink.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkBroadcastStateInternals.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkBroadcastStateInternals.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkBroadcastStateInternals.java
deleted file mode 100644
index 3203446..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkBroadcastStateInternals.java
+++ /dev/null
@@ -1,865 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.translation.wrappers.streaming.state;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import org.apache.beam.runners.core.StateInternals;
-import org.apache.beam.runners.core.StateNamespace;
-import org.apache.beam.runners.core.StateTag;
-import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.coders.ListCoder;
-import org.apache.beam.sdk.coders.MapCoder;
-import org.apache.beam.sdk.coders.StringUtf8Coder;
-import org.apache.beam.sdk.transforms.Combine;
-import org.apache.beam.sdk.transforms.CombineWithContext;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
-import org.apache.beam.sdk.util.CombineContextFactory;
-import org.apache.beam.sdk.util.state.BagState;
-import org.apache.beam.sdk.util.state.CombiningState;
-import org.apache.beam.sdk.util.state.MapState;
-import org.apache.beam.sdk.util.state.ReadableState;
-import org.apache.beam.sdk.util.state.SetState;
-import org.apache.beam.sdk.util.state.State;
-import org.apache.beam.sdk.util.state.StateContext;
-import org.apache.beam.sdk.util.state.StateContexts;
-import org.apache.beam.sdk.util.state.ValueState;
-import org.apache.beam.sdk.util.state.WatermarkHoldState;
-import org.apache.flink.api.common.ExecutionConfig;
-import org.apache.flink.api.common.state.ListState;
-import org.apache.flink.api.common.state.ListStateDescriptor;
-import org.apache.flink.runtime.state.DefaultOperatorStateBackend;
-import org.apache.flink.runtime.state.OperatorStateBackend;
-
-/**
- * {@link StateInternals} that uses a Flink {@link DefaultOperatorStateBackend}
- * to manage the broadcast state.
- * The state is the same on all parallel instances of the operator,
- * so we only need to store the state of operator-0 in the OperatorStateBackend.
- *
- * <p>Note: the index of the key is ignored.
- * This is mainly used for side inputs.
- */
-public class FlinkBroadcastStateInternals<K> implements StateInternals<K> {
-
- private int indexInSubtaskGroup;
- private final DefaultOperatorStateBackend stateBackend;
- // stateName -> <namespace, state>
- private Map<String, Map<String, ?>> stateForNonZeroOperator;
-
- public FlinkBroadcastStateInternals(int indexInSubtaskGroup, OperatorStateBackend stateBackend) {
- // TODO: Flink does not yet expose this through a public API
- this.stateBackend = (DefaultOperatorStateBackend) stateBackend;
- this.indexInSubtaskGroup = indexInSubtaskGroup;
- if (indexInSubtaskGroup != 0) {
- stateForNonZeroOperator = new HashMap<>();
- }
- }
-
- @Override
- public K getKey() {
- return null;
- }
-
- @Override
- public <T extends State> T state(
- final StateNamespace namespace,
- StateTag<? super K, T> address) {
-
- return state(namespace, address, StateContexts.nullContext());
- }
-
- @Override
- public <T extends State> T state(
- final StateNamespace namespace,
- StateTag<? super K, T> address,
- final StateContext<?> context) {
-
- return address.bind(new StateTag.StateBinder<K>() {
-
- @Override
- public <T> ValueState<T> bindValue(
- StateTag<? super K, ValueState<T>> address,
- Coder<T> coder) {
-
- return new FlinkBroadcastValueState<>(stateBackend, address, namespace, coder);
- }
-
- @Override
- public <T> BagState<T> bindBag(
- StateTag<? super K, BagState<T>> address,
- Coder<T> elemCoder) {
-
- return new FlinkBroadcastBagState<>(stateBackend, address, namespace, elemCoder);
- }
-
- @Override
- public <T> SetState<T> bindSet(
- StateTag<? super K, SetState<T>> address,
- Coder<T> elemCoder) {
- throw new UnsupportedOperationException(
- String.format("%s is not supported", SetState.class.getSimpleName()));
- }
-
- @Override
- public <KeyT, ValueT> MapState<KeyT, ValueT> bindMap(
- StateTag<? super K, MapState<KeyT, ValueT>> spec,
- Coder<KeyT> mapKeyCoder, Coder<ValueT> mapValueCoder) {
- throw new UnsupportedOperationException(
- String.format("%s is not supported", MapState.class.getSimpleName()));
- }
-
- @Override
- public <InputT, AccumT, OutputT>
- CombiningState<InputT, AccumT, OutputT>
- bindCombiningValue(
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- Combine.CombineFn<InputT, AccumT, OutputT> combineFn) {
-
- return new FlinkCombiningState<>(
- stateBackend, address, combineFn, namespace, accumCoder);
- }
-
- @Override
- public <InputT, AccumT, OutputT>
- CombiningState<InputT, AccumT, OutputT> bindKeyedCombiningValue(
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- final Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn) {
- return new FlinkKeyedCombiningState<>(
- stateBackend,
- address,
- combineFn,
- namespace,
- accumCoder,
- FlinkBroadcastStateInternals.this);
- }
-
- @Override
- public <InputT, AccumT, OutputT>
- CombiningState<InputT, AccumT, OutputT> bindKeyedCombiningValueWithContext(
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- CombineWithContext.KeyedCombineFnWithContext<
- ? super K, InputT, AccumT, OutputT> combineFn) {
- return new FlinkCombiningStateWithContext<>(
- stateBackend,
- address,
- combineFn,
- namespace,
- accumCoder,
- FlinkBroadcastStateInternals.this,
- CombineContextFactory.createFromStateContext(context));
- }
-
- @Override
- public <W extends BoundedWindow> WatermarkHoldState<W> bindWatermark(
- StateTag<? super K, WatermarkHoldState<W>> address,
- OutputTimeFn<? super W> outputTimeFn) {
- throw new UnsupportedOperationException(
- String.format("%s is not supported", WatermarkHoldState.class.getSimpleName()));
- }
- });
- }
-
- /**
- * 1. Only the operator with subtask index 0 checkpoints anything, because we
- * assume that the state is the same on all parallel instances of the operator.
- *
- * <p>2. A map is used to support namespaces.
- */
- private abstract class AbstractBroadcastState<T> {
-
- private String name;
- private final StateNamespace namespace;
- private final ListStateDescriptor<Map<String, T>> flinkStateDescriptor;
- private final DefaultOperatorStateBackend flinkStateBackend;
-
- AbstractBroadcastState(
- DefaultOperatorStateBackend flinkStateBackend,
- String name,
- StateNamespace namespace,
- Coder<T> coder) {
- this.name = name;
-
- this.namespace = namespace;
- this.flinkStateBackend = flinkStateBackend;
-
- CoderTypeInformation<Map<String, T>> typeInfo =
- new CoderTypeInformation<>(MapCoder.of(StringUtf8Coder.of(), coder));
-
- flinkStateDescriptor = new ListStateDescriptor<>(name,
- typeInfo.createSerializer(new ExecutionConfig()));
- }
-
- /**
- * Get the map (namespace -> T) from index 0.
- */
- Map<String, T> getMap() throws Exception {
- if (indexInSubtaskGroup == 0) {
- return getMapFromBroadcastState();
- } else {
- Map<String, T> result = (Map<String, T>) stateForNonZeroOperator.get(name);
- // maybe restore from BroadcastState of Operator-0
- if (result == null) {
- result = getMapFromBroadcastState();
- if (result != null) {
- stateForNonZeroOperator.put(name, result);
- // we don't need it anymore, must clear it.
- flinkStateBackend.getBroadcastOperatorState(
- flinkStateDescriptor).clear();
- }
- }
- return result;
- }
- }
-
- Map<String, T> getMapFromBroadcastState() throws Exception {
- ListState<Map<String, T>> state = flinkStateBackend.getBroadcastOperatorState(
- flinkStateDescriptor);
- Iterable<Map<String, T>> iterable = state.get();
- Map<String, T> ret = null;
- if (iterable != null) {
- // just use index 0
- Iterator<Map<String, T>> iterator = iterable.iterator();
- if (iterator.hasNext()) {
- ret = iterator.next();
- }
- }
- return ret;
- }
-
- /**
- * Update the map (namespace -> T) from index 0.
- */
- void updateMap(Map<String, T> map) throws Exception {
- if (indexInSubtaskGroup == 0) {
- ListState<Map<String, T>> state = flinkStateBackend.getBroadcastOperatorState(
- flinkStateDescriptor);
- state.clear();
- if (map.size() > 0) {
- state.add(map);
- }
- } else {
- if (map.size() == 0) {
- stateForNonZeroOperator.remove(name);
- // updateMap is always called after getMap, and getMap has already
- // cleared the map in the BroadcastOperatorState,
- // so we don't need to clear it here.
- } else {
- stateForNonZeroOperator.put(name, map);
- }
- }
- }
-
- void writeInternal(T input) {
- try {
- Map<String, T> map = getMap();
- if (map == null) {
- map = new HashMap<>();
- }
- map.put(namespace.stringKey(), input);
- updateMap(map);
- } catch (Exception e) {
- throw new RuntimeException("Error updating state.", e);
- }
- }
-
- T readInternal() {
- try {
- Map<String, T> map = getMap();
- if (map == null) {
- return null;
- } else {
- return map.get(namespace.stringKey());
- }
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
- }
-
- void clearInternal() {
- try {
- Map<String, T> map = getMap();
- if (map != null) {
- map.remove(namespace.stringKey());
- updateMap(map);
- }
- } catch (Exception e) {
- throw new RuntimeException("Error clearing state.", e);
- }
- }
-
- }
-
- private class FlinkBroadcastValueState<K, T>
- extends AbstractBroadcastState<T> implements ValueState<T> {
-
- private final StateNamespace namespace;
- private final StateTag<? super K, ValueState<T>> address;
-
- FlinkBroadcastValueState(
- DefaultOperatorStateBackend flinkStateBackend,
- StateTag<? super K, ValueState<T>> address,
- StateNamespace namespace,
- Coder<T> coder) {
- super(flinkStateBackend, address.getId(), namespace, coder);
-
- this.namespace = namespace;
- this.address = address;
-
- }
-
- @Override
- public void write(T input) {
- writeInternal(input);
- }
-
- @Override
- public ValueState<T> readLater() {
- return this;
- }
-
- @Override
- public T read() {
- return readInternal();
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- FlinkBroadcastValueState<?, ?> that = (FlinkBroadcastValueState<?, ?>) o;
-
- return namespace.equals(that.namespace) && address.equals(that.address);
-
- }
-
- @Override
- public int hashCode() {
- int result = namespace.hashCode();
- result = 31 * result + address.hashCode();
- return result;
- }
-
- @Override
- public void clear() {
- clearInternal();
- }
- }
-
- private class FlinkBroadcastBagState<K, T> extends AbstractBroadcastState<List<T>>
- implements BagState<T> {
-
- private final StateNamespace namespace;
- private final StateTag<? super K, BagState<T>> address;
-
- FlinkBroadcastBagState(
- DefaultOperatorStateBackend flinkStateBackend,
- StateTag<? super K, BagState<T>> address,
- StateNamespace namespace,
- Coder<T> coder) {
- super(flinkStateBackend, address.getId(), namespace, ListCoder.of(coder));
-
- this.namespace = namespace;
- this.address = address;
- }
-
- @Override
- public void add(T input) {
- List<T> list = readInternal();
- if (list == null) {
- list = new ArrayList<>();
- }
- list.add(input);
- writeInternal(list);
- }
-
- @Override
- public BagState<T> readLater() {
- return this;
- }
-
- @Override
- public Iterable<T> read() {
- List<T> result = readInternal();
- return result != null ? result : Collections.<T>emptyList();
- }
-
- @Override
- public ReadableState<Boolean> isEmpty() {
- return new ReadableState<Boolean>() {
- @Override
- public Boolean read() {
- try {
- List<T> result = readInternal();
- return result == null;
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
-
- }
-
- @Override
- public ReadableState<Boolean> readLater() {
- return this;
- }
- };
- }
-
- @Override
- public void clear() {
- clearInternal();
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- FlinkBroadcastBagState<?, ?> that = (FlinkBroadcastBagState<?, ?>) o;
-
- return namespace.equals(that.namespace) && address.equals(that.address);
-
- }
-
- @Override
- public int hashCode() {
- int result = namespace.hashCode();
- result = 31 * result + address.hashCode();
- return result;
- }
- }
-
- private class FlinkCombiningState<K, InputT, AccumT, OutputT>
- extends AbstractBroadcastState<AccumT>
- implements CombiningState<InputT, AccumT, OutputT> {
-
- private final StateNamespace namespace;
- private final StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address;
- private final Combine.CombineFn<InputT, AccumT, OutputT> combineFn;
-
- FlinkCombiningState(
- DefaultOperatorStateBackend flinkStateBackend,
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- Combine.CombineFn<InputT, AccumT, OutputT> combineFn,
- StateNamespace namespace,
- Coder<AccumT> accumCoder) {
- super(flinkStateBackend, address.getId(), namespace, accumCoder);
-
- this.namespace = namespace;
- this.address = address;
- this.combineFn = combineFn;
- }
-
- @Override
- public CombiningState<InputT, AccumT, OutputT> readLater() {
- return this;
- }
-
- @Override
- public void add(InputT value) {
- AccumT current = readInternal();
- if (current == null) {
- current = combineFn.createAccumulator();
- }
- current = combineFn.addInput(current, value);
- writeInternal(current);
- }
-
- @Override
- public void addAccum(AccumT accum) {
- AccumT current = readInternal();
-
- if (current == null) {
- writeInternal(accum);
- } else {
- current = combineFn.mergeAccumulators(Arrays.asList(current, accum));
- writeInternal(current);
- }
- }
-
- @Override
- public AccumT getAccum() {
- return readInternal();
- }
-
- @Override
- public AccumT mergeAccumulators(Iterable<AccumT> accumulators) {
- return combineFn.mergeAccumulators(accumulators);
- }
-
- @Override
- public OutputT read() {
- AccumT accum = readInternal();
- if (accum != null) {
- return combineFn.extractOutput(accum);
- } else {
- return combineFn.extractOutput(combineFn.createAccumulator());
- }
- }
-
- @Override
- public ReadableState<Boolean> isEmpty() {
- return new ReadableState<Boolean>() {
- @Override
- public Boolean read() {
- try {
- return readInternal() == null;
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
-
- }
-
- @Override
- public ReadableState<Boolean> readLater() {
- return this;
- }
- };
- }
-
- @Override
- public void clear() {
- clearInternal();
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- FlinkCombiningState<?, ?, ?, ?> that =
- (FlinkCombiningState<?, ?, ?, ?>) o;
-
- return namespace.equals(that.namespace) && address.equals(that.address);
-
- }
-
- @Override
- public int hashCode() {
- int result = namespace.hashCode();
- result = 31 * result + address.hashCode();
- return result;
- }
- }
-
- private class FlinkKeyedCombiningState<K, InputT, AccumT, OutputT>
- extends AbstractBroadcastState<AccumT>
- implements CombiningState<InputT, AccumT, OutputT> {
-
- private final StateNamespace namespace;
- private final StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address;
- private final Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn;
- private final FlinkBroadcastStateInternals<K> flinkStateInternals;
-
- FlinkKeyedCombiningState(
- DefaultOperatorStateBackend flinkStateBackend,
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn,
- StateNamespace namespace,
- Coder<AccumT> accumCoder,
- FlinkBroadcastStateInternals<K> flinkStateInternals) {
- super(flinkStateBackend, address.getId(), namespace, accumCoder);
-
- this.namespace = namespace;
- this.address = address;
- this.combineFn = combineFn;
- this.flinkStateInternals = flinkStateInternals;
-
- }
-
- @Override
- public CombiningState<InputT, AccumT, OutputT> readLater() {
- return this;
- }
-
- @Override
- public void add(InputT value) {
- try {
- AccumT current = readInternal();
- if (current == null) {
- current = combineFn.createAccumulator(flinkStateInternals.getKey());
- }
- current = combineFn.addInput(flinkStateInternals.getKey(), current, value);
- writeInternal(current);
- } catch (Exception e) {
- throw new RuntimeException("Error adding to state." , e);
- }
- }
-
- @Override
- public void addAccum(AccumT accum) {
- try {
- AccumT current = readInternal();
- if (current == null) {
- writeInternal(accum);
- } else {
- current = combineFn.mergeAccumulators(
- flinkStateInternals.getKey(),
- Arrays.asList(current, accum));
- writeInternal(current);
- }
- } catch (Exception e) {
- throw new RuntimeException("Error adding to state.", e);
- }
- }
-
- @Override
- public AccumT getAccum() {
- try {
- return readInternal();
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
- }
-
- @Override
- public AccumT mergeAccumulators(Iterable<AccumT> accumulators) {
- return combineFn.mergeAccumulators(flinkStateInternals.getKey(), accumulators);
- }
-
- @Override
- public OutputT read() {
- try {
- AccumT accum = readInternal();
- if (accum != null) {
- return combineFn.extractOutput(flinkStateInternals.getKey(), accum);
- } else {
- return combineFn.extractOutput(
- flinkStateInternals.getKey(),
- combineFn.createAccumulator(flinkStateInternals.getKey()));
- }
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
- }
-
- @Override
- public ReadableState<Boolean> isEmpty() {
- return new ReadableState<Boolean>() {
- @Override
- public Boolean read() {
- try {
- return readInternal() == null;
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
-
- }
-
- @Override
- public ReadableState<Boolean> readLater() {
- return this;
- }
- };
- }
-
- @Override
- public void clear() {
- clearInternal();
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- FlinkKeyedCombiningState<?, ?, ?, ?> that =
- (FlinkKeyedCombiningState<?, ?, ?, ?>) o;
-
- return namespace.equals(that.namespace) && address.equals(that.address);
-
- }
-
- @Override
- public int hashCode() {
- int result = namespace.hashCode();
- result = 31 * result + address.hashCode();
- return result;
- }
- }
-
- private class FlinkCombiningStateWithContext<K, InputT, AccumT, OutputT>
- extends AbstractBroadcastState<AccumT>
- implements CombiningState<InputT, AccumT, OutputT> {
-
- private final StateNamespace namespace;
- private final StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address;
- private final CombineWithContext.KeyedCombineFnWithContext<
- ? super K, InputT, AccumT, OutputT> combineFn;
- private final FlinkBroadcastStateInternals<K> flinkStateInternals;
- private final CombineWithContext.Context context;
-
- FlinkCombiningStateWithContext(
- DefaultOperatorStateBackend flinkStateBackend,
- StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
- CombineWithContext.KeyedCombineFnWithContext<
- ? super K, InputT, AccumT, OutputT> combineFn,
- StateNamespace namespace,
- Coder<AccumT> accumCoder,
- FlinkBroadcastStateInternals<K> flinkStateInternals,
- CombineWithContext.Context context) {
- super(flinkStateBackend, address.getId(), namespace, accumCoder);
-
- this.namespace = namespace;
- this.address = address;
- this.combineFn = combineFn;
- this.flinkStateInternals = flinkStateInternals;
- this.context = context;
-
- }
-
- @Override
- public CombiningState<InputT, AccumT, OutputT> readLater() {
- return this;
- }
-
- @Override
- public void add(InputT value) {
- try {
- AccumT current = readInternal();
- if (current == null) {
- current = combineFn.createAccumulator(flinkStateInternals.getKey(), context);
- }
- current = combineFn.addInput(flinkStateInternals.getKey(), current, value, context);
- writeInternal(current);
- } catch (Exception e) {
- throw new RuntimeException("Error adding to state." , e);
- }
- }
-
- @Override
- public void addAccum(AccumT accum) {
- try {
-
- AccumT current = readInternal();
- if (current == null) {
- writeInternal(accum);
- } else {
- current = combineFn.mergeAccumulators(
- flinkStateInternals.getKey(),
- Arrays.asList(current, accum),
- context);
- writeInternal(current);
- }
- } catch (Exception e) {
- throw new RuntimeException("Error adding to state.", e);
- }
- }
-
- @Override
- public AccumT getAccum() {
- try {
- return readInternal();
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
- }
-
- @Override
- public AccumT mergeAccumulators(Iterable<AccumT> accumulators) {
- return combineFn.mergeAccumulators(flinkStateInternals.getKey(), accumulators, context);
- }
-
- @Override
- public OutputT read() {
- try {
- AccumT accum = readInternal();
- return combineFn.extractOutput(flinkStateInternals.getKey(), accum, context);
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
- }
-
- @Override
- public ReadableState<Boolean> isEmpty() {
- return new ReadableState<Boolean>() {
- @Override
- public Boolean read() {
- try {
- return readInternal() == null;
- } catch (Exception e) {
- throw new RuntimeException("Error reading state.", e);
- }
-
- }
-
- @Override
- public ReadableState<Boolean> readLater() {
- return this;
- }
- };
- }
-
- @Override
- public void clear() {
- clearInternal();
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- FlinkCombiningStateWithContext<?, ?, ?, ?> that =
- (FlinkCombiningStateWithContext<?, ?, ?, ?>) o;
-
- return namespace.equals(that.namespace) && address.equals(that.address);
-
- }
-
- @Override
- public int hashCode() {
- int result = namespace.hashCode();
- result = 31 * result + address.hashCode();
- return result;
- }
- }
-
-}
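The AbstractBroadcastState above keeps all namespaces of one state cell in a single Map<String, T> and lets only subtask 0 persist it, with the other subtasks falling back to an in-memory copy after an initial restore. A rough, self-contained sketch of that idea follows; NamespacedBroadcastCell and its Store interface are hypothetical placeholders for Flink's broadcast operator state, not real APIs.

import java.util.HashMap;
import java.util.Map;

// Sketch of the namespace-map pattern: every namespace of one state cell shares a
// single map, and only subtask 0 writes the shared, checkpointed copy.
public class NamespacedBroadcastCell<T> {

  // Placeholder for the backend holding the single broadcast copy of the map.
  public interface Store<V> {
    Map<String, V> load() throws Exception;
    void save(Map<String, V> map) throws Exception;
  }

  private final int subtaskIndex;
  private final Store<T> store;
  private Map<String, T> localCopy; // in-memory copy used on subtasks other than 0

  public NamespacedBroadcastCell(int subtaskIndex, Store<T> store) {
    this.subtaskIndex = subtaskIndex;
    this.store = store;
  }

  public T read(String namespace) throws Exception {
    Map<String, T> map = currentMap();
    return map == null ? null : map.get(namespace);
  }

  public void write(String namespace, T value) throws Exception {
    Map<String, T> map = currentMap();
    if (map == null) {
      map = new HashMap<>();
    }
    map.put(namespace, value);
    if (subtaskIndex == 0) {
      store.save(map); // only subtask 0 writes the shared, checkpointed copy
    } else {
      localCopy = map; // other subtasks only update their in-memory copy
    }
  }

  private Map<String, T> currentMap() throws Exception {
    if (subtaskIndex == 0) {
      return store.load();
    }
    if (localCopy == null) {
      localCopy = store.load(); // may have been restored from subtask 0's checkpoint
    }
    return localCopy;
  }
}

A full implementation also clears the restored broadcast copy on non-zero subtasks once it has been cached locally, as getMap() does above.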
[30/50] [abbrv] beam git commit: [BEAM-1994] Remove Flink examples
package
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java
new file mode 100644
index 0000000..123d5e7
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java
@@ -0,0 +1,1044 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.beam.runners.flink;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import org.apache.beam.runners.core.ElementAndRestriction;
+import org.apache.beam.runners.core.KeyedWorkItem;
+import org.apache.beam.runners.core.SplittableParDo;
+import org.apache.beam.runners.core.SystemReduceFn;
+import org.apache.beam.runners.flink.translation.functions.FlinkAssignWindows;
+import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.DoFnOperator;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.KvToByteBufferKeySelector;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.SingletonKeyedWorkItem;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.SingletonKeyedWorkItemCoder;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.SplittableDoFnOperator;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.WindowDoFnOperator;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.WorkItemKeySelector;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.io.BoundedSourceWrapper;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.io.UnboundedSourceWrapper;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.KvCoder;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.coders.VoidCoder;
+import org.apache.beam.sdk.io.Read;
+import org.apache.beam.sdk.io.TextIO;
+import org.apache.beam.sdk.transforms.Combine;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.Flatten;
+import org.apache.beam.sdk.transforms.GroupByKey;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.join.RawUnionValue;
+import org.apache.beam.sdk.transforms.join.UnionCoder;
+import org.apache.beam.sdk.transforms.reflect.DoFnSignature;
+import org.apache.beam.sdk.transforms.reflect.DoFnSignatures;
+import org.apache.beam.sdk.transforms.splittabledofn.RestrictionTracker;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
+import org.apache.beam.sdk.transforms.windowing.Window;
+import org.apache.beam.sdk.transforms.windowing.WindowFn;
+import org.apache.beam.sdk.util.AppliedCombineFn;
+import org.apache.beam.sdk.util.Reshuffle;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.beam.sdk.values.PValue;
+import org.apache.beam.sdk.values.TupleTag;
+import org.apache.flink.api.common.functions.FlatMapFunction;
+import org.apache.flink.api.common.functions.MapFunction;
+import org.apache.flink.api.common.functions.RichFlatMapFunction;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.core.fs.FileSystem;
+import org.apache.flink.streaming.api.collector.selector.OutputSelector;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.datastream.DataStreamSink;
+import org.apache.flink.streaming.api.datastream.DataStreamSource;
+import org.apache.flink.streaming.api.datastream.KeyedStream;
+import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
+import org.apache.flink.streaming.api.datastream.SplitStream;
+import org.apache.flink.streaming.api.operators.OneInputStreamOperator;
+import org.apache.flink.streaming.api.operators.TwoInputStreamOperator;
+import org.apache.flink.streaming.api.transformations.TwoInputTransformation;
+import org.apache.flink.util.Collector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class contains all the mappings between Beam and Flink
+ * <b>streaming</b> transformations. The {@link FlinkStreamingPipelineTranslator}
+ * traverses the Beam job and comes here to translate the encountered Beam transformations
+ * into Flink ones, based on the mappings available in this class.
+ */
+class FlinkStreamingTransformTranslators {
+
+ // --------------------------------------------------------------------------------------------
+ // Transform Translator Registry
+ // --------------------------------------------------------------------------------------------
+
+ @SuppressWarnings("rawtypes")
+ private static final Map<
+ Class<? extends PTransform>,
+ FlinkStreamingPipelineTranslator.StreamTransformTranslator> TRANSLATORS = new HashMap<>();
+
+ // here you can find all the available translators.
+ static {
+ TRANSLATORS.put(Read.Bounded.class, new BoundedReadSourceTranslator());
+ TRANSLATORS.put(Read.Unbounded.class, new UnboundedReadSourceTranslator());
+ TRANSLATORS.put(TextIO.Write.Bound.class, new TextIOWriteBoundStreamingTranslator());
+
+ TRANSLATORS.put(ParDo.MultiOutput.class, new ParDoStreamingTranslator());
+ TRANSLATORS.put(
+ SplittableParDo.ProcessElements.class, new SplittableProcessElementsStreamingTranslator());
+ TRANSLATORS.put(
+ SplittableParDo.GBKIntoKeyedWorkItems.class, new GBKIntoKeyedWorkItemsTranslator());
+
+
+ TRANSLATORS.put(Window.Assign.class, new WindowAssignTranslator());
+ TRANSLATORS.put(Flatten.PCollections.class, new FlattenPCollectionTranslator());
+ TRANSLATORS.put(
+ FlinkStreamingViewOverrides.CreateFlinkPCollectionView.class,
+ new CreateViewStreamingTranslator());
+
+ TRANSLATORS.put(Reshuffle.class, new ReshuffleTranslatorStreaming());
+ TRANSLATORS.put(GroupByKey.class, new GroupByKeyTranslator());
+ TRANSLATORS.put(Combine.PerKey.class, new CombinePerKeyTranslator());
+ }
+
+ public static FlinkStreamingPipelineTranslator.StreamTransformTranslator<?> getTranslator(
+ PTransform<?, ?> transform) {
+ return TRANSLATORS.get(transform.getClass());
+ }
+
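+
The registry above dispatches on the concrete PTransform subclass: the static block fills a map from transform class to translator, and getTranslator simply looks up the class of the instance, returning null when a transform has no streaming translation. A small generic sketch of that dispatch, with a made-up Translator interface standing in for StreamTransformTranslator:

import java.util.HashMap;
import java.util.Map;

// Generic sketch of class-keyed translator dispatch; Translator is a stand-in type.
class TranslatorRegistry {

  interface Translator<T> {
    void translate(T transform);
  }

  private final Map<Class<?>, Translator<?>> translators = new HashMap<>();

  <T> void register(Class<T> transformClass, Translator<T> translator) {
    translators.put(transformClass, translator);
  }

  // Returns null when nothing is registered for the class, mirroring getTranslator above.
  @SuppressWarnings("unchecked")
  <T> Translator<T> lookup(T transform) {
    return (Translator<T>) translators.get(transform.getClass());
  }
}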
+ // --------------------------------------------------------------------------------------------
+ // Transformation Implementations
+ // --------------------------------------------------------------------------------------------
+
+ private static class TextIOWriteBoundStreamingTranslator
+ extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<TextIO.Write.Bound> {
+
+ private static final Logger LOG =
+ LoggerFactory.getLogger(TextIOWriteBoundStreamingTranslator.class);
+
+ @Override
+ public void translateNode(
+ TextIO.Write.Bound transform,
+ FlinkStreamingTranslationContext context) {
+ PValue input = context.getInput(transform);
+ DataStream<WindowedValue<String>> inputDataStream = context.getInputDataStream(input);
+
+ String filenamePrefix = transform.getFilenamePrefix();
+ String filenameSuffix = transform.getFilenameSuffix();
+ boolean needsValidation = transform.needsValidation();
+ int numShards = transform.getNumShards();
+ String shardNameTemplate = transform.getShardNameTemplate();
+
+ // TODO: Implement these. We need Flink support for this.
+ LOG.warn(
+ "Translation of TextIO.Write.needsValidation not yet supported. Is: {}.",
+ needsValidation);
+ LOG.warn(
+ "Translation of TextIO.Write.filenameSuffix not yet supported. Is: {}.",
+ filenameSuffix);
+ LOG.warn(
+ "Translation of TextIO.Write.shardNameTemplate not yet supported. Is: {}.",
+ shardNameTemplate);
+
+ DataStream<String> dataSink = inputDataStream
+ .flatMap(new FlatMapFunction<WindowedValue<String>, String>() {
+ @Override
+ public void flatMap(
+ WindowedValue<String> value,
+ Collector<String> out)
+ throws Exception {
+ out.collect(value.getValue());
+ }
+ });
+ DataStreamSink<String> output =
+ dataSink.writeAsText(filenamePrefix, FileSystem.WriteMode.OVERWRITE);
+
+ if (numShards > 0) {
+ output.setParallelism(numShards);
+ }
+ }
+ }
+
+ private static class UnboundedReadSourceTranslator<T>
+ extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<Read.Unbounded<T>> {
+
+ @Override
+ public void translateNode(
+ Read.Unbounded<T> transform,
+ FlinkStreamingTranslationContext context) {
+ PCollection<T> output = context.getOutput(transform);
+
+ TypeInformation<WindowedValue<T>> outputTypeInfo =
+ context.getTypeInfo(context.getOutput(transform));
+
+ DataStream<WindowedValue<T>> source;
+ try {
+ UnboundedSourceWrapper<T, ?> sourceWrapper =
+ new UnboundedSourceWrapper<>(
+ context.getPipelineOptions(),
+ transform.getSource(),
+ context.getExecutionEnvironment().getParallelism());
+ source = context
+ .getExecutionEnvironment()
+ .addSource(sourceWrapper).name(transform.getName()).returns(outputTypeInfo);
+ } catch (Exception e) {
+ throw new RuntimeException(
+ "Error while translating UnboundedSource: " + transform.getSource(), e);
+ }
+
+ context.setOutputDataStream(output, source);
+ }
+ }
+
+ private static class BoundedReadSourceTranslator<T>
+ extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<Read.Bounded<T>> {
+
+ @Override
+ public void translateNode(
+ Read.Bounded<T> transform,
+ FlinkStreamingTranslationContext context) {
+ PCollection<T> output = context.getOutput(transform);
+
+ TypeInformation<WindowedValue<T>> outputTypeInfo =
+ context.getTypeInfo(context.getOutput(transform));
+
+
+ DataStream<WindowedValue<T>> source;
+ try {
+ BoundedSourceWrapper<T> sourceWrapper =
+ new BoundedSourceWrapper<>(
+ context.getPipelineOptions(),
+ transform.getSource(),
+ context.getExecutionEnvironment().getParallelism());
+ source = context
+ .getExecutionEnvironment()
+ .addSource(sourceWrapper).name(transform.getName()).returns(outputTypeInfo);
+ } catch (Exception e) {
+ throw new RuntimeException(
+ "Error while translating BoundedSource: " + transform.getSource(), e);
+ }
+
+ context.setOutputDataStream(output, source);
+ }
+ }
+
+ /**
+ * Wraps each element in a {@link RawUnionValue} with the given tag id.
+ */
+ private static class ToRawUnion<T> implements MapFunction<T, RawUnionValue> {
+ private final int intTag;
+
+ public ToRawUnion(int intTag) {
+ this.intTag = intTag;
+ }
+
+ @Override
+ public RawUnionValue map(T o) throws Exception {
+ return new RawUnionValue(intTag, o);
+ }
+ }
+
+ private static Tuple2<Map<Integer, PCollectionView<?>>, DataStream<RawUnionValue>>
+ transformSideInputs(
+ Collection<PCollectionView<?>> sideInputs,
+ FlinkStreamingTranslationContext context) {
+
+ // collect all side inputs
+ Map<TupleTag<?>, Integer> tagToIntMapping = new HashMap<>();
+ Map<Integer, PCollectionView<?>> intToViewMapping = new HashMap<>();
+ int count = 0;
+ for (PCollectionView<?> sideInput: sideInputs) {
+ TupleTag<?> tag = sideInput.getTagInternal();
+ intToViewMapping.put(count, sideInput);
+ tagToIntMapping.put(tag, count);
+ count++;
+ Coder<Iterable<WindowedValue<?>>> coder = sideInput.getCoderInternal();
+ }
+
+
+ List<Coder<?>> inputCoders = new ArrayList<>();
+ for (PCollectionView<?> sideInput: sideInputs) {
+ DataStream<Object> sideInputStream = context.getInputDataStream(sideInput);
+ TypeInformation<Object> tpe = sideInputStream.getType();
+ if (!(tpe instanceof CoderTypeInformation)) {
+ throw new IllegalStateException(
+ "Input Stream TypeInformation is no CoderTypeInformation.");
+ }
+
+ Coder<?> coder = ((CoderTypeInformation) tpe).getCoder();
+ inputCoders.add(coder);
+ }
+
+ UnionCoder unionCoder = UnionCoder.of(inputCoders);
+
+ CoderTypeInformation<RawUnionValue> unionTypeInformation =
+ new CoderTypeInformation<>(unionCoder);
+
+ // transform each side input to RawUnionValue and union them
+ DataStream<RawUnionValue> sideInputUnion = null;
+
+ for (PCollectionView<?> sideInput: sideInputs) {
+ TupleTag<?> tag = sideInput.getTagInternal();
+ final int intTag = tagToIntMapping.get(tag);
+ DataStream<Object> sideInputStream = context.getInputDataStream(sideInput);
+ DataStream<RawUnionValue> unionValueStream =
+ sideInputStream.map(new ToRawUnion<>(intTag)).returns(unionTypeInformation);
+
+ if (sideInputUnion == null) {
+ sideInputUnion = unionValueStream;
+ } else {
+ sideInputUnion = sideInputUnion.union(unionValueStream);
+ }
+ }
+
+ if (sideInputUnion == null) {
+ throw new IllegalStateException("No unioned side inputs, this indicates a bug.");
+ }
+
+ return new Tuple2<>(intToViewMapping, sideInputUnion);
+ }
+
+ /**
+ * Helper for translating {@link ParDo.MultiOutput} and {@link SplittableParDo.ProcessElements}.
+ */
+ static class ParDoTranslationHelper {
+
+ interface DoFnOperatorFactory<InputT, OutputT> {
+ DoFnOperator<InputT, OutputT, RawUnionValue> createDoFnOperator(
+ DoFn<InputT, OutputT> doFn,
+ List<PCollectionView<?>> sideInputs,
+ TupleTag<OutputT> mainOutputTag,
+ List<TupleTag<?>> additionalOutputTags,
+ FlinkStreamingTranslationContext context,
+ WindowingStrategy<?, ?> windowingStrategy,
+ Map<TupleTag<?>, Integer> tagsToLabels,
+ Coder<WindowedValue<InputT>> inputCoder,
+ Coder keyCoder,
+ Map<Integer, PCollectionView<?>> transformedSideInputs);
+ }
+
+ static <InputT, OutputT> void translateParDo(
+ String transformName,
+ DoFn<InputT, OutputT> doFn,
+ PCollection<InputT> input,
+ List<PCollectionView<?>> sideInputs,
+ Map<TupleTag<?>, PValue> outputs,
+ TupleTag<OutputT> mainOutputTag,
+ List<TupleTag<?>> additionalOutputTags,
+ FlinkStreamingTranslationContext context,
+ DoFnOperatorFactory<InputT, OutputT> doFnOperatorFactory) {
+
+ // we assume that the transformation does not change the windowing strategy.
+ WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
+
+ Map<TupleTag<?>, Integer> tagsToLabels =
+ transformTupleTagsToLabels(mainOutputTag, outputs);
+
+ SingleOutputStreamOperator<RawUnionValue> unionOutputStream;
+
+ Coder<WindowedValue<InputT>> inputCoder = context.getCoder(input);
+
+ DataStream<WindowedValue<InputT>> inputDataStream = context.getInputDataStream(input);
+
+ Coder keyCoder = null;
+ boolean stateful = false;
+ DoFnSignature signature = DoFnSignatures.getSignature(doFn.getClass());
+ if (signature.stateDeclarations().size() > 0
+ || signature.timerDeclarations().size() > 0) {
+ // Based on the fact that the signature is stateful, DoFnSignatures ensures
+ // that it is also keyed
+ keyCoder = ((KvCoder) input.getCoder()).getKeyCoder();
+ inputDataStream = inputDataStream.keyBy(new KvToByteBufferKeySelector(keyCoder));
+ stateful = true;
+ } else if (doFn instanceof SplittableParDo.ProcessFn) {
+ // we know that it is keyed on String
+ keyCoder = StringUtf8Coder.of();
+ stateful = true;
+ }
+
+ if (sideInputs.isEmpty()) {
+ DoFnOperator<InputT, OutputT, RawUnionValue> doFnOperator =
+ doFnOperatorFactory.createDoFnOperator(
+ doFn,
+ sideInputs,
+ mainOutputTag,
+ additionalOutputTags,
+ context,
+ windowingStrategy,
+ tagsToLabels,
+ inputCoder,
+ keyCoder,
+ new HashMap<Integer, PCollectionView<?>>() /* side-input mapping */);
+
+ UnionCoder outputUnionCoder = createUnionCoder(outputs);
+
+ CoderTypeInformation<RawUnionValue> outputUnionTypeInformation =
+ new CoderTypeInformation<>(outputUnionCoder);
+
+ unionOutputStream = inputDataStream
+ .transform(transformName, outputUnionTypeInformation, doFnOperator);
+
+ } else {
+ Tuple2<Map<Integer, PCollectionView<?>>, DataStream<RawUnionValue>> transformedSideInputs =
+ transformSideInputs(sideInputs, context);
+
+ DoFnOperator<InputT, OutputT, RawUnionValue> doFnOperator =
+ doFnOperatorFactory.createDoFnOperator(
+ doFn,
+ sideInputs,
+ mainOutputTag,
+ additionalOutputTags,
+ context,
+ windowingStrategy,
+ tagsToLabels,
+ inputCoder,
+ keyCoder,
+ transformedSideInputs.f0);
+
+ UnionCoder outputUnionCoder = createUnionCoder(outputs);
+
+ CoderTypeInformation<RawUnionValue> outputUnionTypeInformation =
+ new CoderTypeInformation<>(outputUnionCoder);
+
+ if (stateful) {
+ // we have to manually construct the two-input transform because we're not
+ // allowed to have only one input keyed, normally.
+ KeyedStream keyedStream = (KeyedStream<?, InputT>) inputDataStream;
+ TwoInputTransformation<
+ WindowedValue<KV<?, InputT>>,
+ RawUnionValue,
+ WindowedValue<OutputT>> rawFlinkTransform = new TwoInputTransformation(
+ keyedStream.getTransformation(),
+ transformedSideInputs.f1.broadcast().getTransformation(),
+ transformName,
+ (TwoInputStreamOperator) doFnOperator,
+ outputUnionTypeInformation,
+ keyedStream.getParallelism());
+
+ rawFlinkTransform.setStateKeyType(keyedStream.getKeyType());
+ rawFlinkTransform.setStateKeySelectors(keyedStream.getKeySelector(), null);
+
+ unionOutputStream = new SingleOutputStreamOperator(
+ keyedStream.getExecutionEnvironment(),
+ rawFlinkTransform) {}; // we have to cheat around the ctor being protected
+
+ keyedStream.getExecutionEnvironment().addOperator(rawFlinkTransform);
+
+ } else {
+ unionOutputStream = inputDataStream
+ .connect(transformedSideInputs.f1.broadcast())
+ .transform(transformName, outputUnionTypeInformation, doFnOperator);
+ }
+ }
+
+ SplitStream<RawUnionValue> splitStream = unionOutputStream
+ .split(new OutputSelector<RawUnionValue>() {
+ @Override
+ public Iterable<String> select(RawUnionValue value) {
+ return Collections.singletonList(Integer.toString(value.getUnionTag()));
+ }
+ });
+
+ for (Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
+ final int outputTag = tagsToLabels.get(output.getKey());
+
+ TypeInformation outputTypeInfo = context.getTypeInfo((PCollection<?>) output.getValue());
+
+ @SuppressWarnings("unchecked")
+ DataStream unwrapped = splitStream.select(String.valueOf(outputTag))
+ .flatMap(new FlatMapFunction<RawUnionValue, Object>() {
+ @Override
+ public void flatMap(RawUnionValue value, Collector<Object> out) throws Exception {
+ out.collect(value.getValue());
+ }
+ }).returns(outputTypeInfo);
+
+ context.setOutputDataStream(output.getValue(), unwrapped);
+ }
+ }
+
+ private static Map<TupleTag<?>, Integer> transformTupleTagsToLabels(
+ TupleTag<?> mainTag,
+ Map<TupleTag<?>, PValue> allTaggedValues) {
+
+ Map<TupleTag<?>, Integer> tagToLabelMap = Maps.newHashMap();
+ int count = 0;
+ tagToLabelMap.put(mainTag, count++);
+ for (TupleTag<?> key : allTaggedValues.keySet()) {
+ if (!tagToLabelMap.containsKey(key)) {
+ tagToLabelMap.put(key, count++);
+ }
+ }
+ return tagToLabelMap;
+ }
+
+ private static UnionCoder createUnionCoder(Map<TupleTag<?>, PValue> taggedCollections) {
+ List<Coder<?>> outputCoders = Lists.newArrayList();
+ for (PValue taggedColl : taggedCollections.values()) {
+ checkArgument(
+ taggedColl instanceof PCollection,
+ "A UnionCoder can only be created for a collection of tagged %ss. Got %s",
+ PCollection.class.getSimpleName(),
+ taggedColl.getClass().getSimpleName());
+ PCollection<?> coll = (PCollection<?>) taggedColl;
+ WindowedValue.FullWindowedValueCoder<?> windowedValueCoder =
+ WindowedValue.getFullCoder(
+ coll.getCoder(),
+ coll.getWindowingStrategy().getWindowFn().windowCoder());
+ outputCoders.add(windowedValueCoder);
+ }
+ return UnionCoder.of(outputCoders);
+ }
+ }
+
+ private static class ParDoStreamingTranslator<InputT, OutputT>
+ extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<
+ ParDo.MultiOutput<InputT, OutputT>> {
+
+ @Override
+ public void translateNode(
+ ParDo.MultiOutput<InputT, OutputT> transform,
+ FlinkStreamingTranslationContext context) {
+
+ ParDoTranslationHelper.translateParDo(
+ transform.getName(),
+ transform.getFn(),
+ (PCollection<InputT>) context.getInput(transform),
+ transform.getSideInputs(),
+ context.getOutputs(transform),
+ transform.getMainOutputTag(),
+ transform.getAdditionalOutputTags().getAll(),
+ context,
+ new ParDoTranslationHelper.DoFnOperatorFactory<InputT, OutputT>() {
+ @Override
+ public DoFnOperator<InputT, OutputT, RawUnionValue> createDoFnOperator(
+ DoFn<InputT, OutputT> doFn,
+ List<PCollectionView<?>> sideInputs,
+ TupleTag<OutputT> mainOutputTag,
+ List<TupleTag<?>> additionalOutputTags,
+ FlinkStreamingTranslationContext context,
+ WindowingStrategy<?, ?> windowingStrategy,
+ Map<TupleTag<?>, Integer> tagsToLabels,
+ Coder<WindowedValue<InputT>> inputCoder,
+ Coder keyCoder,
+ Map<Integer, PCollectionView<?>> transformedSideInputs) {
+ return new DoFnOperator<>(
+ doFn,
+ inputCoder,
+ mainOutputTag,
+ additionalOutputTags,
+ new DoFnOperator.MultiOutputOutputManagerFactory(tagsToLabels),
+ windowingStrategy,
+ transformedSideInputs,
+ sideInputs,
+ context.getPipelineOptions(),
+ keyCoder);
+ }
+ });
+ }
+ }
+
+ private static class SplittableProcessElementsStreamingTranslator<
+ InputT, OutputT, RestrictionT, TrackerT extends RestrictionTracker<RestrictionT>>
+ extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<
+ SplittableParDo.ProcessElements<InputT, OutputT, RestrictionT, TrackerT>> {
+
+ @Override
+ public void translateNode(
+ SplittableParDo.ProcessElements<InputT, OutputT, RestrictionT, TrackerT> transform,
+ FlinkStreamingTranslationContext context) {
+
+ ParDoTranslationHelper.translateParDo(
+ transform.getName(),
+ transform.newProcessFn(transform.getFn()),
+ (PCollection<KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>>)
+ context.getInput(transform),
+ transform.getSideInputs(),
+ context.getOutputs(transform),
+ transform.getMainOutputTag(),
+ transform.getAdditionalOutputTags().getAll(),
+ context,
+ new ParDoTranslationHelper.DoFnOperatorFactory<
+ KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>, OutputT>() {
+ @Override
+ public DoFnOperator<
+ KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>,
+ OutputT,
+ RawUnionValue> createDoFnOperator(
+ DoFn<
+ KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>,
+ OutputT> doFn,
+ List<PCollectionView<?>> sideInputs,
+ TupleTag<OutputT> mainOutputTag,
+ List<TupleTag<?>> additionalOutputTags,
+ FlinkStreamingTranslationContext context,
+ WindowingStrategy<?, ?> windowingStrategy,
+ Map<TupleTag<?>, Integer> tagsToLabels,
+ Coder<
+ WindowedValue<
+ KeyedWorkItem<
+ String,
+ ElementAndRestriction<InputT, RestrictionT>>>> inputCoder,
+ Coder keyCoder,
+ Map<Integer, PCollectionView<?>> transformedSideInputs) {
+ return new SplittableDoFnOperator<>(
+ doFn,
+ inputCoder,
+ mainOutputTag,
+ additionalOutputTags,
+ new DoFnOperator.MultiOutputOutputManagerFactory(tagsToLabels),
+ windowingStrategy,
+ transformedSideInputs,
+ sideInputs,
+ context.getPipelineOptions(),
+ keyCoder);
+ }
+ });
+ }
+ }
+
+ private static class CreateViewStreamingTranslator<ElemT, ViewT>
+ extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<
+ FlinkStreamingViewOverrides.CreateFlinkPCollectionView<ElemT, ViewT>> {
+
+ @Override
+ public void translateNode(
+ FlinkStreamingViewOverrides.CreateFlinkPCollectionView<ElemT, ViewT> transform,
+ FlinkStreamingTranslationContext context) {
+ // just forward
+ DataStream<WindowedValue<List<ElemT>>> inputDataSet =
+ context.getInputDataStream(context.getInput(transform));
+
+ PCollectionView<ViewT> view = context.getOutput(transform);
+
+ context.setOutputDataStream(view, inputDataSet);
+ }
+ }
+
+ private static class WindowAssignTranslator<T>
+ extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<Window.Assign<T>> {
+
+ @Override
+ public void translateNode(
+ Window.Assign<T> transform,
+ FlinkStreamingTranslationContext context) {
+
+ @SuppressWarnings("unchecked")
+ WindowingStrategy<T, BoundedWindow> windowingStrategy =
+ (WindowingStrategy<T, BoundedWindow>)
+ context.getOutput(transform).getWindowingStrategy();
+
+ TypeInformation<WindowedValue<T>> typeInfo =
+ context.getTypeInfo(context.getOutput(transform));
+
+ DataStream<WindowedValue<T>> inputDataStream =
+ context.getInputDataStream(context.getInput(transform));
+
+ WindowFn<T, ? extends BoundedWindow> windowFn = windowingStrategy.getWindowFn();
+
+ FlinkAssignWindows<T, ? extends BoundedWindow> assignWindowsFunction =
+ new FlinkAssignWindows<>(windowFn);
+
+ SingleOutputStreamOperator<WindowedValue<T>> outputDataStream = inputDataStream
+ .flatMap(assignWindowsFunction)
+ .name(context.getOutput(transform).getName())
+ .returns(typeInfo);
+
+ context.setOutputDataStream(context.getOutput(transform), outputDataStream);
+ }
+ }
+
+ private static class ReshuffleTranslatorStreaming<K, InputT>
+ extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<Reshuffle<K, InputT>> {
+
+ @Override
+ public void translateNode(
+ Reshuffle<K, InputT> transform,
+ FlinkStreamingTranslationContext context) {
+
+ DataStream<WindowedValue<KV<K, InputT>>> inputDataSet =
+ context.getInputDataStream(context.getInput(transform));
+
+ context.setOutputDataStream(context.getOutput(transform), inputDataSet.rebalance());
+
+ }
+ }
+
+
+ private static class GroupByKeyTranslator<K, InputT>
+ extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<GroupByKey<K, InputT>> {
+
+ @Override
+ public void translateNode(
+ GroupByKey<K, InputT> transform,
+ FlinkStreamingTranslationContext context) {
+
+ PCollection<KV<K, InputT>> input = context.getInput(transform);
+
+ @SuppressWarnings("unchecked")
+ WindowingStrategy<?, BoundedWindow> windowingStrategy =
+ (WindowingStrategy<?, BoundedWindow>) input.getWindowingStrategy();
+
+ KvCoder<K, InputT> inputKvCoder = (KvCoder<K, InputT>) input.getCoder();
+
+ SingletonKeyedWorkItemCoder<K, InputT> workItemCoder = SingletonKeyedWorkItemCoder.of(
+ inputKvCoder.getKeyCoder(),
+ inputKvCoder.getValueCoder(),
+ input.getWindowingStrategy().getWindowFn().windowCoder());
+
+ DataStream<WindowedValue<KV<K, InputT>>> inputDataStream = context.getInputDataStream(input);
+
+ WindowedValue.
+ FullWindowedValueCoder<SingletonKeyedWorkItem<K, InputT>> windowedWorkItemCoder =
+ WindowedValue.getFullCoder(
+ workItemCoder,
+ input.getWindowingStrategy().getWindowFn().windowCoder());
+
+ CoderTypeInformation<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> workItemTypeInfo =
+ new CoderTypeInformation<>(windowedWorkItemCoder);
+
+ DataStream<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> workItemStream =
+ inputDataStream
+ .flatMap(new ToKeyedWorkItem<K, InputT>())
+ .returns(workItemTypeInfo).name("ToKeyedWorkItem");
+
+ KeyedStream<
+ WindowedValue<
+ SingletonKeyedWorkItem<K, InputT>>, ByteBuffer> keyedWorkItemStream = workItemStream
+ .keyBy(new WorkItemKeySelector<K, InputT>(inputKvCoder.getKeyCoder()));
+
+ SystemReduceFn<K, InputT, Iterable<InputT>, Iterable<InputT>, BoundedWindow> reduceFn =
+ SystemReduceFn.buffering(inputKvCoder.getValueCoder());
+
+ TypeInformation<WindowedValue<KV<K, Iterable<InputT>>>> outputTypeInfo =
+ context.getTypeInfo(context.getOutput(transform));
+
+ DoFnOperator.DefaultOutputManagerFactory<
+ WindowedValue<KV<K, Iterable<InputT>>>> outputManagerFactory =
+ new DoFnOperator.DefaultOutputManagerFactory<>();
+
+ WindowDoFnOperator<K, InputT, Iterable<InputT>> doFnOperator =
+ new WindowDoFnOperator<>(
+ reduceFn,
+ (Coder) windowedWorkItemCoder,
+ new TupleTag<KV<K, Iterable<InputT>>>("main output"),
+ Collections.<TupleTag<?>>emptyList(),
+ outputManagerFactory,
+ windowingStrategy,
+ new HashMap<Integer, PCollectionView<?>>(), /* side-input mapping */
+ Collections.<PCollectionView<?>>emptyList(), /* side inputs */
+ context.getPipelineOptions(),
+ inputKvCoder.getKeyCoder());
+
+ // our operator expects WindowedValue<KeyedWorkItem> while our input stream
+ // is WindowedValue<SingletonKeyedWorkItem>, which is fine but Java doesn't like it ...
+ @SuppressWarnings("unchecked")
+ SingleOutputStreamOperator<WindowedValue<KV<K, Iterable<InputT>>>> outDataStream =
+ keyedWorkItemStream
+ .transform(
+ transform.getName(),
+ outputTypeInfo,
+ (OneInputStreamOperator) doFnOperator);
+
+ context.setOutputDataStream(context.getOutput(transform), outDataStream);
+
+ }
+ }
+
+ private static class CombinePerKeyTranslator<K, InputT, OutputT>
+ extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<
+ Combine.PerKey<K, InputT, OutputT>> {
+
+ @Override
+ boolean canTranslate(
+ Combine.PerKey<K, InputT, OutputT> transform,
+ FlinkStreamingTranslationContext context) {
+
+ // if we have a merging window strategy and side inputs we cannot
+ // translate as a proper combine. We have to group and then run the combine
+ // over the final grouped values.
+ PCollection<KV<K, InputT>> input = context.getInput(transform);
+
+ @SuppressWarnings("unchecked")
+ WindowingStrategy<?, BoundedWindow> windowingStrategy =
+ (WindowingStrategy<?, BoundedWindow>) input.getWindowingStrategy();
+
+ return windowingStrategy.getWindowFn().isNonMerging() || transform.getSideInputs().isEmpty();
+ }
+
+ @Override
+ public void translateNode(
+ Combine.PerKey<K, InputT, OutputT> transform,
+ FlinkStreamingTranslationContext context) {
+
+ PCollection<KV<K, InputT>> input = context.getInput(transform);
+
+ @SuppressWarnings("unchecked")
+ WindowingStrategy<?, BoundedWindow> windowingStrategy =
+ (WindowingStrategy<?, BoundedWindow>) input.getWindowingStrategy();
+
+ KvCoder<K, InputT> inputKvCoder = (KvCoder<K, InputT>) input.getCoder();
+
+ SingletonKeyedWorkItemCoder<K, InputT> workItemCoder = SingletonKeyedWorkItemCoder.of(
+ inputKvCoder.getKeyCoder(),
+ inputKvCoder.getValueCoder(),
+ input.getWindowingStrategy().getWindowFn().windowCoder());
+
+ DataStream<WindowedValue<KV<K, InputT>>> inputDataStream = context.getInputDataStream(input);
+
+ WindowedValue.
+ FullWindowedValueCoder<SingletonKeyedWorkItem<K, InputT>> windowedWorkItemCoder =
+ WindowedValue.getFullCoder(
+ workItemCoder,
+ input.getWindowingStrategy().getWindowFn().windowCoder());
+
+ CoderTypeInformation<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> workItemTypeInfo =
+ new CoderTypeInformation<>(windowedWorkItemCoder);
+
+ DataStream<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> workItemStream =
+ inputDataStream
+ .flatMap(new ToKeyedWorkItem<K, InputT>())
+ .returns(workItemTypeInfo).name("ToKeyedWorkItem");
+
+ KeyedStream<
+ WindowedValue<
+ SingletonKeyedWorkItem<K, InputT>>, ByteBuffer> keyedWorkItemStream = workItemStream
+ .keyBy(new WorkItemKeySelector<K, InputT>(inputKvCoder.getKeyCoder()));
+
+ SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow> reduceFn = SystemReduceFn.combining(
+ inputKvCoder.getKeyCoder(),
+ AppliedCombineFn.withInputCoder(
+ transform.getFn(), input.getPipeline().getCoderRegistry(), inputKvCoder));
+
+ TypeInformation<WindowedValue<KV<K, OutputT>>> outputTypeInfo =
+ context.getTypeInfo(context.getOutput(transform));
+
+ List<PCollectionView<?>> sideInputs = transform.getSideInputs();
+
+ if (sideInputs.isEmpty()) {
+
+ WindowDoFnOperator<K, InputT, OutputT> doFnOperator =
+ new WindowDoFnOperator<>(
+ reduceFn,
+ (Coder) windowedWorkItemCoder,
+ new TupleTag<KV<K, OutputT>>("main output"),
+ Collections.<TupleTag<?>>emptyList(),
+ new DoFnOperator.DefaultOutputManagerFactory<WindowedValue<KV<K, OutputT>>>(),
+ windowingStrategy,
+ new HashMap<Integer, PCollectionView<?>>(), /* side-input mapping */
+ Collections.<PCollectionView<?>>emptyList(), /* side inputs */
+ context.getPipelineOptions(),
+ inputKvCoder.getKeyCoder());
+
+ // our operator expects WindowedValue<KeyedWorkItem> while our input stream
+ // is WindowedValue<SingletonKeyedWorkItem>, which is fine but Java doesn't like it ...
+ @SuppressWarnings("unchecked")
+ SingleOutputStreamOperator<WindowedValue<KV<K, OutputT>>> outDataStream =
+ keyedWorkItemStream.transform(
+ transform.getName(), outputTypeInfo, (OneInputStreamOperator) doFnOperator);
+
+ context.setOutputDataStream(context.getOutput(transform), outDataStream);
+ } else {
+ Tuple2<Map<Integer, PCollectionView<?>>, DataStream<RawUnionValue>> transformSideInputs =
+ transformSideInputs(sideInputs, context);
+
+ WindowDoFnOperator<K, InputT, OutputT> doFnOperator =
+ new WindowDoFnOperator<>(
+ reduceFn,
+ (Coder) windowedWorkItemCoder,
+ new TupleTag<KV<K, OutputT>>("main output"),
+ Collections.<TupleTag<?>>emptyList(),
+ new DoFnOperator.DefaultOutputManagerFactory<WindowedValue<KV<K, OutputT>>>(),
+ windowingStrategy,
+ transformSideInputs.f0,
+ sideInputs,
+ context.getPipelineOptions(),
+ inputKvCoder.getKeyCoder());
+
+ // we have to manually construct the two-input transform because, normally,
+ // we're not allowed to have only one of the two inputs keyed.
+
+ TwoInputTransformation<
+ WindowedValue<SingletonKeyedWorkItem<K, InputT>>,
+ RawUnionValue,
+ WindowedValue<KV<K, OutputT>>> rawFlinkTransform = new TwoInputTransformation<>(
+ keyedWorkItemStream.getTransformation(),
+ transformSideInputs.f1.broadcast().getTransformation(),
+ transform.getName(),
+ (TwoInputStreamOperator) doFnOperator,
+ outputTypeInfo,
+ keyedWorkItemStream.getParallelism());
+
+ rawFlinkTransform.setStateKeyType(keyedWorkItemStream.getKeyType());
+ rawFlinkTransform.setStateKeySelectors(keyedWorkItemStream.getKeySelector(), null);
+
+ @SuppressWarnings({ "unchecked", "rawtypes" })
+ SingleOutputStreamOperator<WindowedValue<KV<K, OutputT>>> outDataStream =
+ new SingleOutputStreamOperator(
+ keyedWorkItemStream.getExecutionEnvironment(),
+ rawFlinkTransform) {}; // we have to cheat around the ctor being protected
+
+ keyedWorkItemStream.getExecutionEnvironment().addOperator(rawFlinkTransform);
+
+ context.setOutputDataStream(context.getOutput(transform), outDataStream);
+ }
+ }
+ }
+
+ private static class GBKIntoKeyedWorkItemsTranslator<K, InputT>
+ extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<
+ SplittableParDo.GBKIntoKeyedWorkItems<K, InputT>> {
+
+ @Override
+ boolean canTranslate(
+ SplittableParDo.GBKIntoKeyedWorkItems<K, InputT> transform,
+ FlinkStreamingTranslationContext context) {
+ return true;
+ }
+
+ @Override
+ public void translateNode(
+ SplittableParDo.GBKIntoKeyedWorkItems<K, InputT> transform,
+ FlinkStreamingTranslationContext context) {
+
+ PCollection<KV<K, InputT>> input = context.getInput(transform);
+
+ KvCoder<K, InputT> inputKvCoder = (KvCoder<K, InputT>) input.getCoder();
+
+ SingletonKeyedWorkItemCoder<K, InputT> workItemCoder = SingletonKeyedWorkItemCoder.of(
+ inputKvCoder.getKeyCoder(),
+ inputKvCoder.getValueCoder(),
+ input.getWindowingStrategy().getWindowFn().windowCoder());
+
+
+ WindowedValue.
+ FullWindowedValueCoder<SingletonKeyedWorkItem<K, InputT>> windowedWorkItemCoder =
+ WindowedValue.getFullCoder(
+ workItemCoder,
+ input.getWindowingStrategy().getWindowFn().windowCoder());
+
+ CoderTypeInformation<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> workItemTypeInfo =
+ new CoderTypeInformation<>(windowedWorkItemCoder);
+
+ DataStream<WindowedValue<KV<K, InputT>>> inputDataStream = context.getInputDataStream(input);
+
+ DataStream<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> workItemStream =
+ inputDataStream
+ .flatMap(new ToKeyedWorkItem<K, InputT>())
+ .returns(workItemTypeInfo).name("ToKeyedWorkItem");
+
+ KeyedStream<
+ WindowedValue<
+ SingletonKeyedWorkItem<K, InputT>>, ByteBuffer> keyedWorkItemStream = workItemStream
+ .keyBy(new WorkItemKeySelector<K, InputT>(inputKvCoder.getKeyCoder()));
+
+ context.setOutputDataStream(context.getOutput(transform), keyedWorkItemStream);
+ }
+ }
+
+ private static class FlattenPCollectionTranslator<T>
+ extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<
+ Flatten.PCollections<T>> {
+
+ @Override
+ public void translateNode(
+ Flatten.PCollections<T> transform,
+ FlinkStreamingTranslationContext context) {
+ Map<TupleTag<?>, PValue> allInputs = context.getInputs(transform);
+
+ if (allInputs.isEmpty()) {
+
+ // create a dummy source to satisfy downstream operations: we cannot
+ // create an empty source in Flink, so we add a flatMap that simply
+ // never forwards the single dummy element
+ DataStreamSource<String> dummySource =
+ context.getExecutionEnvironment().fromElements("dummy");
+
+ DataStream<WindowedValue<T>> result = dummySource.flatMap(
+ new FlatMapFunction<String, WindowedValue<T>>() {
+ @Override
+ public void flatMap(
+ String s,
+ Collector<WindowedValue<T>> collector) throws Exception {
+ // never return anything
+ }
+ }).returns(
+ new CoderTypeInformation<>(
+ WindowedValue.getFullCoder(
+ (Coder<T>) VoidCoder.of(),
+ GlobalWindow.Coder.INSTANCE)));
+ context.setOutputDataStream(context.getOutput(transform), result);
+
+ } else {
+ DataStream<T> result = null;
+ for (PValue input : allInputs.values()) {
+ DataStream<T> current = context.getInputDataStream(input);
+ result = (result == null) ? current : result.union(current);
+ }
+ context.setOutputDataStream(context.getOutput(transform), result);
+ }
+ }
+ }
+
+ private static class ToKeyedWorkItem<K, InputT>
+ extends RichFlatMapFunction<
+ WindowedValue<KV<K, InputT>>,
+ WindowedValue<SingletonKeyedWorkItem<K, InputT>>> {
+
+ @Override
+ public void flatMap(
+ WindowedValue<KV<K, InputT>> inWithMultipleWindows,
+ Collector<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> out) throws Exception {
+
+ // for now we need to wrap the element as one work item per window,
+ // since otherwise the PushbackSideInputRunner will not correctly
+ // determine whether side inputs are ready
+ //
+ // this is tracked as https://issues.apache.org/jira/browse/BEAM-1850
+ for (WindowedValue<KV<K, InputT>> in : inWithMultipleWindows.explodeWindows()) {
+ SingletonKeyedWorkItem<K, InputT> workItem =
+ new SingletonKeyedWorkItem<>(
+ in.getValue().getKey(),
+ in.withValue(in.getValue().getValue()));
+
+ out.collect(in.withValue(workItem));
+ }
+ }
+ }
+
+}
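
The multi-output ParDo translation above routes every output through a single RawUnionValue stream and splits it again afterwards; which union tag an output receives is decided by transformTupleTagsToLabels, where the main output is always label 0 and additional outputs get increasing integers. A minimal, self-contained sketch of that numbering (plain Strings stand in for TupleTags; the tag names are made up for illustration):

    import java.util.Arrays;
    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.Map;

    public class TagToLabelSketch {
      // Mirrors transformTupleTagsToLabels(): main output -> 0, others -> 1, 2, ...
      static Map<String, Integer> tagsToLabels(String mainTag, List<String> allTags) {
        Map<String, Integer> labels = new LinkedHashMap<>();
        int count = 0;
        labels.put(mainTag, count++);
        for (String tag : allTags) {
          if (!labels.containsKey(tag)) {
            labels.put(tag, count++);
          }
        }
        return labels;
      }

      public static void main(String[] args) {
        // The main tag may also appear in the full output map, just like in the translator.
        System.out.println(tagsToLabels("main", Arrays.asList("main", "late", "errors")));
        // -> {main=0, late=1, errors=2}
      }
    }
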
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTranslationContext.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTranslationContext.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTranslationContext.java
new file mode 100644
index 0000000..1a943a3
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTranslationContext.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import com.google.common.collect.Iterables;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.transforms.AppliedPTransform;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PInput;
+import org.apache.beam.sdk.values.POutput;
+import org.apache.beam.sdk.values.PValue;
+import org.apache.beam.sdk.values.TupleTag;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+
+/**
+ * Helper for keeping track of which {@link DataStream DataStreams} map
+ * to which {@link PTransform PTransforms}.
+ */
+class FlinkStreamingTranslationContext {
+
+ private final StreamExecutionEnvironment env;
+ private final PipelineOptions options;
+
+ /**
+ * Keeps a mapping between the output value of a PTransform (in Beam) and the
+ * Flink operator that produced it, after the translation of the corresponding
+ * PTransform to its Flink equivalent.
+ */
+ private final Map<PValue, DataStream<?>> dataStreams;
+
+ private AppliedPTransform<?, ?, ?> currentTransform;
+
+ public FlinkStreamingTranslationContext(StreamExecutionEnvironment env, PipelineOptions options) {
+ this.env = checkNotNull(env);
+ this.options = checkNotNull(options);
+ this.dataStreams = new HashMap<>();
+ }
+
+ public StreamExecutionEnvironment getExecutionEnvironment() {
+ return env;
+ }
+
+ public PipelineOptions getPipelineOptions() {
+ return options;
+ }
+
+ @SuppressWarnings("unchecked")
+ public <T> DataStream<T> getInputDataStream(PValue value) {
+ return (DataStream<T>) dataStreams.get(value);
+ }
+
+ public void setOutputDataStream(PValue value, DataStream<?> set) {
+ if (!dataStreams.containsKey(value)) {
+ dataStreams.put(value, set);
+ }
+ }
+
+ /**
+ * Sets the {@link AppliedPTransform} that carries the inputs and outputs of the
+ * transform currently being translated.
+ * @param currentTransform the transform currently being translated
+ */
+ public void setCurrentTransform(AppliedPTransform<?, ?, ?> currentTransform) {
+ this.currentTransform = currentTransform;
+ }
+
+ public <T> Coder<WindowedValue<T>> getCoder(PCollection<T> collection) {
+ Coder<T> valueCoder = collection.getCoder();
+
+ return WindowedValue.getFullCoder(
+ valueCoder,
+ collection.getWindowingStrategy().getWindowFn().windowCoder());
+ }
+
+ @SuppressWarnings("unchecked")
+ public <T> TypeInformation<WindowedValue<T>> getTypeInfo(PCollection<T> collection) {
+ Coder<T> valueCoder = collection.getCoder();
+ WindowedValue.FullWindowedValueCoder<T> windowedValueCoder =
+ WindowedValue.getFullCoder(
+ valueCoder,
+ collection.getWindowingStrategy().getWindowFn().windowCoder());
+
+ return new CoderTypeInformation<>(windowedValueCoder);
+ }
+
+
+ @SuppressWarnings("unchecked")
+ public <T extends PValue> T getInput(PTransform<T, ?> transform) {
+ return (T) Iterables.getOnlyElement(currentTransform.getInputs().values());
+ }
+
+ public <T extends PInput> Map<TupleTag<?>, PValue> getInputs(PTransform<T, ?> transform) {
+ return currentTransform.getInputs();
+ }
+
+ @SuppressWarnings("unchecked")
+ public <T extends PValue> T getOutput(PTransform<?, T> transform) {
+ return (T) Iterables.getOnlyElement(currentTransform.getOutputs().values());
+ }
+
+ public <OutputT extends POutput> Map<TupleTag<?>, PValue> getOutputs(
+ PTransform<?, OutputT> transform) {
+ return currentTransform.getOutputs();
+ }
+
+}
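
The context above is essentially a PValue-to-DataStream registry scoped to the transform currently being visited. A hypothetical translator body (a sketch only, assuming it sits in the same org.apache.beam.runners.flink package as this package-private context) would use it as follows: look up the input stream, build whatever Flink operator implements the transform, and register the resulting stream for downstream transforms.

    // Sketch of the common translator pattern; IdentityTranslator is a made-up name.
    private static class IdentityTranslator<T>
        extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<
            PTransform<PCollection<T>, PCollection<T>>> {

      @Override
      public void translateNode(
          PTransform<PCollection<T>, PCollection<T>> transform,
          FlinkStreamingTranslationContext context) {
        DataStream<WindowedValue<T>> input =
            context.getInputDataStream(context.getInput(transform));
        // ... apply the Flink operator(s) that implement the transform here ...
        context.setOutputDataStream(context.getOutput(transform), input);
      }
    }
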
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingViewOverrides.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingViewOverrides.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingViewOverrides.java
new file mode 100644
index 0000000..f955f2a
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingViewOverrides.java
@@ -0,0 +1,372 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.CoderRegistry;
+import org.apache.beam.sdk.coders.KvCoder;
+import org.apache.beam.sdk.coders.ListCoder;
+import org.apache.beam.sdk.transforms.Combine;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.View;
+import org.apache.beam.sdk.util.PCollectionViews;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PCollectionView;
+
+/**
+ * Flink streaming overrides for various view (side input) transforms.
+ */
+class FlinkStreamingViewOverrides {
+
+ /**
+ * Specialized implementation for
+ * {@link org.apache.beam.sdk.transforms.View.AsMap View.AsMap}
+ * for the Flink runner in streaming mode.
+ */
+ static class StreamingViewAsMap<K, V>
+ extends PTransform<PCollection<KV<K, V>>, PCollectionView<Map<K, V>>> {
+
+ private final transient FlinkRunner runner;
+
+ @SuppressWarnings("unused") // used via reflection in FlinkRunner#apply()
+ public StreamingViewAsMap(FlinkRunner runner, View.AsMap<K, V> transform) {
+ this.runner = runner;
+ }
+
+ @Override
+ public PCollectionView<Map<K, V>> expand(PCollection<KV<K, V>> input) {
+ PCollectionView<Map<K, V>> view =
+ PCollectionViews.mapView(
+ input,
+ input.getWindowingStrategy(),
+ input.getCoder());
+
+ @SuppressWarnings({"rawtypes", "unchecked"})
+ KvCoder<K, V> inputCoder = (KvCoder) input.getCoder();
+ try {
+ inputCoder.getKeyCoder().verifyDeterministic();
+ } catch (Coder.NonDeterministicException e) {
+ runner.recordViewUsesNonDeterministicKeyCoder(this);
+ }
+
+ return input
+ .apply(Combine.globally(new Concatenate<KV<K, V>>()).withoutDefaults())
+ .apply(CreateFlinkPCollectionView.<KV<K, V>, Map<K, V>>of(view));
+ }
+
+ @Override
+ protected String getKindString() {
+ return "StreamingViewAsMap";
+ }
+ }
+
+ /**
+ * Specialized expansion for {@link
+ * View.AsMultimap View.AsMultimap} for the
+ * Flink runner in streaming mode.
+ */
+ static class StreamingViewAsMultimap<K, V>
+ extends PTransform<PCollection<KV<K, V>>, PCollectionView<Map<K, Iterable<V>>>> {
+
+ private final transient FlinkRunner runner;
+
+ /**
+ * Builds an instance of this class from the overridden transform.
+ */
+ @SuppressWarnings("unused") // used via reflection in FlinkRunner#apply()
+ public StreamingViewAsMultimap(FlinkRunner runner, View.AsMultimap<K, V> transform) {
+ this.runner = runner;
+ }
+
+ @Override
+ public PCollectionView<Map<K, Iterable<V>>> expand(PCollection<KV<K, V>> input) {
+ PCollectionView<Map<K, Iterable<V>>> view =
+ PCollectionViews.multimapView(
+ input,
+ input.getWindowingStrategy(),
+ input.getCoder());
+
+ @SuppressWarnings({"rawtypes", "unchecked"})
+ KvCoder<K, V> inputCoder = (KvCoder) input.getCoder();
+ try {
+ inputCoder.getKeyCoder().verifyDeterministic();
+ } catch (Coder.NonDeterministicException e) {
+ runner.recordViewUsesNonDeterministicKeyCoder(this);
+ }
+
+ return input
+ .apply(Combine.globally(new Concatenate<KV<K, V>>()).withoutDefaults())
+ .apply(CreateFlinkPCollectionView.<KV<K, V>, Map<K, Iterable<V>>>of(view));
+ }
+
+ @Override
+ protected String getKindString() {
+ return "StreamingViewAsMultimap";
+ }
+ }
+
+ /**
+ * Specialized implementation for
+ * {@link View.AsList View.AsList} for the
+ * Flink runner in streaming mode.
+ */
+ static class StreamingViewAsList<T>
+ extends PTransform<PCollection<T>, PCollectionView<List<T>>> {
+ /**
+ * Builds an instance of this class from the overridden transform.
+ */
+ @SuppressWarnings("unused") // used via reflection in FlinkRunner#apply()
+ public StreamingViewAsList(FlinkRunner runner, View.AsList<T> transform) {}
+
+ @Override
+ public PCollectionView<List<T>> expand(PCollection<T> input) {
+ PCollectionView<List<T>> view =
+ PCollectionViews.listView(
+ input,
+ input.getWindowingStrategy(),
+ input.getCoder());
+
+ return input.apply(Combine.globally(new Concatenate<T>()).withoutDefaults())
+ .apply(CreateFlinkPCollectionView.<T, List<T>>of(view));
+ }
+
+ @Override
+ protected String getKindString() {
+ return "StreamingViewAsList";
+ }
+ }
+
+ /**
+ * Specialized implementation for
+ * {@link View.AsIterable View.AsIterable} for the
+ * Flink runner in streaming mode.
+ */
+ static class StreamingViewAsIterable<T>
+ extends PTransform<PCollection<T>, PCollectionView<Iterable<T>>> {
+ /**
+ * Builds an instance of this class from the overridden transform.
+ */
+ @SuppressWarnings("unused") // used via reflection in FlinkRunner#apply()
+ public StreamingViewAsIterable(FlinkRunner runner, View.AsIterable<T> transform) { }
+
+ @Override
+ public PCollectionView<Iterable<T>> expand(PCollection<T> input) {
+ PCollectionView<Iterable<T>> view =
+ PCollectionViews.iterableView(
+ input,
+ input.getWindowingStrategy(),
+ input.getCoder());
+
+ return input.apply(Combine.globally(new Concatenate<T>()).withoutDefaults())
+ .apply(CreateFlinkPCollectionView.<T, Iterable<T>>of(view));
+ }
+
+ @Override
+ protected String getKindString() {
+ return "StreamingViewAsIterable";
+ }
+ }
+
+ /**
+ * Specialized expansion for
+ * {@link View.AsSingleton View.AsSingleton} for the
+ * Flink runner in streaming mode.
+ */
+ static class StreamingViewAsSingleton<T>
+ extends PTransform<PCollection<T>, PCollectionView<T>> {
+ private View.AsSingleton<T> transform;
+
+ /**
+ * Builds an instance of this class from the overridden transform.
+ */
+ @SuppressWarnings("unused") // used via reflection in FlinkRunner#apply()
+ public StreamingViewAsSingleton(FlinkRunner runner, View.AsSingleton<T> transform) {
+ this.transform = transform;
+ }
+
+ @Override
+ public PCollectionView<T> expand(PCollection<T> input) {
+ Combine.Globally<T, T> combine = Combine.globally(
+ new SingletonCombine<>(transform.hasDefaultValue(), transform.defaultValue()));
+ if (!transform.hasDefaultValue()) {
+ combine = combine.withoutDefaults();
+ }
+ return input.apply(combine.asSingletonView());
+ }
+
+ @Override
+ protected String getKindString() {
+ return "StreamingViewAsSingleton";
+ }
+
+ private static class SingletonCombine<T> extends Combine.BinaryCombineFn<T> {
+ private boolean hasDefaultValue;
+ private T defaultValue;
+
+ SingletonCombine(boolean hasDefaultValue, T defaultValue) {
+ this.hasDefaultValue = hasDefaultValue;
+ this.defaultValue = defaultValue;
+ }
+
+ @Override
+ public T apply(T left, T right) {
+ throw new IllegalArgumentException("PCollection with more than one element "
+ + "accessed as a singleton view. Consider using Combine.globally().asSingleton() to "
+ + "combine the PCollection into a single value");
+ }
+
+ @Override
+ public T identity() {
+ if (hasDefaultValue) {
+ return defaultValue;
+ } else {
+ throw new IllegalArgumentException(
+ "Empty PCollection accessed as a singleton view. "
+ + "Consider setting withDefault to provide a default value");
+ }
+ }
+ }
+ }
+
+ static class StreamingCombineGloballyAsSingletonView<InputT, OutputT>
+ extends PTransform<PCollection<InputT>, PCollectionView<OutputT>> {
+ Combine.GloballyAsSingletonView<InputT, OutputT> transform;
+
+ /**
+ * Builds an instance of this class from the overridden transform.
+ */
+ @SuppressWarnings("unused") // used via reflection in FlinkRunner#apply()
+ public StreamingCombineGloballyAsSingletonView(
+ FlinkRunner runner,
+ Combine.GloballyAsSingletonView<InputT, OutputT> transform) {
+ this.transform = transform;
+ }
+
+ @Override
+ public PCollectionView<OutputT> expand(PCollection<InputT> input) {
+ PCollection<OutputT> combined =
+ input.apply(Combine.globally(transform.getCombineFn())
+ .withoutDefaults()
+ .withFanout(transform.getFanout()));
+
+ PCollectionView<OutputT> view = PCollectionViews.singletonView(
+ combined,
+ combined.getWindowingStrategy(),
+ transform.getInsertDefault(),
+ transform.getInsertDefault()
+ ? transform.getCombineFn().defaultValue() : null,
+ combined.getCoder());
+ return combined
+ .apply(ParDo.of(new WrapAsList<OutputT>()))
+ .apply(CreateFlinkPCollectionView.<OutputT, OutputT>of(view));
+ }
+
+ @Override
+ protected String getKindString() {
+ return "StreamingCombineGloballyAsSingletonView";
+ }
+ }
+
+ private static class WrapAsList<T> extends DoFn<T, List<T>> {
+ @ProcessElement
+ public void processElement(ProcessContext c) {
+ c.output(Collections.singletonList(c.element()));
+ }
+ }
+
+ /**
+ * Combiner that combines {@code T}s into a single {@code List<T>} containing all inputs.
+ *
+ * <p>For internal use by {@link StreamingViewAsMap}, {@link StreamingViewAsMultimap},
+ * {@link StreamingViewAsList}, {@link StreamingViewAsIterable}.
+ * They require that the input {@link PCollection} fits in memory.
+ * For a large {@link PCollection} this is expected to crash!
+ *
+ * @param <T> the type of elements to concatenate.
+ */
+ private static class Concatenate<T> extends Combine.CombineFn<T, List<T>, List<T>> {
+ @Override
+ public List<T> createAccumulator() {
+ return new ArrayList<T>();
+ }
+
+ @Override
+ public List<T> addInput(List<T> accumulator, T input) {
+ accumulator.add(input);
+ return accumulator;
+ }
+
+ @Override
+ public List<T> mergeAccumulators(Iterable<List<T>> accumulators) {
+ List<T> result = createAccumulator();
+ for (List<T> accumulator : accumulators) {
+ result.addAll(accumulator);
+ }
+ return result;
+ }
+
+ @Override
+ public List<T> extractOutput(List<T> accumulator) {
+ return accumulator;
+ }
+
+ @Override
+ public Coder<List<T>> getAccumulatorCoder(CoderRegistry registry, Coder<T> inputCoder) {
+ return ListCoder.of(inputCoder);
+ }
+
+ @Override
+ public Coder<List<T>> getDefaultOutputCoder(CoderRegistry registry, Coder<T> inputCoder) {
+ return ListCoder.of(inputCoder);
+ }
+ }
+
+ /**
+ * Creates a primitive {@link PCollectionView}.
+ *
+ * <p>For internal use only by runner implementors.
+ *
+ * @param <ElemT> The type of the elements of the input PCollection
+ * @param <ViewT> The type associated with the {@link PCollectionView} used as a side input
+ */
+ public static class CreateFlinkPCollectionView<ElemT, ViewT>
+ extends PTransform<PCollection<List<ElemT>>, PCollectionView<ViewT>> {
+ private PCollectionView<ViewT> view;
+
+ private CreateFlinkPCollectionView(PCollectionView<ViewT> view) {
+ this.view = view;
+ }
+
+ public static <ElemT, ViewT> CreateFlinkPCollectionView<ElemT, ViewT> of(
+ PCollectionView<ViewT> view) {
+ return new CreateFlinkPCollectionView<>(view);
+ }
+
+ @Override
+ public PCollectionView<ViewT> expand(PCollection<List<ElemT>> input) {
+ return view;
+ }
+ }
+}
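
All of these streaming view overrides funnel the side input through the Concatenate combiner above, which simply accumulates every element into one in-memory list; that is why the Javadoc warns that the input PCollection has to fit in memory. A standalone illustration of the accumulate/merge/extract contract, with plain lists standing in for the combiner's accumulators:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    public class ConcatenateContractSketch {
      public static void main(String[] args) {
        // createAccumulator() + addInput() on two separate bundles
        List<String> acc1 = new ArrayList<>(Arrays.asList("a", "b"));
        List<String> acc2 = new ArrayList<>(Arrays.asList("c"));

        // mergeAccumulators(): plain list concatenation
        List<String> merged = new ArrayList<>(acc1);
        merged.addAll(acc2);

        // extractOutput() is the identity, so the view sees the full list
        System.out.println(merged);   // [a, b, c]
      }
    }
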
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/PipelineTranslationOptimizer.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/PipelineTranslationOptimizer.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/PipelineTranslationOptimizer.java
new file mode 100644
index 0000000..3acc3ea
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/PipelineTranslationOptimizer.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import org.apache.beam.sdk.io.Read;
+import org.apache.beam.sdk.runners.TransformHierarchy;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.values.PValue;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Traverses the Pipeline to determine the {@link TranslationMode} for this pipeline.
+ */
+class PipelineTranslationOptimizer extends FlinkPipelineTranslator {
+
+ private static final Logger LOG = LoggerFactory.getLogger(PipelineTranslationOptimizer.class);
+
+ private TranslationMode translationMode;
+
+ private final FlinkPipelineOptions options;
+
+ public PipelineTranslationOptimizer(TranslationMode defaultMode, FlinkPipelineOptions options) {
+ this.translationMode = defaultMode;
+ this.options = options;
+ }
+
+ public TranslationMode getTranslationMode() {
+
+ // a user-specified streaming option overrides the detected translation mode
+ if (options.isStreaming()) {
+ return TranslationMode.STREAMING;
+ }
+
+ return translationMode;
+ }
+
+ @Override
+ public CompositeBehavior enterCompositeTransform(TransformHierarchy.Node node) {
+ return CompositeBehavior.ENTER_TRANSFORM;
+ }
+
+ @Override
+ public void leaveCompositeTransform(TransformHierarchy.Node node) {}
+
+ @Override
+ public void visitPrimitiveTransform(TransformHierarchy.Node node) {
+ Class<? extends PTransform> transformClass = node.getTransform().getClass();
+ if (transformClass == Read.Unbounded.class) {
+ LOG.info("Found {}. Switching to streaming execution.", transformClass);
+ translationMode = TranslationMode.STREAMING;
+ }
+ }
+
+ @Override
+ public void visitValue(PValue value, TransformHierarchy.Node producer) {}
+}
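
The optimizer is a pipeline visitor: a single Read.Unbounded anywhere in the graph flips the detected mode to STREAMING, and the streaming flag on the options wins over whatever the traversal found. A rough sketch of how a runner might drive it (assuming FlinkPipelineTranslator is a Pipeline.PipelineVisitor, which is what the enter/leave/visit callbacks above imply):

    PipelineTranslationOptimizer optimizer =
        new PipelineTranslationOptimizer(TranslationMode.BATCH, options);
    pipeline.traverseTopologically(optimizer);   // any Read.Unbounded switches the mode
    if (optimizer.getTranslationMode() == TranslationMode.STREAMING) {
      // translate against a StreamExecutionEnvironment with the streaming translators
    } else {
      // translate against a batch ExecutionEnvironment with the batch translators
    }
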
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/TestFlinkRunner.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/TestFlinkRunner.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/TestFlinkRunner.java
new file mode 100644
index 0000000..8f50105
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/TestFlinkRunner.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.Pipeline.PipelineExecutionException;
+import org.apache.beam.sdk.PipelineResult;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.options.PipelineOptionsValidator;
+import org.apache.beam.sdk.runners.PipelineRunner;
+import org.apache.beam.sdk.util.UserCodeException;
+
+/**
+ * Test Flink runner.
+ */
+public class TestFlinkRunner extends PipelineRunner<PipelineResult> {
+
+ private FlinkRunner delegate;
+
+ private TestFlinkRunner(FlinkPipelineOptions options) {
+ // We use [auto] for testing since this will make it pick up the Testing ExecutionEnvironment
+ options.setFlinkMaster("[auto]");
+ this.delegate = FlinkRunner.fromOptions(options);
+ }
+
+ public static TestFlinkRunner fromOptions(PipelineOptions options) {
+ FlinkPipelineOptions flinkOptions =
+ PipelineOptionsValidator.validate(FlinkPipelineOptions.class, options);
+ return new TestFlinkRunner(flinkOptions);
+ }
+
+ public static TestFlinkRunner create(boolean streaming) {
+ FlinkPipelineOptions flinkOptions = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
+ flinkOptions.setRunner(TestFlinkRunner.class);
+ flinkOptions.setStreaming(streaming);
+ return TestFlinkRunner.fromOptions(flinkOptions);
+ }
+
+ @Override
+ public PipelineResult run(Pipeline pipeline) {
+ try {
+ return delegate.run(pipeline);
+ } catch (Throwable t) {
+ // Special case hack to pull out assertion errors from PAssert; instead there should
+ // probably be a better story along the lines of UserCodeException.
+ UserCodeException innermostUserCodeException = null;
+ Throwable current = t;
+ for (; current.getCause() != null; current = current.getCause()) {
+ if (current instanceof UserCodeException) {
+ innermostUserCodeException = ((UserCodeException) current);
+ }
+ }
+ if (innermostUserCodeException != null) {
+ current = innermostUserCodeException.getCause();
+ }
+ if (current instanceof AssertionError) {
+ throw (AssertionError) current;
+ }
+ throw new PipelineExecutionException(current);
+ }
+ }
+
+ public PipelineOptions getPipelineOptions() {
+ return delegate.getPipelineOptions();
+ }
+}
+
+
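
The unwrapping loop above means a failed PAssert inside user code resurfaces as a plain AssertionError from run(), which is what JUnit-style tests expect. A minimal usage sketch (the test pipeline and class name are made up for illustration):

    import org.apache.beam.runners.flink.TestFlinkRunner;
    import org.apache.beam.sdk.Pipeline;
    import org.apache.beam.sdk.options.PipelineOptions;
    import org.apache.beam.sdk.testing.PAssert;
    import org.apache.beam.sdk.transforms.Create;

    public class TestFlinkRunnerSketch {
      public static void main(String[] args) {
        // create(false) -> batch mode against the local [auto] Flink environment
        PipelineOptions options = TestFlinkRunner.create(false).getPipelineOptions();
        Pipeline p = Pipeline.create(options);
        PAssert.that(p.apply(Create.of(1, 2, 3))).containsInAnyOrder(1, 2, 3);
        p.run();   // a failing PAssert would be rethrown here as an AssertionError
      }
    }
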
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/TranslationMode.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/TranslationMode.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/TranslationMode.java
new file mode 100644
index 0000000..ad54750
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/TranslationMode.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+/**
+ * The translation mode of the Beam Pipeline.
+ */
+enum TranslationMode {
+
+ /** Uses the batch mode of Flink. */
+ BATCH,
+
+ /** Uses the streaming mode of Flink. */
+ STREAMING
+
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/package-info.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/package-info.java
new file mode 100644
index 0000000..57f1e59
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Internal implementation of the Beam runner for Apache Flink.
+ */
+package org.apache.beam.runners.flink;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAggregatorFactory.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAggregatorFactory.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAggregatorFactory.java
new file mode 100644
index 0000000..fb2493b
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAggregatorFactory.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.functions;
+
+import org.apache.beam.runners.core.AggregatorFactory;
+import org.apache.beam.runners.core.ExecutionContext;
+import org.apache.beam.runners.flink.translation.wrappers.SerializableFnAggregatorWrapper;
+import org.apache.beam.sdk.transforms.Aggregator;
+import org.apache.beam.sdk.transforms.Combine;
+import org.apache.flink.api.common.functions.RuntimeContext;
+
+/**
+ * An {@link AggregatorFactory} for the Flink Batch Runner.
+ */
+public class FlinkAggregatorFactory implements AggregatorFactory {
+
+ private final RuntimeContext runtimeContext;
+
+ public FlinkAggregatorFactory(RuntimeContext runtimeContext) {
+ this.runtimeContext = runtimeContext;
+ }
+
+ @Override
+ public <InputT, AccumT, OutputT> Aggregator<InputT, OutputT> createAggregatorForDoFn(
+ Class<?> fnClass, ExecutionContext.StepContext stepContext, String aggregatorName,
+ Combine.CombineFn<InputT, AccumT, OutputT> combine) {
+ @SuppressWarnings("unchecked")
+ SerializableFnAggregatorWrapper<InputT, OutputT> result =
+ (SerializableFnAggregatorWrapper<InputT, OutputT>)
+ runtimeContext.getAccumulator(aggregatorName);
+
+ if (result == null) {
+ result = new SerializableFnAggregatorWrapper<>(combine);
+ runtimeContext.addAccumulator(aggregatorName, result);
+ }
+ return result;
+ }
+}
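
The factory's only real job is a get-or-create lookup against Flink's accumulator registry, so every request for the same aggregator name within a task shares one accumulator instance. The same pattern with a stock Flink LongCounter (purely illustrative; the real factory wraps the Beam CombineFn instead):

    import org.apache.flink.api.common.accumulators.LongCounter;
    import org.apache.flink.api.common.functions.RuntimeContext;

    public class AccumulatorLookupSketch {
      // Get-or-create against the Flink accumulator registry, mirroring
      // createAggregatorForDoFn() above but with a built-in accumulator type.
      static LongCounter getOrCreate(RuntimeContext ctx, String name) {
        LongCounter counter = (LongCounter) ctx.<Long, Long>getAccumulator(name);
        if (counter == null) {
          counter = new LongCounter();
          ctx.addAccumulator(name, counter);
        }
        return counter;
      }
    }
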
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAssignContext.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAssignContext.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAssignContext.java
new file mode 100644
index 0000000..447b1e5
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAssignContext.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.functions;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+import com.google.common.collect.Iterables;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.WindowFn;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.joda.time.Instant;
+
+/**
+ * {@link org.apache.beam.sdk.transforms.windowing.WindowFn.AssignContext} for
+ * Flink functions.
+ */
+class FlinkAssignContext<InputT, W extends BoundedWindow>
+ extends WindowFn<InputT, W>.AssignContext {
+ private final WindowedValue<InputT> value;
+
+ FlinkAssignContext(WindowFn<InputT, W> fn, WindowedValue<InputT> value) {
+ fn.super();
+ checkArgument(
+ Iterables.size(value.getWindows()) == 1,
+ String.format(
+ "%s passed to window assignment must be in a single window, but it was in %s: %s",
+ WindowedValue.class.getSimpleName(),
+ Iterables.size(value.getWindows()),
+ value.getWindows()));
+ this.value = value;
+ }
+
+ @Override
+ public InputT element() {
+ return value.getValue();
+ }
+
+ @Override
+ public Instant timestamp() {
+ return value.getTimestamp();
+ }
+
+ @Override
+ public BoundedWindow window() {
+ return Iterables.getOnlyElement(value.getWindows());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAssignWindows.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAssignWindows.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAssignWindows.java
new file mode 100644
index 0000000..c3a5095
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkAssignWindows.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.functions;
+
+import java.util.Collection;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.WindowFn;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.flink.api.common.functions.FlatMapFunction;
+import org.apache.flink.util.Collector;
+
+/**
+ * Flink {@link FlatMapFunction} for implementing
+ * {@link org.apache.beam.sdk.transforms.windowing.Window.Assign}.
+ */
+public class FlinkAssignWindows<T, W extends BoundedWindow>
+ implements FlatMapFunction<WindowedValue<T>, WindowedValue<T>> {
+
+ private final WindowFn<T, W> windowFn;
+
+ public FlinkAssignWindows(WindowFn<T, W> windowFn) {
+ this.windowFn = windowFn;
+ }
+
+ @Override
+ public void flatMap(
+ WindowedValue<T> input, Collector<WindowedValue<T>> collector) throws Exception {
+ Collection<W> windows = windowFn.assignWindows(new FlinkAssignContext<>(windowFn, input));
+ for (W window: windows) {
+ collector.collect(
+ WindowedValue.of(input.getValue(), input.getTimestamp(), window, input.getPane()));
+ }
+ }
+}
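
Because flatMap() emits the element once per assigned window, a single input fans out under overlapping WindowFns such as sliding windows. A standalone sketch of that fan-out, with the window arithmetic spelled out for a 30-unit window sliding every 10 units (the numbers are chosen only for illustration):

    import java.util.ArrayList;
    import java.util.List;

    public class SlidingFanOutSketch {
      public static void main(String[] args) {
        long size = 30, period = 10, timestamp = 25;
        List<String> assigned = new ArrayList<>();
        // Walk backwards from the last window that starts at or before the timestamp.
        long lastStart = timestamp - (timestamp % period);
        for (long start = lastStart; start > timestamp - size; start -= period) {
          assigned.add("[" + start + ", " + (start + size) + ")");
        }
        // FlinkAssignWindows would collect one WindowedValue per entry below.
        System.out.println(assigned);   // [[20, 50), [10, 40), [0, 30)]
      }
    }
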
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkDoFnFunction.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkDoFnFunction.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkDoFnFunction.java
new file mode 100644
index 0000000..51582af
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/functions/FlinkDoFnFunction.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.functions;
+
+import java.util.Collections;
+import java.util.Map;
+import org.apache.beam.runners.core.DoFnRunner;
+import org.apache.beam.runners.core.DoFnRunners;
+import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.join.RawUnionValue;
+import org.apache.beam.sdk.transforms.reflect.DoFnInvoker;
+import org.apache.beam.sdk.transforms.reflect.DoFnInvokers;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.beam.sdk.values.TupleTag;
+import org.apache.flink.api.common.functions.RichMapPartitionFunction;
+import org.apache.flink.api.common.functions.RuntimeContext;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.util.Collector;
+
+/**
+ * Encapsulates a {@link DoFn}
+ * inside a Flink {@link org.apache.flink.api.common.functions.RichMapPartitionFunction}.
+ *
+ * <p>We get a mapping from {@link org.apache.beam.sdk.values.TupleTag} to output index
+ * and must tag every emitted element with its output index. A downstream filter then
+ * drops the elements that do not belong to a given output.
+ */
+public class FlinkDoFnFunction<InputT, OutputT>
+ extends RichMapPartitionFunction<WindowedValue<InputT>, WindowedValue<OutputT>> {
+
+ private final SerializedPipelineOptions serializedOptions;
+
+ private final DoFn<InputT, OutputT> doFn;
+ private final Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs;
+
+ private final WindowingStrategy<?, ?> windowingStrategy;
+
+ private final Map<TupleTag<?>, Integer> outputMap;
+ private final TupleTag<OutputT> mainOutputTag;
+
+ private transient DoFnInvoker<InputT, OutputT> doFnInvoker;
+
+ public FlinkDoFnFunction(
+ DoFn<InputT, OutputT> doFn,
+ WindowingStrategy<?, ?> windowingStrategy,
+ Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputs,
+ PipelineOptions options,
+ Map<TupleTag<?>, Integer> outputMap,
+ TupleTag<OutputT> mainOutputTag) {
+
+ this.doFn = doFn;
+ this.sideInputs = sideInputs;
+ this.serializedOptions = new SerializedPipelineOptions(options);
+ this.windowingStrategy = windowingStrategy;
+ this.outputMap = outputMap;
+ this.mainOutputTag = mainOutputTag;
+
+ }
+
+ @Override
+ public void mapPartition(
+ Iterable<WindowedValue<InputT>> values,
+ Collector<WindowedValue<OutputT>> out) throws Exception {
+
+ RuntimeContext runtimeContext = getRuntimeContext();
+
+ DoFnRunners.OutputManager outputManager;
+ if (outputMap == null) {
+ outputManager = new FlinkDoFnFunction.DoFnOutputManager(out);
+ } else {
+ // the DoFn has additional outputs; route each element by its union-tag index
+ outputManager =
+ new FlinkDoFnFunction.MultiDoFnOutputManager((Collector) out, outputMap);
+ }
+
+ DoFnRunner<InputT, OutputT> doFnRunner = DoFnRunners.simpleRunner(
+ serializedOptions.getPipelineOptions(), doFn,
+ new FlinkSideInputReader(sideInputs, runtimeContext),
+ outputManager,
+ mainOutputTag,
+ // see SimpleDoFnRunner, just use it to limit number of additional outputs
+ Collections.<TupleTag<?>>emptyList(),
+ new FlinkNoOpStepContext(),
+ new FlinkAggregatorFactory(runtimeContext),
+ windowingStrategy);
+
+ doFnRunner.startBundle();
+
+ for (WindowedValue<InputT> value : values) {
+ doFnRunner.processElement(value);
+ }
+
+ doFnRunner.finishBundle();
+ }
+
+ @Override
+ public void open(Configuration parameters) throws Exception {
+ doFnInvoker = DoFnInvokers.invokerFor(doFn);
+ doFnInvoker.invokeSetup();
+ }
+
+ @Override
+ public void close() throws Exception {
+ doFnInvoker.invokeTeardown();
+ }
+
+ static class DoFnOutputManager
+ implements DoFnRunners.OutputManager {
+
+ private Collector collector;
+
+ DoFnOutputManager(Collector collector) {
+ this.collector = collector;
+ }
+
+ @Override
+ @SuppressWarnings("unchecked")
+ public <T> void output(TupleTag<T> tag, WindowedValue<T> output) {
+ collector.collect(output);
+ }
+ }
+
+ static class MultiDoFnOutputManager
+ implements DoFnRunners.OutputManager {
+
+ private Collector<WindowedValue<RawUnionValue>> collector;
+ private Map<TupleTag<?>, Integer> outputMap;
+
+ MultiDoFnOutputManager(Collector<WindowedValue<RawUnionValue>> collector,
+ Map<TupleTag<?>, Integer> outputMap) {
+ this.collector = collector;
+ this.outputMap = outputMap;
+ }
+
+ @Override
+ public <T> void output(TupleTag<T> tag, WindowedValue<T> output) {
+ collector.collect(WindowedValue.of(new RawUnionValue(outputMap.get(tag), output.getValue()),
+ output.getTimestamp(), output.getWindows(), output.getPane()));
+ }
+ }
+
+}
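
A small sketch of the tagging behavior described in the class Javadoc; the tag names, indices, and printing collector below are made up for illustration, and the class sits in the same package only because MultiDoFnOutputManager is package-private:

    package org.apache.beam.runners.flink.translation.functions;

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.beam.sdk.transforms.join.RawUnionValue;
    import org.apache.beam.sdk.util.WindowedValue;
    import org.apache.beam.sdk.values.TupleTag;
    import org.apache.flink.util.Collector;

    public class MultiOutputTaggingSketch {
      public static void main(String[] args) {
        TupleTag<String> mainTag = new TupleTag<>("main");
        TupleTag<String> errorTag = new TupleTag<>("errors");

        // The translator builds this map; in this sketch index 0 is the main output.
        Map<TupleTag<?>, Integer> outputMap = new HashMap<>();
        outputMap.put(mainTag, 0);
        outputMap.put(errorTag, 1);

        Collector<WindowedValue<RawUnionValue>> collector =
            new Collector<WindowedValue<RawUnionValue>>() {
              @Override
              public void collect(WindowedValue<RawUnionValue> value) {
                // A downstream filter keys off getUnionTag() to route elements to the right output.
                System.out.println(
                    value.getValue().getUnionTag() + " -> " + value.getValue().getValue());
              }

              @Override
              public void close() {}
            };

        FlinkDoFnFunction.MultiDoFnOutputManager outputManager =
            new FlinkDoFnFunction.MultiDoFnOutputManager(collector, outputMap);

        // Emitting to errorTag wraps the element in a RawUnionValue with union tag 1.
        outputManager.output(errorTag, WindowedValue.valueInGlobalWindow("boom"));
      }
    }
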
[44/50] [abbrv] beam git commit: Ensure all Read outputs are consumed
in Dataflow
Posted by dh...@apache.org.
Ensure all Read outputs are consumed in Dataflow
Apply a no-op ParDo to any PTransform that is not consumed.
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/418c304d
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/418c304d
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/418c304d
Branch: refs/heads/DSL_SQL
Commit: 418c304dbff1ce8c176d08c890780ec97245aaae
Parents: 714fdd2
Author: Thomas Groh <tg...@google.com>
Authored: Tue Apr 18 17:25:59 2017 -0700
Committer: Thomas Groh <tg...@google.com>
Committed: Wed Apr 19 10:53:30 2017 -0700
----------------------------------------------------------------------
.../core/construction/UnconsumedReads.java | 72 +++++++++++++
.../core/construction/UnconsumedReadsTest.java | 105 +++++++++++++++++++
.../beam/runners/dataflow/DataflowRunner.java | 4 +
.../runners/dataflow/DataflowRunnerTest.java | 24 +++++
4 files changed, 205 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/418c304d/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/UnconsumedReads.java
----------------------------------------------------------------------
diff --git a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/UnconsumedReads.java b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/UnconsumedReads.java
new file mode 100644
index 0000000..c191eeb
--- /dev/null
+++ b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/UnconsumedReads.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.beam.runners.core.construction;
+
+import java.util.HashSet;
+import java.util.Set;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.Pipeline.PipelineVisitor;
+import org.apache.beam.sdk.io.Read;
+import org.apache.beam.sdk.runners.TransformHierarchy.Node;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PValue;
+
+/**
+ * Utilities for ensuring that all {@link Read} {@link PTransform PTransforms} are consumed by some
+ * {@link PTransform}.
+ */
+public class UnconsumedReads {
+ public static void ensureAllReadsConsumed(Pipeline pipeline) {
+ final Set<PCollection<?>> unconsumed = new HashSet<>();
+ pipeline.traverseTopologically(
+ new PipelineVisitor.Defaults() {
+ @Override
+ public void visitPrimitiveTransform(Node node) {
+ unconsumed.removeAll(node.getInputs().values());
+ }
+
+ @Override
+ public void visitValue(PValue value, Node producer) {
+ if (producer.getTransform() instanceof Read.Bounded
+ || producer.getTransform() instanceof Read.Unbounded) {
+ unconsumed.add((PCollection<?>) value);
+ }
+ }
+ });
+ int i = 0;
+ for (PCollection<?> unconsumedPCollection : unconsumed) {
+ consume(unconsumedPCollection, i);
+ i++;
+ }
+ }
+
+ private static <T> void consume(PCollection<T> unconsumedPCollection, int uniq) {
+ // Multiple applications should never break due to stable unique names.
+ String uniqueName = "DropInputs" + (uniq == 0 ? "" : uniq);
+ unconsumedPCollection.apply(uniqueName, ParDo.of(new NoOpDoFn<T>()));
+ }
+
+ private static class NoOpDoFn<T> extends DoFn<T, T> {
+ @ProcessElement
+ public void doNothing(ProcessContext context) {}
+ }
+}
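
A minimal usage sketch (the pipeline below is illustrative, not from the commit): a Read whose output is never consumed gets a synthetic "DropInputs" ParDo attached, so later replacement of Read transforms always finds a downstream consumer to repair.

    import org.apache.beam.runners.core.construction.UnconsumedReads;
    import org.apache.beam.sdk.Pipeline;
    import org.apache.beam.sdk.io.CountingSource;
    import org.apache.beam.sdk.io.Read;
    import org.apache.beam.sdk.options.PipelineOptionsFactory;

    public class UnconsumedReadsSketch {
      public static void main(String[] args) {
        Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.create());

        // The output of this Read is deliberately left unconsumed.
        pipeline.apply(Read.from(CountingSource.upTo(10L)));

        // Attaches a no-op "DropInputs" ParDo to the dangling PCollection.
        UnconsumedReads.ensureAllReadsConsumed(pipeline);
      }
    }
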
http://git-wip-us.apache.org/repos/asf/beam/blob/418c304d/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/UnconsumedReadsTest.java
----------------------------------------------------------------------
diff --git a/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/UnconsumedReadsTest.java b/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/UnconsumedReadsTest.java
new file mode 100644
index 0000000..1966a93
--- /dev/null
+++ b/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/UnconsumedReadsTest.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.beam.runners.core.construction;
+
+import static org.junit.Assert.assertThat;
+
+import java.util.HashSet;
+import java.util.Set;
+import org.apache.beam.sdk.Pipeline.PipelineVisitor;
+import org.apache.beam.sdk.io.CountingSource;
+import org.apache.beam.sdk.io.Read;
+import org.apache.beam.sdk.io.Read.Bounded;
+import org.apache.beam.sdk.io.Read.Unbounded;
+import org.apache.beam.sdk.runners.TransformHierarchy.Node;
+import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.transforms.Flatten;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PCollectionList;
+import org.apache.beam.sdk.values.PValue;
+import org.hamcrest.Matchers;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/**
+ * Tests for {@link UnconsumedReads}.
+ */
+@RunWith(JUnit4.class)
+public class UnconsumedReadsTest {
+ @Rule public TestPipeline pipeline = TestPipeline.create().enableAbandonedNodeEnforcement(false);
+
+ @Test
+ public void matcherProducesUnconsumedValueBoundedRead() {
+ Bounded<Long> transform = Read.from(CountingSource.upTo(20L));
+ PCollection<Long> output = pipeline.apply(transform);
+ UnconsumedReads.ensureAllReadsConsumed(pipeline);
+ validateConsumed();
+ }
+
+ @Test
+ public void matcherProducesUnconsumedValueUnboundedRead() {
+ Unbounded<Long> transform = Read.from(CountingSource.unbounded());
+ PCollection<Long> output = pipeline.apply(transform);
+ UnconsumedReads.ensureAllReadsConsumed(pipeline);
+ validateConsumed();
+ }
+
+ @Test
+ public void doesNotConsumeAlreadyConsumedRead() {
+ Unbounded<Long> transform = Read.from(CountingSource.unbounded());
+ final PCollection<Long> output = pipeline.apply(transform);
+ final Flatten.PCollections<Long> consumer = Flatten.<Long>pCollections();
+ PCollectionList.of(output).apply(consumer);
+ UnconsumedReads.ensureAllReadsConsumed(pipeline);
+ pipeline.traverseTopologically(
+ new PipelineVisitor.Defaults() {
+ @Override
+ public void visitPrimitiveTransform(Node node) {
+ // The output should only be consumed by a single consumer
+ if (node.getInputs().values().contains(output)) {
+ assertThat(node.getTransform(), Matchers.<PTransform<?, ?>>is(consumer));
+ }
+ }
+ });
+ }
+
+ private void validateConsumed() {
+ final Set<PValue> consumedOutputs = new HashSet<PValue>();
+ final Set<PValue> allReadOutputs = new HashSet<PValue>();
+ pipeline.traverseTopologically(
+ new PipelineVisitor.Defaults() {
+ @Override
+ public void visitPrimitiveTransform(Node node) {
+ consumedOutputs.addAll(node.getInputs().values());
+ }
+
+ @Override
+ public void visitValue(PValue value, Node producer) {
+ if (producer.getTransform() instanceof Read.Bounded
+ || producer.getTransform() instanceof Read.Unbounded) {
+ allReadOutputs.add(value);
+ }
+ }
+ });
+ assertThat(consumedOutputs, Matchers.hasItems(allReadOutputs.toArray(new PValue[0])));
+ }
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/418c304d/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java
----------------------------------------------------------------------
diff --git a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java
index 4eec6b8..2912fa7 100644
--- a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java
+++ b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java
@@ -65,6 +65,7 @@ import org.apache.beam.runners.core.construction.PTransformReplacements;
import org.apache.beam.runners.core.construction.ReplacementOutputs;
import org.apache.beam.runners.core.construction.SingleInputOutputOverrideFactory;
import org.apache.beam.runners.core.construction.UnboundedReadFromBoundedSource;
+import org.apache.beam.runners.core.construction.UnconsumedReads;
import org.apache.beam.runners.dataflow.BatchViewOverrides.BatchCombineGloballyAsSingletonViewFactory;
import org.apache.beam.runners.dataflow.DataflowPipelineTranslator.JobSpecification;
import org.apache.beam.runners.dataflow.StreamingViewOverrides.StreamingCreatePCollectionViewFactory;
@@ -690,6 +691,9 @@ public class DataflowRunner extends PipelineRunner<DataflowPipelineJob> {
@VisibleForTesting
void replaceTransforms(Pipeline pipeline) {
boolean streaming = options.isStreaming() || containsUnboundedPCollection(pipeline);
+ // Ensure all outputs of all reads are consumed before potentially replacing any
+ // Read PTransforms
+ UnconsumedReads.ensureAllReadsConsumed(pipeline);
pipeline.replaceAll(getOverrides(streaming));
}
http://git-wip-us.apache.org/repos/asf/beam/blob/418c304d/runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerTest.java
----------------------------------------------------------------------
diff --git a/runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerTest.java b/runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerTest.java
index 79a96e7..36704bc 100644
--- a/runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerTest.java
+++ b/runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerTest.java
@@ -23,6 +23,7 @@ import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasItem;
import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.startsWith;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
@@ -57,6 +58,7 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
+import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Pattern;
import org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
@@ -65,11 +67,13 @@ import org.apache.beam.sdk.Pipeline.PipelineVisitor;
import org.apache.beam.sdk.coders.BigEndianIntegerCoder;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.io.TextIO;
+import org.apache.beam.sdk.io.TextIO.Read;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptions.CheckEnabled;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.runners.TransformHierarchy;
+import org.apache.beam.sdk.runners.TransformHierarchy.Node;
import org.apache.beam.sdk.testing.ExpectedLogs;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Create;
@@ -331,6 +335,26 @@ public class DataflowRunnerTest {
.apply(TextIO.Write.to(options.getOutput()).withoutValidation());
}
+ /**
+ * Tests that all reads are consumed by at least one {@link PTransform}.
+ */
+ @Test
+ public void testUnconsumedReads() throws IOException {
+ DataflowPipelineOptions dataflowOptions = buildPipelineOptions();
+ RuntimeTestOptions options = dataflowOptions.as(RuntimeTestOptions.class);
+ Pipeline p = buildDataflowPipeline(dataflowOptions);
+ PCollection<String> unconsumed = p.apply(Read.from(options.getInput()).withoutValidation());
+ DataflowRunner.fromOptions(dataflowOptions).replaceTransforms(p);
+ final AtomicBoolean unconsumedSeenAsInput = new AtomicBoolean();
+ p.traverseTopologically(new PipelineVisitor.Defaults() {
+ @Override
+ public void visitPrimitiveTransform(Node node) {
+ unconsumedSeenAsInput.set(true);
+ }
+ });
+ assertThat(unconsumedSeenAsInput.get(), is(true));
+ }
+
@Test
public void testRunReturnDifferentRequestId() throws IOException {
DataflowPipelineOptions options = buildPipelineOptions();
[49/50] [abbrv] beam git commit: This closes #2592
Posted by dh...@apache.org.
This closes #2592
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/19ae8776
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/19ae8776
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/19ae8776
Branch: refs/heads/DSL_SQL
Commit: 19ae8776261a5a78044091d9172223244a2b8042
Parents: 391fb77 546aa61
Author: Dan Halperin <dh...@google.com>
Authored: Wed Apr 19 12:07:37 2017 -0700
Committer: Dan Halperin <dh...@google.com>
Committed: Wed Apr 19 12:07:37 2017 -0700
----------------------------------------------------------------------
runners/apex/pom.xml | 1 +
runners/direct-java/pom.xml | 1 +
runners/flink/pom.xml | 2 ++
runners/google-cloud-dataflow-java/pom.xml | 43 +++++++++++++++++++++++++
runners/pom.xml | 40 -----------------------
runners/spark/pom.xml | 1 +
6 files changed, 48 insertions(+), 40 deletions(-)
----------------------------------------------------------------------
[21/50] [abbrv] beam git commit: [BEAM-1914] This closes #2558
Posted by dh...@apache.org.
[BEAM-1914] This closes #2558
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/470808c0
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/470808c0
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/470808c0
Branch: refs/heads/DSL_SQL
Commit: 470808c06fc10ad545712d6b1831530e3d5313ad
Parents: 57929fb d0c0a60
Author: Jean-Baptiste Onofré <jb...@apache.org>
Authored: Wed Apr 19 10:58:42 2017 +0200
Committer: Jean-Baptiste Onofré <jb...@apache.org>
Committed: Wed Apr 19 10:58:42 2017 +0200
----------------------------------------------------------------------
.../apache/beam/sdk/io/CompressedSource.java | 4 +-
.../main/java/org/apache/beam/sdk/io/XmlIO.java | 477 +++++++++++++++++++
.../java/org/apache/beam/sdk/io/XmlSink.java | 226 ++-------
.../java/org/apache/beam/sdk/io/XmlSource.java | 191 +-------
.../sdk/transforms/display/DisplayData.java | 6 +
.../org/apache/beam/sdk/io/XmlSinkTest.java | 89 ++--
.../org/apache/beam/sdk/io/XmlSourceTest.java | 248 ++++++----
.../sdk/transforms/display/DisplayDataTest.java | 17 +
8 files changed, 740 insertions(+), 518 deletions(-)
----------------------------------------------------------------------
[27/50] [abbrv] beam git commit: [BEAM-1994] Remove Flink examples
package
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/BoundedSourceWrapper.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/BoundedSourceWrapper.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/BoundedSourceWrapper.java
new file mode 100644
index 0000000..2ed5024
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/BoundedSourceWrapper.java
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming.io;
+
+import com.google.common.annotations.VisibleForTesting;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
+import org.apache.beam.sdk.io.BoundedSource;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
+import org.apache.beam.sdk.transforms.windowing.PaneInfo;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.flink.api.common.functions.StoppableFunction;
+import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
+import org.apache.flink.streaming.api.watermark.Watermark;
+import org.joda.time.Instant;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Wrapper for executing {@link BoundedSource BoundedSources} as a Flink Source.
+ */
+public class BoundedSourceWrapper<OutputT>
+ extends RichParallelSourceFunction<WindowedValue<OutputT>>
+ implements StoppableFunction {
+
+ private static final Logger LOG = LoggerFactory.getLogger(BoundedSourceWrapper.class);
+
+ /**
+ * Keep the options so that we can initialize the readers.
+ */
+ private final SerializedPipelineOptions serializedOptions;
+
+ /**
+ * The split sources. We split them in the constructor to ensure that all parallel
+ * instances of the source agree on the same splits.
+ */
+ private List<? extends BoundedSource<OutputT>> splitSources;
+
+ /**
+ * Make it a field so that we can access it in {@link #close()}.
+ */
+ private transient List<BoundedSource.BoundedReader<OutputT>> readers;
+
+ /**
+ * Initialize here and not in run() to prevent races where we cancel a job before run() is
+ * ever called or run() is called after cancel().
+ */
+ private volatile boolean isRunning = true;
+
+ @SuppressWarnings("unchecked")
+ public BoundedSourceWrapper(
+ PipelineOptions pipelineOptions,
+ BoundedSource<OutputT> source,
+ int parallelism) throws Exception {
+ this.serializedOptions = new SerializedPipelineOptions(pipelineOptions);
+
+ long desiredBundleSize = source.getEstimatedSizeBytes(pipelineOptions) / parallelism;
+
+ // get the splits early. we assume that the generated splits are stable,
+ // this is necessary so that the mapping of state to source is correct
+ // when restoring
+ splitSources = source.split(desiredBundleSize, pipelineOptions);
+ }
+
+ @Override
+ public void run(SourceContext<WindowedValue<OutputT>> ctx) throws Exception {
+
+ // figure out which split sources we're responsible for
+ int subtaskIndex = getRuntimeContext().getIndexOfThisSubtask();
+ int numSubtasks = getRuntimeContext().getNumberOfParallelSubtasks();
+
+ List<BoundedSource<OutputT>> localSources = new ArrayList<>();
+
+ for (int i = 0; i < splitSources.size(); i++) {
+ if (i % numSubtasks == subtaskIndex) {
+ localSources.add(splitSources.get(i));
+ }
+ }
+
+ LOG.info("Bounded Flink Source {}/{} is reading from sources: {}",
+ subtaskIndex,
+ numSubtasks,
+ localSources);
+
+ readers = new ArrayList<>();
+ // initialize readers from scratch
+ for (BoundedSource<OutputT> source : localSources) {
+ readers.add(source.createReader(serializedOptions.getPipelineOptions()));
+ }
+
+ if (readers.size() == 1) {
+ // the easy case, we just read from one reader
+ BoundedSource.BoundedReader<OutputT> reader = readers.get(0);
+
+ boolean dataAvailable = reader.start();
+ if (dataAvailable) {
+ emitElement(ctx, reader);
+ }
+
+ while (isRunning) {
+ dataAvailable = reader.advance();
+
+ if (dataAvailable) {
+ emitElement(ctx, reader);
+ } else {
+ break;
+ }
+ }
+ } else {
+ // a bit more complicated, we are responsible for several readers
+ // loop through them and sleep if none of them had any data
+
+ int currentReader = 0;
+
+ // start each reader and emit data if immediately available
+ for (BoundedSource.BoundedReader<OutputT> reader : readers) {
+ boolean dataAvailable = reader.start();
+ if (dataAvailable) {
+ emitElement(ctx, reader);
+ }
+ }
+
+ // a flag telling us whether any of the readers had data
+ // if no reader had data, sleep for bit
+ boolean hadData = false;
+ while (isRunning && !readers.isEmpty()) {
+ BoundedSource.BoundedReader<OutputT> reader = readers.get(currentReader);
+ boolean dataAvailable = reader.advance();
+
+ if (dataAvailable) {
+ emitElement(ctx, reader);
+ hadData = true;
+ } else {
+ readers.remove(currentReader);
+ currentReader--;
+ if (readers.isEmpty()) {
+ break;
+ }
+ }
+
+ currentReader = (currentReader + 1) % readers.size();
+ if (currentReader == 0 && !hadData) {
+ Thread.sleep(50);
+ } else if (currentReader == 0) {
+ hadData = false;
+ }
+ }
+
+ }
+
+ // emit final Long.MAX_VALUE watermark, just to be sure
+ ctx.emitWatermark(new Watermark(Long.MAX_VALUE));
+ }
+
+ /**
+ * Emit the current element from the given Reader. The reader is guaranteed to have data.
+ */
+ private void emitElement(
+ SourceContext<WindowedValue<OutputT>> ctx,
+ BoundedSource.BoundedReader<OutputT> reader) {
+ // make sure that reader state update and element emission are atomic
+ // with respect to snapshots
+ synchronized (ctx.getCheckpointLock()) {
+
+ OutputT item = reader.getCurrent();
+ Instant timestamp = reader.getCurrentTimestamp();
+
+ WindowedValue<OutputT> windowedValue =
+ WindowedValue.of(item, timestamp, GlobalWindow.INSTANCE, PaneInfo.NO_FIRING);
+ ctx.collectWithTimestamp(windowedValue, timestamp.getMillis());
+ }
+ }
+
+ @Override
+ public void close() throws Exception {
+ super.close();
+ if (readers != null) {
+ for (BoundedSource.BoundedReader<OutputT> reader: readers) {
+ reader.close();
+ }
+ }
+ }
+
+ @Override
+ public void cancel() {
+ isRunning = false;
+ }
+
+ @Override
+ public void stop() {
+ this.isRunning = false;
+ }
+
+ /**
+ * Visible so that we can check this in tests. Must not be used for anything else.
+ */
+ @VisibleForTesting
+ public List<? extends BoundedSource<OutputT>> getSplitSources() {
+ return splitSources;
+ }
+}
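
A brief standalone sketch (the parallelism and split count are arbitrary) of the round-robin assignment used in run(): split i is read by the subtask whose index equals i modulo the number of subtasks.

    public class SplitAssignmentSketch {
      public static void main(String[] args) {
        int numSubtasks = 3;   // parallelism of the Flink source
        int numSplits = 7;     // pretend split() produced 7 shards

        for (int subtaskIndex = 0; subtaskIndex < numSubtasks; subtaskIndex++) {
          StringBuilder assigned = new StringBuilder();
          for (int i = 0; i < numSplits; i++) {
            if (i % numSubtasks == subtaskIndex) {   // the same test used in run()
              assigned.append(i).append(' ');
            }
          }
          System.out.println("subtask " + subtaskIndex + " reads splits: " + assigned);
        }
      }
    }
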
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSocketSource.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSocketSource.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSocketSource.java
new file mode 100644
index 0000000..910a33f
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSocketSource.java
@@ -0,0 +1,249 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming.io;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.util.Collections;
+import java.util.List;
+import java.util.NoSuchElementException;
+import javax.annotation.Nullable;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.io.UnboundedSource;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.joda.time.Instant;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An example unbounded Beam source that reads input from a socket.
+ * This is used mainly for testing and debugging.
+ */
+public class UnboundedSocketSource<CheckpointMarkT extends UnboundedSource.CheckpointMark>
+ extends UnboundedSource<String, CheckpointMarkT> {
+
+ private static final Coder<String> DEFAULT_SOCKET_CODER = StringUtf8Coder.of();
+
+ private static final long serialVersionUID = 1L;
+
+ private static final int DEFAULT_CONNECTION_RETRY_SLEEP = 500;
+
+ private static final int CONNECTION_TIMEOUT_TIME = 0;
+
+ private final String hostname;
+ private final int port;
+ private final char delimiter;
+ private final long maxNumRetries;
+ private final long delayBetweenRetries;
+
+ public UnboundedSocketSource(String hostname, int port, char delimiter, long maxNumRetries) {
+ this(hostname, port, delimiter, maxNumRetries, DEFAULT_CONNECTION_RETRY_SLEEP);
+ }
+
+ public UnboundedSocketSource(String hostname,
+ int port,
+ char delimiter,
+ long maxNumRetries,
+ long delayBetweenRetries) {
+ this.hostname = hostname;
+ this.port = port;
+ this.delimiter = delimiter;
+ this.maxNumRetries = maxNumRetries;
+ this.delayBetweenRetries = delayBetweenRetries;
+ }
+
+ public String getHostname() {
+ return this.hostname;
+ }
+
+ public int getPort() {
+ return this.port;
+ }
+
+ public char getDelimiter() {
+ return this.delimiter;
+ }
+
+ public long getMaxNumRetries() {
+ return this.maxNumRetries;
+ }
+
+ public long getDelayBetweenRetries() {
+ return this.delayBetweenRetries;
+ }
+
+ @Override
+ public List<? extends UnboundedSource<String, CheckpointMarkT>> split(
+ int desiredNumSplits,
+ PipelineOptions options) throws Exception {
+ return Collections.<UnboundedSource<String, CheckpointMarkT>>singletonList(this);
+ }
+
+ @Override
+ public UnboundedReader<String> createReader(PipelineOptions options,
+ @Nullable CheckpointMarkT checkpointMark) {
+ return new UnboundedSocketReader(this);
+ }
+
+ @Nullable
+ @Override
+ public Coder getCheckpointMarkCoder() {
+ // Flink and Dataflow have different checkpointing mechanisms.
+ // In our case we do not need a coder.
+ return null;
+ }
+
+ @Override
+ public void validate() {
+ checkArgument(port > 0 && port < 65536, "port is out of range");
+ checkArgument(maxNumRetries >= -1, "maxNumRetries must be zero or larger (num retries), "
+ + "or -1 (infinite retries)");
+ checkArgument(delayBetweenRetries >= 0, "delayBetweenRetries must be zero or positive");
+ }
+
+ @Override
+ public Coder getDefaultOutputCoder() {
+ return DEFAULT_SOCKET_CODER;
+ }
+
+ /**
+ * Unbounded socket reader.
+ */
+ public static class UnboundedSocketReader extends UnboundedSource.UnboundedReader<String> {
+
+ private static final Logger LOG = LoggerFactory.getLogger(UnboundedSocketReader.class);
+
+ private final UnboundedSocketSource source;
+
+ private Socket socket;
+ private BufferedReader reader;
+
+ private boolean isRunning;
+
+ private String currentRecord;
+
+ public UnboundedSocketReader(UnboundedSocketSource source) {
+ this.source = source;
+ }
+
+ private void openConnection() throws IOException {
+ this.socket = new Socket();
+ this.socket.connect(new InetSocketAddress(this.source.getHostname(), this.source.getPort()),
+ CONNECTION_TIMEOUT_TIME);
+ this.reader = new BufferedReader(new InputStreamReader(this.socket.getInputStream()));
+ this.isRunning = true;
+ }
+
+ @Override
+ public boolean start() throws IOException {
+ int attempt = 0;
+ while (!isRunning) {
+ try {
+ openConnection();
+ LOG.info("Connected to server socket " + this.source.getHostname() + ':'
+ + this.source.getPort());
+
+ return advance();
+ } catch (IOException e) {
+ LOG.info("Lost connection to server socket " + this.source.getHostname() + ':'
+ + this.source.getPort() + ". Retrying in "
+ + this.source.getDelayBetweenRetries() + " msecs...");
+
+ if (this.source.getMaxNumRetries() == -1 || attempt++ < this.source.getMaxNumRetries()) {
+ try {
+ Thread.sleep(this.source.getDelayBetweenRetries());
+ } catch (InterruptedException e1) {
+ e1.printStackTrace();
+ }
+ } else {
+ this.isRunning = false;
+ break;
+ }
+ }
+ }
+ LOG.error("Unable to connect to host " + this.source.getHostname()
+ + " : " + this.source.getPort());
+ return false;
+ }
+
+ @Override
+ public boolean advance() throws IOException {
+ final StringBuilder buffer = new StringBuilder();
+ int data;
+ while (isRunning && (data = reader.read()) != -1) {
+ // check if the string is complete
+ if (data != this.source.getDelimiter()) {
+ buffer.append((char) data);
+ } else {
+ if (buffer.length() > 0 && buffer.charAt(buffer.length() - 1) == '\r') {
+ buffer.setLength(buffer.length() - 1);
+ }
+ this.currentRecord = buffer.toString();
+ buffer.setLength(0);
+ return true;
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public byte[] getCurrentRecordId() throws NoSuchElementException {
+ return new byte[0];
+ }
+
+ @Override
+ public String getCurrent() throws NoSuchElementException {
+ return this.currentRecord;
+ }
+
+ @Override
+ public Instant getCurrentTimestamp() throws NoSuchElementException {
+ return Instant.now();
+ }
+
+ @Override
+ public void close() throws IOException {
+ this.reader.close();
+ this.socket.close();
+ this.isRunning = false;
+ LOG.info("Closed connection to server socket at " + this.source.getHostname() + ":"
+ + this.source.getPort() + ".");
+ }
+
+ @Override
+ public Instant getWatermark() {
+ return Instant.now();
+ }
+
+ @Override
+ public CheckpointMark getCheckpointMark() {
+ return null;
+ }
+
+ @Override
+ public UnboundedSource<String, ?> getCurrentSource() {
+ return this.source;
+ }
+ }
+}
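
A small configuration sketch (hostname and port are placeholders): the source reads '\n'-delimited records and retries the connection a bounded number of times, and validate() enforces the argument ranges checked above.

    import org.apache.beam.runners.flink.translation.wrappers.streaming.io.UnboundedSocketSource;
    import org.apache.beam.sdk.io.UnboundedSource;

    public class SocketSourceSketch {
      public static void main(String[] args) {
        // '\n'-delimited records, up to 3 reconnection attempts, default 500 ms back-off.
        UnboundedSocketSource<UnboundedSource.CheckpointMark> source =
            new UnboundedSocketSource<>("localhost", 9999, '\n', 3);

        // Throws IllegalArgumentException if the port is out of range, maxNumRetries < -1,
        // or delayBetweenRetries is negative.
        source.validate();
      }
    }
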
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSourceWrapper.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSourceWrapper.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSourceWrapper.java
new file mode 100644
index 0000000..bb9b58a
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/UnboundedSourceWrapper.java
@@ -0,0 +1,476 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming.io;
+
+import com.google.common.annotations.VisibleForTesting;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
+import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.KvCoder;
+import org.apache.beam.sdk.coders.SerializableCoder;
+import org.apache.beam.sdk.io.UnboundedSource;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
+import org.apache.beam.sdk.transforms.windowing.PaneInfo;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.TypeDescriptor;
+import org.apache.flink.api.common.ExecutionConfig;
+import org.apache.flink.api.common.functions.StoppableFunction;
+import org.apache.flink.api.common.state.ListState;
+import org.apache.flink.api.common.state.ListStateDescriptor;
+import org.apache.flink.api.common.state.OperatorStateStore;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.runtime.state.CheckpointListener;
+import org.apache.flink.runtime.state.DefaultOperatorStateBackend;
+import org.apache.flink.runtime.state.FunctionInitializationContext;
+import org.apache.flink.runtime.state.FunctionSnapshotContext;
+import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
+import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
+import org.apache.flink.streaming.api.operators.StreamingRuntimeContext;
+import org.apache.flink.streaming.api.watermark.Watermark;
+import org.apache.flink.streaming.runtime.tasks.ProcessingTimeCallback;
+import org.joda.time.Instant;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Wrapper for executing {@link UnboundedSource UnboundedSources} as a Flink Source.
+ */
+public class UnboundedSourceWrapper<
+ OutputT, CheckpointMarkT extends UnboundedSource.CheckpointMark>
+ extends RichParallelSourceFunction<WindowedValue<OutputT>>
+ implements ProcessingTimeCallback, StoppableFunction,
+ CheckpointListener, CheckpointedFunction {
+
+ private static final Logger LOG = LoggerFactory.getLogger(UnboundedSourceWrapper.class);
+
+ /**
+ * Keep the options so that we can initialize the localReaders.
+ */
+ private final SerializedPipelineOptions serializedOptions;
+
+ /**
+ * For snapshot and restore.
+ */
+ private final KvCoder<
+ ? extends UnboundedSource<OutputT, CheckpointMarkT>, CheckpointMarkT> checkpointCoder;
+
+ /**
+ * The split sources. We split them in the constructor to ensure that all parallel
+ * instances of the source agree on the same splits.
+ */
+ private final List<? extends UnboundedSource<OutputT, CheckpointMarkT>> splitSources;
+
+ /**
+ * The local split sources. Assigned at runtime when the wrapper is executed in parallel.
+ */
+ private transient List<UnboundedSource<OutputT, CheckpointMarkT>> localSplitSources;
+
+ /**
+ * The local split readers. Assigned at runtime when the wrapper is executed in parallel.
+ * Make it a field so that we can access it in {@link #onProcessingTime(long)} for
+ * emitting watermarks.
+ */
+ private transient List<UnboundedSource.UnboundedReader<OutputT>> localReaders;
+
+ /**
+ * Flag to indicate whether the source is running.
+ * Initialize here and not in run() to prevent races where we cancel a job before run() is
+ * ever called or run() is called after cancel().
+ */
+ private volatile boolean isRunning = true;
+
+ /**
+ * Make it a field so that we can access it in {@link #onProcessingTime(long)} for registering new
+ * triggers.
+ */
+ private transient StreamingRuntimeContext runtimeContext;
+
+ /**
+ * Make it a field so that we can access it in {@link #onProcessingTime(long)} for emitting
+ * watermarks.
+ */
+ private transient SourceContext<WindowedValue<OutputT>> context;
+
+ /**
+ * Pending checkpoints which have not been acknowledged yet.
+ */
+ private transient LinkedHashMap<Long, List<CheckpointMarkT>> pendingCheckpoints;
+ /**
+ * Keep a maximum of 32 checkpoints for {@code CheckpointMark.finalizeCheckpoint()}.
+ */
+ private static final int MAX_NUMBER_PENDING_CHECKPOINTS = 32;
+
+ private transient ListState<KV<? extends
+ UnboundedSource<OutputT, CheckpointMarkT>, CheckpointMarkT>> stateForCheckpoint;
+
+ /**
+ * False when checkpointCoder is null or when no state was restored on the first start.
+ */
+ private transient boolean isRestored = false;
+
+ @SuppressWarnings("unchecked")
+ public UnboundedSourceWrapper(
+ PipelineOptions pipelineOptions,
+ UnboundedSource<OutputT, CheckpointMarkT> source,
+ int parallelism) throws Exception {
+ this.serializedOptions = new SerializedPipelineOptions(pipelineOptions);
+
+ if (source.requiresDeduping()) {
+ LOG.warn("Source {} requires deduping but Flink runner doesn't support this yet.", source);
+ }
+
+ Coder<CheckpointMarkT> checkpointMarkCoder = source.getCheckpointMarkCoder();
+ if (checkpointMarkCoder == null) {
+ LOG.info("No CheckpointMarkCoder specified for this source. Won't create snapshots.");
+ checkpointCoder = null;
+ } else {
+
+ Coder<? extends UnboundedSource<OutputT, CheckpointMarkT>> sourceCoder =
+ (Coder) SerializableCoder.of(new TypeDescriptor<UnboundedSource>() {
+ });
+
+ checkpointCoder = KvCoder.of(sourceCoder, checkpointMarkCoder);
+ }
+
+ // get the splits early. we assume that the generated splits are stable,
+ // this is necessary so that the mapping of state to source is correct
+ // when restoring
+ splitSources = source.split(parallelism, pipelineOptions);
+ }
+
+
+ /**
+ * Initialize and restore state before starting execution of the source.
+ */
+ @Override
+ public void open(Configuration parameters) throws Exception {
+ runtimeContext = (StreamingRuntimeContext) getRuntimeContext();
+
+ // figure out which split sources we're responsible for
+ int subtaskIndex = runtimeContext.getIndexOfThisSubtask();
+ int numSubtasks = runtimeContext.getNumberOfParallelSubtasks();
+
+ localSplitSources = new ArrayList<>();
+ localReaders = new ArrayList<>();
+
+ pendingCheckpoints = new LinkedHashMap<>();
+
+ if (isRestored) {
+ // restore the splitSources from the checkpoint to ensure consistent ordering
+ for (KV<? extends UnboundedSource<OutputT, CheckpointMarkT>, CheckpointMarkT> restored:
+ stateForCheckpoint.get()) {
+ localSplitSources.add(restored.getKey());
+ localReaders.add(restored.getKey().createReader(
+ serializedOptions.getPipelineOptions(), restored.getValue()));
+ }
+ } else {
+ // initialize localReaders and localSources from scratch
+ for (int i = 0; i < splitSources.size(); i++) {
+ if (i % numSubtasks == subtaskIndex) {
+ UnboundedSource<OutputT, CheckpointMarkT> source =
+ splitSources.get(i);
+ UnboundedSource.UnboundedReader<OutputT> reader =
+ source.createReader(serializedOptions.getPipelineOptions(), null);
+ localSplitSources.add(source);
+ localReaders.add(reader);
+ }
+ }
+ }
+
+ LOG.info("Unbounded Flink Source {}/{} is reading from sources: {}",
+ subtaskIndex,
+ numSubtasks,
+ localSplitSources);
+ }
+
+ @Override
+ public void run(SourceContext<WindowedValue<OutputT>> ctx) throws Exception {
+
+ context = ctx;
+
+ if (localReaders.size() == 0) {
+ // do nothing, but still look busy ...
+ // also, output a Long.MAX_VALUE watermark since we know that we're not
+ // going to emit anything
+ // we can't return here since Flink requires that all operators stay up,
+ // otherwise checkpointing would not work correctly anymore
+ ctx.emitWatermark(new Watermark(Long.MAX_VALUE));
+
+ // wait until this is canceled
+ final Object waitLock = new Object();
+ while (isRunning) {
+ try {
+ // Flink will interrupt us at some point
+ //noinspection SynchronizationOnLocalVariableOrMethodParameter
+ synchronized (waitLock) {
+ // don't wait indefinitely, in case something goes horribly wrong
+ waitLock.wait(1000);
+ }
+ } catch (InterruptedException e) {
+ if (!isRunning) {
+ // restore the interrupted state, and fall through the loop
+ Thread.currentThread().interrupt();
+ }
+ }
+ }
+ } else if (localReaders.size() == 1) {
+ // the easy case, we just read from one reader
+ UnboundedSource.UnboundedReader<OutputT> reader = localReaders.get(0);
+
+ boolean dataAvailable = reader.start();
+ if (dataAvailable) {
+ emitElement(ctx, reader);
+ }
+
+ setNextWatermarkTimer(this.runtimeContext);
+
+ while (isRunning) {
+ dataAvailable = reader.advance();
+
+ if (dataAvailable) {
+ emitElement(ctx, reader);
+ } else {
+ Thread.sleep(50);
+ }
+ }
+ } else {
+ // a bit more complicated, we are responsible for several localReaders
+ // loop through them and sleep if none of them had any data
+
+ int numReaders = localReaders.size();
+ int currentReader = 0;
+
+ // start each reader and emit data if immediately available
+ for (UnboundedSource.UnboundedReader<OutputT> reader : localReaders) {
+ boolean dataAvailable = reader.start();
+ if (dataAvailable) {
+ emitElement(ctx, reader);
+ }
+ }
+
+ // a flag telling us whether any of the localReaders had data
+ // if no reader had data, sleep for bit
+ boolean hadData = false;
+ while (isRunning) {
+ UnboundedSource.UnboundedReader<OutputT> reader = localReaders.get(currentReader);
+ boolean dataAvailable = reader.advance();
+
+ if (dataAvailable) {
+ emitElement(ctx, reader);
+ hadData = true;
+ }
+
+ currentReader = (currentReader + 1) % numReaders;
+ if (currentReader == 0 && !hadData) {
+ Thread.sleep(50);
+ } else if (currentReader == 0) {
+ hadData = false;
+ }
+ }
+
+ }
+ }
+
+ /**
+ * Emit the current element from the given Reader. The reader is guaranteed to have data.
+ */
+ private void emitElement(
+ SourceContext<WindowedValue<OutputT>> ctx,
+ UnboundedSource.UnboundedReader<OutputT> reader) {
+ // make sure that reader state update and element emission are atomic
+ // with respect to snapshots
+ synchronized (ctx.getCheckpointLock()) {
+
+ OutputT item = reader.getCurrent();
+ Instant timestamp = reader.getCurrentTimestamp();
+
+ WindowedValue<OutputT> windowedValue =
+ WindowedValue.of(item, timestamp, GlobalWindow.INSTANCE, PaneInfo.NO_FIRING);
+ ctx.collectWithTimestamp(windowedValue, timestamp.getMillis());
+ }
+ }
+
+ @Override
+ public void close() throws Exception {
+ super.close();
+ if (localReaders != null) {
+ for (UnboundedSource.UnboundedReader<OutputT> reader: localReaders) {
+ reader.close();
+ }
+ }
+ }
+
+ @Override
+ public void cancel() {
+ isRunning = false;
+ }
+
+ @Override
+ public void stop() {
+ isRunning = false;
+ }
+
+ // ------------------------------------------------------------------------
+ // Checkpoint and restore
+ // ------------------------------------------------------------------------
+
+ @Override
+ public void snapshotState(FunctionSnapshotContext functionSnapshotContext) throws Exception {
+ if (!isRunning) {
+ LOG.debug("snapshotState() called on closed source");
+ } else {
+
+ if (checkpointCoder == null) {
+ // no checkpoint coder available in this source
+ return;
+ }
+
+ stateForCheckpoint.clear();
+
+ long checkpointId = functionSnapshotContext.getCheckpointId();
+
+ // we checkpoint the sources along with the CheckpointMarkT to ensure
+ // than we have a correct mapping of checkpoints to sources when
+ // restoring
+ List<CheckpointMarkT> checkpointMarks = new ArrayList<>(localSplitSources.size());
+
+ for (int i = 0; i < localSplitSources.size(); i++) {
+ UnboundedSource<OutputT, CheckpointMarkT> source = localSplitSources.get(i);
+ UnboundedSource.UnboundedReader<OutputT> reader = localReaders.get(i);
+
+ @SuppressWarnings("unchecked")
+ CheckpointMarkT mark = (CheckpointMarkT) reader.getCheckpointMark();
+ checkpointMarks.add(mark);
+ KV<UnboundedSource<OutputT, CheckpointMarkT>, CheckpointMarkT> kv =
+ KV.of(source, mark);
+ stateForCheckpoint.add(kv);
+ }
+
+ // cleanup old pending checkpoints and add new checkpoint
+ int diff = pendingCheckpoints.size() - MAX_NUMBER_PENDING_CHECKPOINTS;
+ if (diff >= 0) {
+ for (Iterator<Long> iterator = pendingCheckpoints.keySet().iterator();
+ diff >= 0;
+ diff--) {
+ iterator.next();
+ iterator.remove();
+ }
+ }
+ pendingCheckpoints.put(checkpointId, checkpointMarks);
+
+ }
+ }
+
+ @Override
+ public void initializeState(FunctionInitializationContext context) throws Exception {
+ if (checkpointCoder == null) {
+ // no checkpoint coder available in this source
+ return;
+ }
+
+ OperatorStateStore stateStore = context.getOperatorStateStore();
+ CoderTypeInformation<
+ KV<? extends UnboundedSource<OutputT, CheckpointMarkT>, CheckpointMarkT>>
+ typeInformation = (CoderTypeInformation) new CoderTypeInformation<>(checkpointCoder);
+ stateForCheckpoint = stateStore.getOperatorState(
+ new ListStateDescriptor<>(DefaultOperatorStateBackend.DEFAULT_OPERATOR_STATE_NAME,
+ typeInformation.createSerializer(new ExecutionConfig())));
+
+ if (context.isRestored()) {
+ isRestored = true;
+ LOG.info("Having restore state in the UnbounedSourceWrapper.");
+ } else {
+ LOG.info("No restore state for UnbounedSourceWrapper.");
+ }
+ }
+
+ @Override
+ public void onProcessingTime(long timestamp) throws Exception {
+ if (this.isRunning) {
+ synchronized (context.getCheckpointLock()) {
+ // find minimum watermark over all localReaders
+ long watermarkMillis = Long.MAX_VALUE;
+ for (UnboundedSource.UnboundedReader<OutputT> reader: localReaders) {
+ Instant watermark = reader.getWatermark();
+ if (watermark != null) {
+ watermarkMillis = Math.min(watermark.getMillis(), watermarkMillis);
+ }
+ }
+ context.emitWatermark(new Watermark(watermarkMillis));
+ }
+ setNextWatermarkTimer(this.runtimeContext);
+ }
+ }
+
+ private void setNextWatermarkTimer(StreamingRuntimeContext runtime) {
+ if (this.isRunning) {
+ long watermarkInterval = runtime.getExecutionConfig().getAutoWatermarkInterval();
+ long timeToNextWatermark = getTimeToNextWatermark(watermarkInterval);
+ runtime.getProcessingTimeService().registerTimer(timeToNextWatermark, this);
+ }
+ }
+
+ private long getTimeToNextWatermark(long watermarkInterval) {
+ return System.currentTimeMillis() + watermarkInterval;
+ }
+
+ /**
+ * Visible so that we can check this in tests. Must not be used for anything else.
+ */
+ @VisibleForTesting
+ public List<? extends UnboundedSource<OutputT, CheckpointMarkT>> getSplitSources() {
+ return splitSources;
+ }
+
+ /**
+ * Visible so that we can check this in tests. Must not be used for anything else.
+ */
+ @VisibleForTesting
+ public List<? extends UnboundedSource<OutputT, CheckpointMarkT>> getLocalSplitSources() {
+ return localSplitSources;
+ }
+
+ @Override
+ public void notifyCheckpointComplete(long checkpointId) throws Exception {
+
+ List<CheckpointMarkT> checkpointMarks = pendingCheckpoints.get(checkpointId);
+
+ if (checkpointMarks != null) {
+
+ // remove old checkpoints including the current one
+ Iterator<Long> iterator = pendingCheckpoints.keySet().iterator();
+ long currentId;
+ do {
+ currentId = iterator.next();
+ iterator.remove();
+ } while (currentId != checkpointId);
+
+ // confirm all marks
+ for (CheckpointMarkT mark : checkpointMarks) {
+ mark.finalizeCheckpoint();
+ }
+
+ }
+ }
+}
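
A brief construction sketch (the parallelism of 2 and the use of CountingSource are illustrative; running the wrapper additionally requires a Flink runtime context, which is omitted here). It shows the eager splitting done in the constructor, which keeps the split-to-checkpoint-state mapping stable across restores:

    import org.apache.beam.runners.flink.translation.wrappers.streaming.io.UnboundedSourceWrapper;
    import org.apache.beam.sdk.io.CountingSource;
    import org.apache.beam.sdk.io.CountingSource.CounterMark;
    import org.apache.beam.sdk.options.PipelineOptions;
    import org.apache.beam.sdk.options.PipelineOptionsFactory;

    public class UnboundedWrapperSketch {
      public static void main(String[] args) throws Exception {
        PipelineOptions options = PipelineOptionsFactory.create();

        // Splits the source into 2 shards up front; each subtask later picks its shards
        // by index, and each checkpoint pairs a shard with its CheckpointMark.
        UnboundedSourceWrapper<Long, CounterMark> wrapper =
            new UnboundedSourceWrapper<>(options, CountingSource.unbounded(), 2);

        System.out.println("split sources: " + wrapper.getSplitSources().size());
      }
    }
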
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/package-info.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/package-info.java
new file mode 100644
index 0000000..b431ce7
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Internal implementation of the Beam runner for Apache Flink.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming.io;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/package-info.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/package-info.java
new file mode 100644
index 0000000..0674871
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Internal implementation of the Beam runner for Apache Flink.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkBroadcastStateInternals.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkBroadcastStateInternals.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkBroadcastStateInternals.java
new file mode 100644
index 0000000..3203446
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkBroadcastStateInternals.java
@@ -0,0 +1,865 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming.state;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import org.apache.beam.runners.core.StateInternals;
+import org.apache.beam.runners.core.StateNamespace;
+import org.apache.beam.runners.core.StateTag;
+import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.ListCoder;
+import org.apache.beam.sdk.coders.MapCoder;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.transforms.Combine;
+import org.apache.beam.sdk.transforms.CombineWithContext;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
+import org.apache.beam.sdk.util.CombineContextFactory;
+import org.apache.beam.sdk.util.state.BagState;
+import org.apache.beam.sdk.util.state.CombiningState;
+import org.apache.beam.sdk.util.state.MapState;
+import org.apache.beam.sdk.util.state.ReadableState;
+import org.apache.beam.sdk.util.state.SetState;
+import org.apache.beam.sdk.util.state.State;
+import org.apache.beam.sdk.util.state.StateContext;
+import org.apache.beam.sdk.util.state.StateContexts;
+import org.apache.beam.sdk.util.state.ValueState;
+import org.apache.beam.sdk.util.state.WatermarkHoldState;
+import org.apache.flink.api.common.ExecutionConfig;
+import org.apache.flink.api.common.state.ListState;
+import org.apache.flink.api.common.state.ListStateDescriptor;
+import org.apache.flink.runtime.state.DefaultOperatorStateBackend;
+import org.apache.flink.runtime.state.OperatorStateBackend;
+
+/**
+ * {@link StateInternals} that uses a Flink {@link DefaultOperatorStateBackend}
+ * to manage the broadcast state.
+ * The state is the same on all parallel instances of the operator,
+ * so only the state of operator 0 needs to be stored in the OperatorStateBackend.
+ *
+ * <p>Note: The index of the key is ignored; this is mainly used for side inputs.
+ */
+public class FlinkBroadcastStateInternals<K> implements StateInternals<K> {
+
+ private int indexInSubtaskGroup;
+ private final DefaultOperatorStateBackend stateBackend;
+ // stateName -> <namespace, state>
+ private Map<String, Map<String, ?>> stateForNonZeroOperator;
+
+ public FlinkBroadcastStateInternals(int indexInSubtaskGroup, OperatorStateBackend stateBackend) {
+ // TODO: Flink does not yet expose this through a public API
+ this.stateBackend = (DefaultOperatorStateBackend) stateBackend;
+ this.indexInSubtaskGroup = indexInSubtaskGroup;
+ if (indexInSubtaskGroup != 0) {
+ stateForNonZeroOperator = new HashMap<>();
+ }
+ }
+
+ @Override
+ public K getKey() {
+ return null;
+ }
+
+ @Override
+ public <T extends State> T state(
+ final StateNamespace namespace,
+ StateTag<? super K, T> address) {
+
+ return state(namespace, address, StateContexts.nullContext());
+ }
+
+ @Override
+ public <T extends State> T state(
+ final StateNamespace namespace,
+ StateTag<? super K, T> address,
+ final StateContext<?> context) {
+
+ return address.bind(new StateTag.StateBinder<K>() {
+
+ @Override
+ public <T> ValueState<T> bindValue(
+ StateTag<? super K, ValueState<T>> address,
+ Coder<T> coder) {
+
+ return new FlinkBroadcastValueState<>(stateBackend, address, namespace, coder);
+ }
+
+ @Override
+ public <T> BagState<T> bindBag(
+ StateTag<? super K, BagState<T>> address,
+ Coder<T> elemCoder) {
+
+ return new FlinkBroadcastBagState<>(stateBackend, address, namespace, elemCoder);
+ }
+
+ @Override
+ public <T> SetState<T> bindSet(
+ StateTag<? super K, SetState<T>> address,
+ Coder<T> elemCoder) {
+ throw new UnsupportedOperationException(
+ String.format("%s is not supported", SetState.class.getSimpleName()));
+ }
+
+ @Override
+ public <KeyT, ValueT> MapState<KeyT, ValueT> bindMap(
+ StateTag<? super K, MapState<KeyT, ValueT>> spec,
+ Coder<KeyT> mapKeyCoder, Coder<ValueT> mapValueCoder) {
+ throw new UnsupportedOperationException(
+ String.format("%s is not supported", MapState.class.getSimpleName()));
+ }
+
+ @Override
+ public <InputT, AccumT, OutputT>
+ CombiningState<InputT, AccumT, OutputT>
+ bindCombiningValue(
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ Coder<AccumT> accumCoder,
+ Combine.CombineFn<InputT, AccumT, OutputT> combineFn) {
+
+ return new FlinkCombiningState<>(
+ stateBackend, address, combineFn, namespace, accumCoder);
+ }
+
+ @Override
+ public <InputT, AccumT, OutputT>
+ CombiningState<InputT, AccumT, OutputT> bindKeyedCombiningValue(
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ Coder<AccumT> accumCoder,
+ final Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn) {
+ return new FlinkKeyedCombiningState<>(
+ stateBackend,
+ address,
+ combineFn,
+ namespace,
+ accumCoder,
+ FlinkBroadcastStateInternals.this);
+ }
+
+ @Override
+ public <InputT, AccumT, OutputT>
+ CombiningState<InputT, AccumT, OutputT> bindKeyedCombiningValueWithContext(
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ Coder<AccumT> accumCoder,
+ CombineWithContext.KeyedCombineFnWithContext<
+ ? super K, InputT, AccumT, OutputT> combineFn) {
+ return new FlinkCombiningStateWithContext<>(
+ stateBackend,
+ address,
+ combineFn,
+ namespace,
+ accumCoder,
+ FlinkBroadcastStateInternals.this,
+ CombineContextFactory.createFromStateContext(context));
+ }
+
+ @Override
+ public <W extends BoundedWindow> WatermarkHoldState<W> bindWatermark(
+ StateTag<? super K, WatermarkHoldState<W>> address,
+ OutputTimeFn<? super W> outputTimeFn) {
+ throw new UnsupportedOperationException(
+ String.format("%s is not supported", WatermarkHoldState.class.getSimpleName()));
+ }
+ });
+ }
+
+ /**
+ * 1. We only checkpoint state from the operator with subtask index 0, because we
+ * assume that the state is the same on all parallel instances of the operator.
+ *
+ * <p>2. A map is used so that multiple namespaces can be stored per state name.
+ */
+ private abstract class AbstractBroadcastState<T> {
+
+ private String name;
+ private final StateNamespace namespace;
+ private final ListStateDescriptor<Map<String, T>> flinkStateDescriptor;
+ private final DefaultOperatorStateBackend flinkStateBackend;
+
+ AbstractBroadcastState(
+ DefaultOperatorStateBackend flinkStateBackend,
+ String name,
+ StateNamespace namespace,
+ Coder<T> coder) {
+ this.name = name;
+
+ this.namespace = namespace;
+ this.flinkStateBackend = flinkStateBackend;
+
+ CoderTypeInformation<Map<String, T>> typeInfo =
+ new CoderTypeInformation<>(MapCoder.of(StringUtf8Coder.of(), coder));
+
+ flinkStateDescriptor = new ListStateDescriptor<>(name,
+ typeInfo.createSerializer(new ExecutionConfig()));
+ }
+
+ /**
+ * Get the map (namespace -> T), restoring it from the broadcast state of subtask 0 if needed.
+ */
+ Map<String, T> getMap() throws Exception {
+ if (indexInSubtaskGroup == 0) {
+ return getMapFromBroadcastState();
+ } else {
+ Map<String, T> result = (Map<String, T>) stateForNonZeroOperator.get(name);
+ // may need to be restored from the broadcast state of operator 0
+ if (result == null) {
+ result = getMapFromBroadcastState();
+ if (result != null) {
+ stateForNonZeroOperator.put(name, result);
+ // the broadcast copy is no longer needed, so clear it
+ flinkStateBackend.getBroadcastOperatorState(
+ flinkStateDescriptor).clear();
+ }
+ }
+ return result;
+ }
+ }
+
+ Map<String, T> getMapFromBroadcastState() throws Exception {
+ ListState<Map<String, T>> state = flinkStateBackend.getBroadcastOperatorState(
+ flinkStateDescriptor);
+ Iterable<Map<String, T>> iterable = state.get();
+ Map<String, T> ret = null;
+ if (iterable != null) {
+ // just use index 0
+ Iterator<Map<String, T>> iterator = iterable.iterator();
+ if (iterator.hasNext()) {
+ ret = iterator.next();
+ }
+ }
+ return ret;
+ }
+
+ /**
+ * Update the map (namespace -> T); only subtask 0 writes it back to the broadcast state.
+ */
+ void updateMap(Map<String, T> map) throws Exception {
+ if (indexInSubtaskGroup == 0) {
+ ListState<Map<String, T>> state = flinkStateBackend.getBroadcastOperatorState(
+ flinkStateDescriptor);
+ state.clear();
+ if (map.size() > 0) {
+ state.add(map);
+ }
+ } else {
+ if (map.size() == 0) {
+ stateForNonZeroOperator.remove(name);
+ // updateMap is always called after getMap, and getMap already clears the
+ // map held in BroadcastOperatorState, so no extra clearing is needed here.
+ } else {
+ stateForNonZeroOperator.put(name, map);
+ }
+ }
+ }
+
+ void writeInternal(T input) {
+ try {
+ Map<String, T> map = getMap();
+ if (map == null) {
+ map = new HashMap<>();
+ }
+ map.put(namespace.stringKey(), input);
+ updateMap(map);
+ } catch (Exception e) {
+ throw new RuntimeException("Error updating state.", e);
+ }
+ }
+
+ T readInternal() {
+ try {
+ Map<String, T> map = getMap();
+ if (map == null) {
+ return null;
+ } else {
+ return map.get(namespace.stringKey());
+ }
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+ }
+
+ void clearInternal() {
+ try {
+ Map<String, T> map = getMap();
+ if (map != null) {
+ map.remove(namespace.stringKey());
+ updateMap(map);
+ }
+ } catch (Exception e) {
+ throw new RuntimeException("Error clearing state.", e);
+ }
+ }
+
+ }
+
+ private class FlinkBroadcastValueState<K, T>
+ extends AbstractBroadcastState<T> implements ValueState<T> {
+
+ private final StateNamespace namespace;
+ private final StateTag<? super K, ValueState<T>> address;
+
+ FlinkBroadcastValueState(
+ DefaultOperatorStateBackend flinkStateBackend,
+ StateTag<? super K, ValueState<T>> address,
+ StateNamespace namespace,
+ Coder<T> coder) {
+ super(flinkStateBackend, address.getId(), namespace, coder);
+
+ this.namespace = namespace;
+ this.address = address;
+
+ }
+
+ @Override
+ public void write(T input) {
+ writeInternal(input);
+ }
+
+ @Override
+ public ValueState<T> readLater() {
+ return this;
+ }
+
+ @Override
+ public T read() {
+ return readInternal();
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ FlinkBroadcastValueState<?, ?> that = (FlinkBroadcastValueState<?, ?>) o;
+
+ return namespace.equals(that.namespace) && address.equals(that.address);
+
+ }
+
+ @Override
+ public int hashCode() {
+ int result = namespace.hashCode();
+ result = 31 * result + address.hashCode();
+ return result;
+ }
+
+ @Override
+ public void clear() {
+ clearInternal();
+ }
+ }
+
+ private class FlinkBroadcastBagState<K, T> extends AbstractBroadcastState<List<T>>
+ implements BagState<T> {
+
+ private final StateNamespace namespace;
+ private final StateTag<? super K, BagState<T>> address;
+
+ FlinkBroadcastBagState(
+ DefaultOperatorStateBackend flinkStateBackend,
+ StateTag<? super K, BagState<T>> address,
+ StateNamespace namespace,
+ Coder<T> coder) {
+ super(flinkStateBackend, address.getId(), namespace, ListCoder.of(coder));
+
+ this.namespace = namespace;
+ this.address = address;
+ }
+
+ @Override
+ public void add(T input) {
+ List<T> list = readInternal();
+ if (list == null) {
+ list = new ArrayList<>();
+ }
+ list.add(input);
+ writeInternal(list);
+ }
+
+ @Override
+ public BagState<T> readLater() {
+ return this;
+ }
+
+ @Override
+ public Iterable<T> read() {
+ List<T> result = readInternal();
+ return result != null ? result : Collections.<T>emptyList();
+ }
+
+ @Override
+ public ReadableState<Boolean> isEmpty() {
+ return new ReadableState<Boolean>() {
+ @Override
+ public Boolean read() {
+ try {
+ List<T> result = readInternal();
+ return result == null;
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+
+ }
+
+ @Override
+ public ReadableState<Boolean> readLater() {
+ return this;
+ }
+ };
+ }
+
+ @Override
+ public void clear() {
+ clearInternal();
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ FlinkBroadcastBagState<?, ?> that = (FlinkBroadcastBagState<?, ?>) o;
+
+ return namespace.equals(that.namespace) && address.equals(that.address);
+
+ }
+
+ @Override
+ public int hashCode() {
+ int result = namespace.hashCode();
+ result = 31 * result + address.hashCode();
+ return result;
+ }
+ }
+
+ private class FlinkCombiningState<K, InputT, AccumT, OutputT>
+ extends AbstractBroadcastState<AccumT>
+ implements CombiningState<InputT, AccumT, OutputT> {
+
+ private final StateNamespace namespace;
+ private final StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address;
+ private final Combine.CombineFn<InputT, AccumT, OutputT> combineFn;
+
+ FlinkCombiningState(
+ DefaultOperatorStateBackend flinkStateBackend,
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ Combine.CombineFn<InputT, AccumT, OutputT> combineFn,
+ StateNamespace namespace,
+ Coder<AccumT> accumCoder) {
+ super(flinkStateBackend, address.getId(), namespace, accumCoder);
+
+ this.namespace = namespace;
+ this.address = address;
+ this.combineFn = combineFn;
+ }
+
+ @Override
+ public CombiningState<InputT, AccumT, OutputT> readLater() {
+ return this;
+ }
+
+ @Override
+ public void add(InputT value) {
+ AccumT current = readInternal();
+ if (current == null) {
+ current = combineFn.createAccumulator();
+ }
+ current = combineFn.addInput(current, value);
+ writeInternal(current);
+ }
+
+ @Override
+ public void addAccum(AccumT accum) {
+ AccumT current = readInternal();
+
+ if (current == null) {
+ writeInternal(accum);
+ } else {
+ current = combineFn.mergeAccumulators(Arrays.asList(current, accum));
+ writeInternal(current);
+ }
+ }
+
+ @Override
+ public AccumT getAccum() {
+ return readInternal();
+ }
+
+ @Override
+ public AccumT mergeAccumulators(Iterable<AccumT> accumulators) {
+ return combineFn.mergeAccumulators(accumulators);
+ }
+
+ @Override
+ public OutputT read() {
+ AccumT accum = readInternal();
+ if (accum != null) {
+ return combineFn.extractOutput(accum);
+ } else {
+ return combineFn.extractOutput(combineFn.createAccumulator());
+ }
+ }
+
+ @Override
+ public ReadableState<Boolean> isEmpty() {
+ return new ReadableState<Boolean>() {
+ @Override
+ public Boolean read() {
+ try {
+ return readInternal() == null;
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+
+ }
+
+ @Override
+ public ReadableState<Boolean> readLater() {
+ return this;
+ }
+ };
+ }
+
+ @Override
+ public void clear() {
+ clearInternal();
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ FlinkCombiningState<?, ?, ?, ?> that =
+ (FlinkCombiningState<?, ?, ?, ?>) o;
+
+ return namespace.equals(that.namespace) && address.equals(that.address);
+
+ }
+
+ @Override
+ public int hashCode() {
+ int result = namespace.hashCode();
+ result = 31 * result + address.hashCode();
+ return result;
+ }
+ }
+
+ private class FlinkKeyedCombiningState<K, InputT, AccumT, OutputT>
+ extends AbstractBroadcastState<AccumT>
+ implements CombiningState<InputT, AccumT, OutputT> {
+
+ private final StateNamespace namespace;
+ private final StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address;
+ private final Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn;
+ private final FlinkBroadcastStateInternals<K> flinkStateInternals;
+
+ FlinkKeyedCombiningState(
+ DefaultOperatorStateBackend flinkStateBackend,
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn,
+ StateNamespace namespace,
+ Coder<AccumT> accumCoder,
+ FlinkBroadcastStateInternals<K> flinkStateInternals) {
+ super(flinkStateBackend, address.getId(), namespace, accumCoder);
+
+ this.namespace = namespace;
+ this.address = address;
+ this.combineFn = combineFn;
+ this.flinkStateInternals = flinkStateInternals;
+
+ }
+
+ @Override
+ public CombiningState<InputT, AccumT, OutputT> readLater() {
+ return this;
+ }
+
+ @Override
+ public void add(InputT value) {
+ try {
+ AccumT current = readInternal();
+ if (current == null) {
+ current = combineFn.createAccumulator(flinkStateInternals.getKey());
+ }
+ current = combineFn.addInput(flinkStateInternals.getKey(), current, value);
+ writeInternal(current);
+ } catch (Exception e) {
+ throw new RuntimeException("Error adding to state." , e);
+ }
+ }
+
+ @Override
+ public void addAccum(AccumT accum) {
+ try {
+ AccumT current = readInternal();
+ if (current == null) {
+ writeInternal(accum);
+ } else {
+ current = combineFn.mergeAccumulators(
+ flinkStateInternals.getKey(),
+ Arrays.asList(current, accum));
+ writeInternal(current);
+ }
+ } catch (Exception e) {
+ throw new RuntimeException("Error adding to state.", e);
+ }
+ }
+
+ @Override
+ public AccumT getAccum() {
+ try {
+ return readInternal();
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+ }
+
+ @Override
+ public AccumT mergeAccumulators(Iterable<AccumT> accumulators) {
+ return combineFn.mergeAccumulators(flinkStateInternals.getKey(), accumulators);
+ }
+
+ @Override
+ public OutputT read() {
+ try {
+ AccumT accum = readInternal();
+ if (accum != null) {
+ return combineFn.extractOutput(flinkStateInternals.getKey(), accum);
+ } else {
+ return combineFn.extractOutput(
+ flinkStateInternals.getKey(),
+ combineFn.createAccumulator(flinkStateInternals.getKey()));
+ }
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+ }
+
+ @Override
+ public ReadableState<Boolean> isEmpty() {
+ return new ReadableState<Boolean>() {
+ @Override
+ public Boolean read() {
+ try {
+ return readInternal() == null;
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+
+ }
+
+ @Override
+ public ReadableState<Boolean> readLater() {
+ return this;
+ }
+ };
+ }
+
+ @Override
+ public void clear() {
+ clearInternal();
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ FlinkKeyedCombiningState<?, ?, ?, ?> that =
+ (FlinkKeyedCombiningState<?, ?, ?, ?>) o;
+
+ return namespace.equals(that.namespace) && address.equals(that.address);
+
+ }
+
+ @Override
+ public int hashCode() {
+ int result = namespace.hashCode();
+ result = 31 * result + address.hashCode();
+ return result;
+ }
+ }
+
+ private class FlinkCombiningStateWithContext<K, InputT, AccumT, OutputT>
+ extends AbstractBroadcastState<AccumT>
+ implements CombiningState<InputT, AccumT, OutputT> {
+
+ private final StateNamespace namespace;
+ private final StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address;
+ private final CombineWithContext.KeyedCombineFnWithContext<
+ ? super K, InputT, AccumT, OutputT> combineFn;
+ private final FlinkBroadcastStateInternals<K> flinkStateInternals;
+ private final CombineWithContext.Context context;
+
+ FlinkCombiningStateWithContext(
+ DefaultOperatorStateBackend flinkStateBackend,
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ CombineWithContext.KeyedCombineFnWithContext<
+ ? super K, InputT, AccumT, OutputT> combineFn,
+ StateNamespace namespace,
+ Coder<AccumT> accumCoder,
+ FlinkBroadcastStateInternals<K> flinkStateInternals,
+ CombineWithContext.Context context) {
+ super(flinkStateBackend, address.getId(), namespace, accumCoder);
+
+ this.namespace = namespace;
+ this.address = address;
+ this.combineFn = combineFn;
+ this.flinkStateInternals = flinkStateInternals;
+ this.context = context;
+
+ }
+
+ @Override
+ public CombiningState<InputT, AccumT, OutputT> readLater() {
+ return this;
+ }
+
+ @Override
+ public void add(InputT value) {
+ try {
+ AccumT current = readInternal();
+ if (current == null) {
+ current = combineFn.createAccumulator(flinkStateInternals.getKey(), context);
+ }
+ current = combineFn.addInput(flinkStateInternals.getKey(), current, value, context);
+ writeInternal(current);
+ } catch (Exception e) {
+ throw new RuntimeException("Error adding to state." , e);
+ }
+ }
+
+ @Override
+ public void addAccum(AccumT accum) {
+ try {
+
+ AccumT current = readInternal();
+ if (current == null) {
+ writeInternal(accum);
+ } else {
+ current = combineFn.mergeAccumulators(
+ flinkStateInternals.getKey(),
+ Arrays.asList(current, accum),
+ context);
+ writeInternal(current);
+ }
+ } catch (Exception e) {
+ throw new RuntimeException("Error adding to state.", e);
+ }
+ }
+
+ @Override
+ public AccumT getAccum() {
+ try {
+ return readInternal();
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+ }
+
+ @Override
+ public AccumT mergeAccumulators(Iterable<AccumT> accumulators) {
+ return combineFn.mergeAccumulators(flinkStateInternals.getKey(), accumulators, context);
+ }
+
+ @Override
+ public OutputT read() {
+ try {
+ AccumT accum = readInternal();
+ return combineFn.extractOutput(flinkStateInternals.getKey(), accum, context);
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+ }
+
+ @Override
+ public ReadableState<Boolean> isEmpty() {
+ return new ReadableState<Boolean>() {
+ @Override
+ public Boolean read() {
+ try {
+ return readInternal() == null;
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+
+ }
+
+ @Override
+ public ReadableState<Boolean> readLater() {
+ return this;
+ }
+ };
+ }
+
+ @Override
+ public void clear() {
+ clearInternal();
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ FlinkCombiningStateWithContext<?, ?, ?, ?> that =
+ (FlinkCombiningStateWithContext<?, ?, ?, ?>) o;
+
+ return namespace.equals(that.namespace) && address.equals(that.address);
+
+ }
+
+ @Override
+ public int hashCode() {
+ int result = namespace.hashCode();
+ result = 31 * result + address.hashCode();
+ return result;
+ }
+ }
+
+}
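As a rough usage sketch of the class above: a state tag is bound against a namespace, and reads and writes go through Flink's broadcast operator state, so every parallel subtask observes the same value. The helper below is hypothetical (the StateTags/StateNamespaces factories from runners-core and an OperatorStateBackend supplied by the enclosing operator are assumed; none of these names are part of this patch):

import org.apache.beam.runners.core.StateNamespaces;
import org.apache.beam.runners.core.StateTags;
import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.util.state.ValueState;
import org.apache.flink.runtime.state.OperatorStateBackend;

class BroadcastStateUsageSketch {
  // 'subtaskIndex' and 'backend' would be provided by the enclosing Flink operator.
  static void bumpCounter(int subtaskIndex, OperatorStateBackend backend) {
    FlinkBroadcastStateInternals<String> internals =
        new FlinkBroadcastStateInternals<>(subtaskIndex, backend);
    // Bind a ValueState in the global namespace; the value lives in broadcast state,
    // so all parallel instances see the same counter.
    ValueState<Integer> counter =
        internals.state(StateNamespaces.global(), StateTags.value("counter", VarIntCoder.of()));
    Integer current = counter.read();
    counter.write(current == null ? 1 : current + 1);
  }
}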
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkKeyGroupStateInternals.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkKeyGroupStateInternals.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkKeyGroupStateInternals.java
new file mode 100644
index 0000000..24b340e
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkKeyGroupStateInternals.java
@@ -0,0 +1,487 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming.state;
+
+import static org.apache.flink.util.Preconditions.checkArgument;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.beam.runners.core.StateInternals;
+import org.apache.beam.runners.core.StateNamespace;
+import org.apache.beam.runners.core.StateTag;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.Coder.Context;
+import org.apache.beam.sdk.coders.CoderException;
+import org.apache.beam.sdk.coders.ListCoder;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.transforms.Combine;
+import org.apache.beam.sdk.transforms.CombineWithContext;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
+import org.apache.beam.sdk.util.CoderUtils;
+import org.apache.beam.sdk.util.state.BagState;
+import org.apache.beam.sdk.util.state.CombiningState;
+import org.apache.beam.sdk.util.state.MapState;
+import org.apache.beam.sdk.util.state.ReadableState;
+import org.apache.beam.sdk.util.state.SetState;
+import org.apache.beam.sdk.util.state.State;
+import org.apache.beam.sdk.util.state.StateContext;
+import org.apache.beam.sdk.util.state.StateContexts;
+import org.apache.beam.sdk.util.state.ValueState;
+import org.apache.beam.sdk.util.state.WatermarkHoldState;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.runtime.state.KeyGroupsList;
+import org.apache.flink.runtime.state.KeyedStateBackend;
+import org.apache.flink.streaming.api.operators.HeapInternalTimerService;
+import org.apache.flink.util.InstantiationUtil;
+import org.apache.flink.util.Preconditions;
+
+/**
+ * {@link StateInternals} that uses {@link KeyGroupCheckpointedOperator}
+ * to checkpoint state.
+ *
+ * <p>Note: The index of the key is ignored, and only BagState is implemented.
+ *
+ * <p>The handling of the local key-group range is modeled on {@link HeapInternalTimerService}.
+ */
+public class FlinkKeyGroupStateInternals<K> implements StateInternals<K> {
+
+ private final Coder<K> keyCoder;
+ private final KeyGroupsList localKeyGroupRange;
+ private KeyedStateBackend keyedStateBackend;
+ private final int localKeyGroupRangeStartIdx;
+
+ // stateName -> (valueCoder, namespace -> value), one table per local key group
+ private final Map<String, Tuple2<Coder<?>, Map<String, ?>>>[] stateTables;
+
+ public FlinkKeyGroupStateInternals(
+ Coder<K> keyCoder,
+ KeyedStateBackend keyedStateBackend) {
+ this.keyCoder = keyCoder;
+ this.keyedStateBackend = keyedStateBackend;
+ this.localKeyGroupRange = keyedStateBackend.getKeyGroupRange();
+ // find the starting index of the local key-group range
+ int startIdx = Integer.MAX_VALUE;
+ for (Integer keyGroupIdx : localKeyGroupRange) {
+ startIdx = Math.min(keyGroupIdx, startIdx);
+ }
+ this.localKeyGroupRangeStartIdx = startIdx;
+ stateTables = (Map<String, Tuple2<Coder<?>, Map<String, ?>>>[])
+ new Map[localKeyGroupRange.getNumberOfKeyGroups()];
+ for (int i = 0; i < stateTables.length; i++) {
+ stateTables[i] = new HashMap<>();
+ }
+ }
+
+ @Override
+ public K getKey() {
+ ByteBuffer keyBytes = (ByteBuffer) keyedStateBackend.getCurrentKey();
+ try {
+ return CoderUtils.decodeFromByteArray(keyCoder, keyBytes.array());
+ } catch (CoderException e) {
+ throw new RuntimeException("Error decoding key.", e);
+ }
+ }
+
+ @Override
+ public <T extends State> T state(
+ final StateNamespace namespace,
+ StateTag<? super K, T> address) {
+
+ return state(namespace, address, StateContexts.nullContext());
+ }
+
+ @Override
+ public <T extends State> T state(
+ final StateNamespace namespace,
+ StateTag<? super K, T> address,
+ final StateContext<?> context) {
+
+ return address.bind(new StateTag.StateBinder<K>() {
+
+ @Override
+ public <T> ValueState<T> bindValue(
+ StateTag<? super K, ValueState<T>> address,
+ Coder<T> coder) {
+ throw new UnsupportedOperationException(
+ String.format("%s is not supported", ValueState.class.getSimpleName()));
+ }
+
+ @Override
+ public <T> BagState<T> bindBag(
+ StateTag<? super K, BagState<T>> address,
+ Coder<T> elemCoder) {
+
+ return new FlinkKeyGroupBagState<>(address, namespace, elemCoder);
+ }
+
+ @Override
+ public <T> SetState<T> bindSet(
+ StateTag<? super K, SetState<T>> address,
+ Coder<T> elemCoder) {
+ throw new UnsupportedOperationException(
+ String.format("%s is not supported", SetState.class.getSimpleName()));
+ }
+
+ @Override
+ public <KeyT, ValueT> MapState<KeyT, ValueT> bindMap(
+ StateTag<? super K, MapState<KeyT, ValueT>> spec,
+ Coder<KeyT> mapKeyCoder, Coder<ValueT> mapValueCoder) {
+ throw new UnsupportedOperationException(
+ String.format("%s is not supported", MapState.class.getSimpleName()));
+ }
+
+ @Override
+ public <InputT, AccumT, OutputT>
+ CombiningState<InputT, AccumT, OutputT>
+ bindCombiningValue(
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ Coder<AccumT> accumCoder,
+ Combine.CombineFn<InputT, AccumT, OutputT> combineFn) {
+ throw new UnsupportedOperationException("bindCombiningValue is not supported.");
+ }
+
+ @Override
+ public <InputT, AccumT, OutputT>
+ CombiningState<InputT, AccumT, OutputT> bindKeyedCombiningValue(
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ Coder<AccumT> accumCoder,
+ final Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn) {
+ throw new UnsupportedOperationException("bindKeyedCombiningValue is not supported.");
+
+ }
+
+ @Override
+ public <InputT, AccumT, OutputT>
+ CombiningState<InputT, AccumT, OutputT> bindKeyedCombiningValueWithContext(
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ Coder<AccumT> accumCoder,
+ CombineWithContext.KeyedCombineFnWithContext<
+ ? super K, InputT, AccumT, OutputT> combineFn) {
+ throw new UnsupportedOperationException(
+ "bindKeyedCombiningValueWithContext is not supported.");
+ }
+
+ @Override
+ public <W extends BoundedWindow> WatermarkHoldState<W> bindWatermark(
+ StateTag<? super K, WatermarkHoldState<W>> address,
+ OutputTimeFn<? super W> outputTimeFn) {
+ throw new UnsupportedOperationException(
+ String.format("%s is not supported", CombiningState.class.getSimpleName()));
+ }
+ });
+ }
+
+ /**
+ * Modeled on {@link Combine.CombineFn}.
+ *
+ * <p>One accumulator is kept per key group: addInput() is called as each element
+ * arrives, and extractOutput() combines the accumulators of all locally assigned
+ * key groups when the state is read.
+ */
+ interface KeyGroupCombiner<InputT, AccumT, OutputT> {
+
+ /**
+ * Returns a new, mutable accumulator value, representing the accumulation
+ * of zero input values.
+ */
+ AccumT createAccumulator();
+
+ /**
+ * Adds the given input value to the given accumulator, returning the
+ * new accumulator value.
+ */
+ AccumT addInput(AccumT accumulator, InputT input);
+
+ /**
+ * Returns the output value that is the result of all accumulators from KeyGroups
+ * that are assigned to this operator.
+ */
+ OutputT extractOutput(Iterable<AccumT> accumulators);
+ }
+
+ private abstract class AbstractKeyGroupState<InputT, AccumT, OutputT> {
+
+ private String stateName;
+ private String namespace;
+ private Coder<AccumT> coder;
+ private KeyGroupCombiner<InputT, AccumT, OutputT> keyGroupCombiner;
+
+ AbstractKeyGroupState(
+ String stateName,
+ String namespace,
+ Coder<AccumT> coder,
+ KeyGroupCombiner<InputT, AccumT, OutputT> keyGroupCombiner) {
+ this.stateName = stateName;
+ this.namespace = namespace;
+ this.coder = coder;
+ this.keyGroupCombiner = keyGroupCombiner;
+ }
+
+ /**
+ * Determine the key group of the current key and add the input to its accumulator.
+ */
+ void addInput(InputT input) {
+ int keyGroupIdx = keyedStateBackend.getCurrentKeyGroupIndex();
+ int localIdx = getIndexForKeyGroup(keyGroupIdx);
+ Map<String, Tuple2<Coder<?>, Map<String, ?>>> stateTable = stateTables[localIdx];
+ Tuple2<Coder<?>, Map<String, ?>> tuple2 = stateTable.get(stateName);
+ if (tuple2 == null) {
+ tuple2 = new Tuple2<>();
+ tuple2.f0 = coder;
+ tuple2.f1 = new HashMap<>();
+ stateTable.put(stateName, tuple2);
+ }
+ Map<String, AccumT> map = (Map<String, AccumT>) tuple2.f1;
+ AccumT accumulator = map.get(namespace);
+ if (accumulator == null) {
+ accumulator = keyGroupCombiner.createAccumulator();
+ }
+ accumulator = keyGroupCombiner.addInput(accumulator, input);
+ map.put(namespace, accumulator);
+ }
+
+ /**
+ * Get all accumulators and invoke extractOutput().
+ */
+ OutputT extractOutput() {
+ List<AccumT> accumulators = new ArrayList<>(stateTables.length);
+ for (Map<String, Tuple2<Coder<?>, Map<String, ?>>> stateTable : stateTables) {
+ Tuple2<Coder<?>, Map<String, ?>> tuple2 = stateTable.get(stateName);
+ if (tuple2 != null) {
+ AccumT accumulator = (AccumT) tuple2.f1.get(namespace);
+ if (accumulator != null) {
+ accumulators.add(accumulator);
+ }
+ }
+ }
+ return keyGroupCombiner.extractOutput(accumulators);
+ }
+
+ /**
+ * Short-circuits: returns false as soon as any accumulator is found.
+ */
+ boolean isEmptyInternal() {
+ for (Map<String, Tuple2<Coder<?>, Map<String, ?>>> stateTable : stateTables) {
+ Tuple2<Coder<?>, Map<String, ?>> tuple2 = stateTable.get(stateName);
+ if (tuple2 != null) {
+ AccumT accumulator = (AccumT) tuple2.f1.get(namespace);
+ if (accumulator != null) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Clear the accumulators for this namespace and remove now-empty state entries.
+ */
+ void clearInternal() {
+ for (Map<String, Tuple2<Coder<?>, Map<String, ?>>> stateTable : stateTables) {
+ Tuple2<Coder<?>, Map<String, ?>> tuple2 = stateTable.get(stateName);
+ if (tuple2 != null) {
+ tuple2.f1.remove(namespace);
+ if (tuple2.f1.size() == 0) {
+ stateTable.remove(stateName);
+ }
+ }
+ }
+ }
+
+ }
+
+ private int getIndexForKeyGroup(int keyGroupIdx) {
+ checkArgument(localKeyGroupRange.contains(keyGroupIdx),
+ "Key Group " + keyGroupIdx + " does not belong to the local range.");
+ return keyGroupIdx - this.localKeyGroupRangeStartIdx;
+ }
+
+ private class KeyGroupBagCombiner<T> implements KeyGroupCombiner<T, List<T>, Iterable<T>> {
+
+ @Override
+ public List<T> createAccumulator() {
+ return new ArrayList<>();
+ }
+
+ @Override
+ public List<T> addInput(List<T> accumulator, T input) {
+ accumulator.add(input);
+ return accumulator;
+ }
+
+ @Override
+ public Iterable<T> extractOutput(Iterable<List<T>> accumulators) {
+ List<T> result = new ArrayList<>();
+ // maybe can return an unmodifiable view.
+ for (List<T> list : accumulators) {
+ result.addAll(list);
+ }
+ return result;
+ }
+ }
+
+ private class FlinkKeyGroupBagState<T> extends AbstractKeyGroupState<T, List<T>, Iterable<T>>
+ implements BagState<T> {
+
+ private final StateNamespace namespace;
+ private final StateTag<? super K, BagState<T>> address;
+
+ FlinkKeyGroupBagState(
+ StateTag<? super K, BagState<T>> address,
+ StateNamespace namespace,
+ Coder<T> coder) {
+ super(address.getId(), namespace.stringKey(), ListCoder.of(coder),
+ new KeyGroupBagCombiner<T>());
+ this.namespace = namespace;
+ this.address = address;
+ }
+
+ @Override
+ public void add(T input) {
+ addInput(input);
+ }
+
+ @Override
+ public BagState<T> readLater() {
+ return this;
+ }
+
+ @Override
+ public Iterable<T> read() {
+ Iterable<T> result = extractOutput();
+ return result != null ? result : Collections.<T>emptyList();
+ }
+
+ @Override
+ public ReadableState<Boolean> isEmpty() {
+ return new ReadableState<Boolean>() {
+ @Override
+ public Boolean read() {
+ try {
+ return isEmptyInternal();
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+
+ }
+
+ @Override
+ public ReadableState<Boolean> readLater() {
+ return this;
+ }
+ };
+ }
+
+ @Override
+ public void clear() {
+ clearInternal();
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ FlinkKeyGroupBagState<?> that = (FlinkKeyGroupBagState<?>) o;
+
+ return namespace.equals(that.namespace) && address.equals(that.address);
+
+ }
+
+ @Override
+ public int hashCode() {
+ int result = namespace.hashCode();
+ result = 31 * result + address.hashCode();
+ return result;
+ }
+ }
+
+ /**
+ * Snapshots the state {@code (stateName -> (valueCoder && (namespace -> value)))} for a given
+ * {@code keyGroupIdx}.
+ *
+ * @param keyGroupIdx the id of the key-group to be put in the snapshot.
+ * @param out the stream to write to.
+ */
+ public void snapshotKeyGroupState(int keyGroupIdx, DataOutputStream out) throws Exception {
+ int localIdx = getIndexForKeyGroup(keyGroupIdx);
+ Map<String, Tuple2<Coder<?>, Map<String, ?>>> stateTable = stateTables[localIdx];
+ Preconditions.checkState(stateTable.size() <= Short.MAX_VALUE,
+ "Too many States: " + stateTable.size() + ". Currently at most "
+ + Short.MAX_VALUE + " states are supported");
+ out.writeShort(stateTable.size());
+ for (Map.Entry<String, Tuple2<Coder<?>, Map<String, ?>>> entry : stateTable.entrySet()) {
+ out.writeUTF(entry.getKey());
+ Coder coder = entry.getValue().f0;
+ InstantiationUtil.serializeObject(out, coder);
+ Map<String, ?> map = entry.getValue().f1;
+ out.writeInt(map.size());
+ for (Map.Entry<String, ?> entry1 : map.entrySet()) {
+ StringUtf8Coder.of().encode(entry1.getKey(), out, Context.NESTED);
+ coder.encode(entry1.getValue(), out, Context.NESTED);
+ }
+ }
+ }
+
+ /**
+ * Restore the state {@code (stateName -> (valueCoder && (namespace -> value)))}
+ * for a given {@code keyGroupIdx}.
+ *
+ * @param keyGroupIdx the id of the key-group whose state is to be restored.
+ * @param in the stream to read from.
+ * @param userCodeClassLoader the class loader that will be used to deserialize
+ * the valueCoder.
+ */
+ public void restoreKeyGroupState(int keyGroupIdx, DataInputStream in,
+ ClassLoader userCodeClassLoader) throws Exception {
+ int localIdx = getIndexForKeyGroup(keyGroupIdx);
+ Map<String, Tuple2<Coder<?>, Map<String, ?>>> stateTable = stateTables[localIdx];
+ int numStates = in.readShort();
+ for (int i = 0; i < numStates; ++i) {
+ String stateName = in.readUTF();
+ Coder coder = InstantiationUtil.deserializeObject(in, userCodeClassLoader);
+ Tuple2<Coder<?>, Map<String, ?>> tuple2 = stateTable.get(stateName);
+ if (tuple2 == null) {
+ tuple2 = new Tuple2<>();
+ tuple2.f0 = coder;
+ tuple2.f1 = new HashMap<>();
+ stateTable.put(stateName, tuple2);
+ }
+ Map<String, Object> map = (Map<String, Object>) tuple2.f1;
+ int mapSize = in.readInt();
+ for (int j = 0; j < mapSize; j++) {
+ String namespace = StringUtf8Coder.of().decode(in, Context.NESTED);
+ Object value = coder.decode(in, Context.NESTED);
+ map.put(namespace, value);
+ }
+ }
+ }
+
+}
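The KeyGroupCombiner contract above keeps one accumulator per locally assigned key group and merges them on read. A minimal sketch of an implementation (a summing combiner, written here for illustration and assumed to live in the same package, since the interface is package-private):

class SumKeyGroupCombiner
    implements FlinkKeyGroupStateInternals.KeyGroupCombiner<Long, Long, Long> {

  @Override
  public Long createAccumulator() {
    return 0L; // accumulation of zero inputs
  }

  @Override
  public Long addInput(Long accumulator, Long input) {
    return accumulator + input; // per-key-group partial sum
  }

  @Override
  public Long extractOutput(Iterable<Long> accumulators) {
    // Combine the partial sums of all key groups assigned to this operator.
    long total = 0L;
    for (Long partial : accumulators) {
      total += partial;
    }
    return total;
  }
}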
[23/50] [abbrv] beam git commit: This closes #2588
Posted by dh...@apache.org.
This closes #2588
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/8a00f225
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/8a00f225
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/8a00f225
Branch: refs/heads/DSL_SQL
Commit: 8a00f225454bcc742e5b38a499237bd5de455fb7
Parents: 470808c 28b692d
Author: Jean-Baptiste Onofré <jb...@apache.org>
Authored: Wed Apr 19 13:27:51 2017 +0200
Committer: Jean-Baptiste Onofré <jb...@apache.org>
Committed: Wed Apr 19 13:27:51 2017 +0200
----------------------------------------------------------------------
pom.xml | 7 +++++++
runners/flink/runner/pom.xml | 1 -
runners/spark/pom.xml | 1 -
sdks/java/core/pom.xml | 1 -
4 files changed, 7 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
[13/50] [abbrv] beam git commit: Fix tests to properly fake out
BigQueryService, and add tests for dynamic-table functionality.
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/b486137d/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeJobService.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeJobService.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeJobService.java
index 3c67c3d..a2454fb 100644
--- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeJobService.java
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeJobService.java
@@ -1,12 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.beam.sdk.io.gcp.bigquery;
import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkState;
import com.google.api.client.json.JsonFactory;
import com.google.api.client.util.BackOff;
import com.google.api.client.util.BackOffUtils;
import com.google.api.client.util.Sleeper;
+import com.google.api.services.bigquery.model.ErrorProto;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfiguration;
import com.google.api.services.bigquery.model.JobConfigurationExtract;
@@ -29,9 +47,18 @@ import java.io.ByteArrayInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.Serializable;
+import java.nio.channels.Channels;
+import java.nio.channels.WritableByteChannel;
import java.nio.charset.StandardCharsets;
import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ThreadLocalRandom;
+import org.apache.avro.Schema;
+import org.apache.avro.file.DataFileWriter;
+import org.apache.avro.generic.GenericDatumWriter;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.Coder.Context;
import org.apache.beam.sdk.coders.TableRowJsonCoder;
@@ -40,10 +67,13 @@ import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.JobService;
import org.apache.beam.sdk.util.FluentBackoff;
+import org.apache.beam.sdk.util.IOChannelUtils;
+import org.apache.beam.sdk.util.MimeTypes;
import org.apache.beam.sdk.util.Transport;
import org.joda.time.Duration;
/**
+ * A fake implementation of BigQuery's job service.
*/
class FakeJobService implements JobService, Serializable {
static final JsonFactory JSON_FACTORY = Transport.getJsonFactory();
@@ -66,6 +96,8 @@ class FakeJobService implements JobService, Serializable {
private static final com.google.common.collect.Table<String, String, JobInfo> allJobs =
HashBasedTable.create();
+ private static final com.google.common.collect.Table<String, String, List<String>>
+ filesForLoadJobs = HashBasedTable.create();
private static final com.google.common.collect.Table<String, String, JobStatistics>
dryRunQueryResults = HashBasedTable.create();
@@ -82,6 +114,18 @@ class FakeJobService implements JobService, Serializable {
job.setConfiguration(new JobConfiguration().setLoad(loadConfig));
job.setKind(" bigquery#job");
job.setStatus(new JobStatus().setState("PENDING"));
+
+ // Copy the files to a new location for import, as the temporary files will be deleted by
+ // the caller.
+ if (loadConfig.getSourceUris().size() > 0) {
+ List<String> loadFiles = Lists.newArrayList();
+ for (String filename : loadConfig.getSourceUris()) {
+ loadFiles.add(filename + ThreadLocalRandom.current().nextInt());
+ }
+ IOChannelUtils.getFactory(loadFiles.get(0)).copy(loadConfig.getSourceUris(), loadFiles);
+ filesForLoadJobs.put(jobRef.getProjectId(), jobRef.getJobId(), loadFiles);
+ }
+
allJobs.put(jobRef.getProjectId(), jobRef.getJobId(), new JobInfo(job));
}
}
@@ -91,8 +135,6 @@ class FakeJobService implements JobService, Serializable {
throws InterruptedException, IOException {
checkArgument(extractConfig.getDestinationFormat().equals("AVRO"),
"Only extract to AVRO is supported");
- checkArgument(extractConfig.getDestinationUris().size() == 1,
- "Must specify exactly one destination URI.");
synchronized (allJobs) {
Job job = new Job();
job.setJobReference(jobRef);
@@ -106,6 +148,14 @@ class FakeJobService implements JobService, Serializable {
@Override
public void startQueryJob(JobReference jobRef, JobConfigurationQuery query)
throws IOException, InterruptedException {
+ synchronized (allJobs) {
+ Job job = new Job();
+ job.setJobReference(jobRef);
+ job.setConfiguration(new JobConfiguration().setQuery(query));
+ job.setKind(" bigquery#job");
+ job.setStatus(new JobStatus().setState("PENDING"));
+ allJobs.put(jobRef.getProjectId(), jobRef.getJobId(), new JobInfo(job));
+ }
}
@Override
@@ -127,8 +177,8 @@ class FakeJobService implements JobService, Serializable {
BackOff backoff =
FluentBackoff.DEFAULT
.withMaxRetries(maxAttempts)
- .withInitialBackoff(Duration.millis(50))
- .withMaxBackoff(Duration.standardMinutes(1))
+ .withInitialBackoff(Duration.millis(10))
+ .withMaxBackoff(Duration.standardSeconds(1))
.backoff();
Sleeper sleeper = Sleeper.DEFAULT;
try {
@@ -136,7 +186,8 @@ class FakeJobService implements JobService, Serializable {
Job job = getJob(jobRef);
if (job != null) {
JobStatus status = job.getStatus();
- if (status != null && status.getState() != null && status.getState().equals("DONE")) {
+ if (status != null && status.getState() != null
+ && (status.getState().equals("DONE") || status.getState().equals("FAILED"))) {
return job;
}
}
@@ -173,12 +224,15 @@ class FakeJobService implements JobService, Serializable {
if (job == null) {
return null;
}
- ++job.getJobCount;
- if (job.getJobCount == GET_JOBS_TRANSITION_INTERVAL + 1) {
- job.job.getStatus().setState("RUNNING");
- } else if (job.getJobCount == 2 * GET_JOBS_TRANSITION_INTERVAL + 1) {
- runJob(job.job);
- job.job.getStatus().setState("DONE");
+ try {
+ ++job.getJobCount;
+ if (job.getJobCount == GET_JOBS_TRANSITION_INTERVAL + 1) {
+ job.job.getStatus().setState("RUNNING");
+ } else if (job.getJobCount == 2 * GET_JOBS_TRANSITION_INTERVAL + 1) {
+ job.job.setStatus(runJob(job.job));
+ }
+ } catch (Exception e) {
+ job.job.getStatus().setState("FAILED").setErrorResult(new ErrorProto());
}
return JSON_FACTORY.fromString(JSON_FACTORY.toString(job.job), Job.class);
}
@@ -187,41 +241,50 @@ class FakeJobService implements JobService, Serializable {
}
}
- private void runJob(Job job) throws InterruptedException, IOException {
+ private JobStatus runJob(Job job) throws InterruptedException, IOException {
if (job.getConfiguration().getLoad() != null) {
- runLoadJob(job.getConfiguration().getLoad());
+ return runLoadJob(job.getJobReference(), job.getConfiguration().getLoad());
} else if (job.getConfiguration().getCopy() != null) {
- runCopyJob(job.getConfiguration().getCopy());
+ return runCopyJob(job.getConfiguration().getCopy());
} else if (job.getConfiguration().getExtract() != null) {
- runExtractJob(job, job.getConfiguration().getExtract());
+ return runExtractJob(job, job.getConfiguration().getExtract());
+ } else if (job.getConfiguration().getQuery() != null) {
+ return runQueryJob(job.getConfiguration().getQuery());
}
+ return new JobStatus().setState("DONE");
}
- private void validateDispositions(Table table, CreateDisposition createDisposition,
- WriteDisposition writeDisposition)
+ private boolean validateDispositions(Table table, CreateDisposition createDisposition,
+ WriteDisposition writeDisposition)
throws InterruptedException, IOException {
if (table == null) {
- checkState(createDisposition != CreateDisposition.CREATE_NEVER,
- "CreateDisposition == CREATE_NEVER but the table doesn't exist.");
+ if (createDisposition == CreateDisposition.CREATE_NEVER) {
+ return false;
+ }
} else if (writeDisposition == WriteDisposition.WRITE_TRUNCATE) {
datasetService.deleteTable(table.getTableReference());
} else if (writeDisposition == WriteDisposition.WRITE_EMPTY) {
List<TableRow> allRows = datasetService.getAllRows(table.getTableReference().getProjectId(),
table.getTableReference().getDatasetId(), table.getTableReference().getTableId());
- checkState(allRows.isEmpty(), "Write disposition was set to WRITE_EMPTY,"
- + " but the table was not empty.");
+ if (!allRows.isEmpty()) {
+ return false;
+ }
}
+ return true;
}
- private void runLoadJob(JobConfigurationLoad load)
+
+ private JobStatus runLoadJob(JobReference jobRef, JobConfigurationLoad load)
throws InterruptedException, IOException {
TableReference destination = load.getDestinationTable();
TableSchema schema = load.getSchema();
- List<String> sourceFiles = load.getSourceUris();
+ List<String> sourceFiles = filesForLoadJobs.get(jobRef.getProjectId(), jobRef.getJobId());
WriteDisposition writeDisposition = WriteDisposition.valueOf(load.getWriteDisposition());
CreateDisposition createDisposition = CreateDisposition.valueOf(load.getCreateDisposition());
checkArgument(load.getSourceFormat().equals("NEWLINE_DELIMITED_JSON"));
Table existingTable = datasetService.getTable(destination);
- validateDispositions(existingTable, createDisposition, writeDisposition);
+ if (!validateDispositions(existingTable, createDisposition, writeDisposition)) {
+ return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
+ }
datasetService.createTable(new Table().setTableReference(destination).setSchema(schema));
@@ -230,31 +293,52 @@ class FakeJobService implements JobService, Serializable {
rows.addAll(readRows(filename));
}
datasetService.insertAll(destination, rows, null);
+ return new JobStatus().setState("DONE");
}
- private void runCopyJob(JobConfigurationTableCopy copy)
+ private JobStatus runCopyJob(JobConfigurationTableCopy copy)
throws InterruptedException, IOException {
List<TableReference> sources = copy.getSourceTables();
TableReference destination = copy.getDestinationTable();
WriteDisposition writeDisposition = WriteDisposition.valueOf(copy.getWriteDisposition());
CreateDisposition createDisposition = CreateDisposition.valueOf(copy.getCreateDisposition());
Table existingTable = datasetService.getTable(destination);
- validateDispositions(existingTable, createDisposition, writeDisposition);
+ if (!validateDispositions(existingTable, createDisposition, writeDisposition)) {
+ return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
+ }
List<TableRow> allRows = Lists.newArrayList();
for (TableReference source : sources) {
allRows.addAll(datasetService.getAllRows(
source.getProjectId(), source.getDatasetId(), source.getTableId()));
}
+ datasetService.createTable(new Table().setTableReference(destination));
datasetService.insertAll(destination, allRows, null);
+ return new JobStatus().setState("DONE");
}
- private void runExtractJob(Job job, JobConfigurationExtract extract) {
+ private JobStatus runExtractJob(Job job, JobConfigurationExtract extract)
+ throws InterruptedException, IOException {
TableReference sourceTable = extract.getSourceTable();
- extract.getDestinationUris().get(0);
- List<Long> destinationFileCounts = Lists.newArrayList(0L);
+
+ List<TableRow> rows = datasetService.getAllRows(
+ sourceTable.getProjectId(), sourceTable.getDatasetId(), sourceTable.getTableId());
+ TableSchema schema = datasetService.getTable(sourceTable).getSchema();
+ List<Long> destinationFileCounts = Lists.newArrayList();
+ for (String destination : extract.getDestinationUris()) {
+ destinationFileCounts.add(writeRows(sourceTable.getTableId(), rows, schema, destination));
+ }
job.setStatistics(new JobStatistics().setExtract(
new JobStatistics4().setDestinationUriFileCounts(destinationFileCounts)));
+ return new JobStatus().setState("DONE");
+ }
+
+ private JobStatus runQueryJob(JobConfigurationQuery query)
+ throws IOException, InterruptedException {
+ List<TableRow> rows = FakeBigQueryServices.rowsFromEncodedQuery(query.getQuery());
+ datasetService.createTable(new Table().setTableReference(query.getDestinationTable()));
+ datasetService.insertAll(query.getDestinationTable(), rows, null);
+ return new JobStatus().setState("DONE");
}
private List<TableRow> readRows(String filename) throws IOException {
@@ -270,4 +354,42 @@ class FakeJobService implements JobService, Serializable {
}
return tableRows;
}
+
+ private long writeRows(String tableId, List<TableRow> rows, TableSchema schema,
+ String destinationPattern) throws IOException {
+ Schema avroSchema = BigQueryAvroUtils.toGenericAvroSchema(tableId, schema.getFields());
+ List<TableRow> rowsToWrite = Lists.newArrayList();
+ int shard = 0;
+ for (int i = 0; i < rows.size(); ++i) {
+ rowsToWrite.add(rows.get(i));
+ if (rowsToWrite.size() == 5) {
+ writeRowsHelper(rowsToWrite, avroSchema, destinationPattern, shard++);
+ rowsToWrite.clear();
+ }
+ }
+ if (!rowsToWrite.isEmpty()) {
+ writeRowsHelper(rowsToWrite, avroSchema, destinationPattern, shard++);
+ }
+ return shard;
+ }
+
+ private void writeRowsHelper(List<TableRow> rows, Schema avroSchema,
+ String destinationPattern, int shard) throws IOException {
+ String filename = destinationPattern.replace("*", String.format("%012d", shard));
+ try (WritableByteChannel channel = IOChannelUtils.create(filename, MimeTypes.BINARY);
+ DataFileWriter<GenericRecord> tableRowWriter =
+ new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(avroSchema))
+ .create(avroSchema, Channels.newOutputStream(channel))) {
+ for (Map<String, Object> record : rows) {
+ GenericRecordBuilder genericRecordBuilder = new GenericRecordBuilder(avroSchema);
+ for (Map.Entry<String, Object> field : record.entrySet()) {
+ genericRecordBuilder.set(field.getKey(), field.getValue());
+ }
+ tableRowWriter.append(genericRecordBuilder.build());
+ }
+ } catch (IOException e) {
+ throw new IllegalStateException(
+ String.format("Could not create destination for extract job %s", filename), e);
+ }
+ }
}
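The extract path above writes table rows as Avro, flushing in batches of five rows and substituting the shard index into the '*' placeholder of the destination pattern. A self-contained sketch of that naming and batching arithmetic (the bucket path is illustrative only):

public class ShardNamingSketch {
  public static void main(String[] args) {
    String destinationPattern = "gs://bucket/extract/part-*.avro"; // hypothetical path
    int totalRows = 12;
    int batchSize = 5; // writeRows() flushes every 5 rows
    int shards = (totalRows + batchSize - 1) / batchSize; // ceil(12 / 5) = 3 files
    for (int shard = 0; shard < shards; shard++) {
      // Mirrors writeRowsHelper(): '*' becomes a zero-padded 12-digit shard index.
      System.out.println(destinationPattern.replace("*", String.format("%012d", shard)));
    }
  }
}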
http://git-wip-us.apache.org/repos/asf/beam/blob/b486137d/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/TableContainer.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/TableContainer.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/TableContainer.java
index b2fc170..d52723b 100644
--- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/TableContainer.java
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/TableContainer.java
@@ -1,3 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.beam.sdk.io.gcp.bigquery;
import com.google.api.services.bigquery.model.Table;
@@ -7,23 +24,31 @@ import java.util.ArrayList;
import java.util.List;
/**
- * Created by relax on 3/30/17.
+ * Encapsulates a BigQuery table and its contents.
*/
class TableContainer {
Table table;
List<TableRow> rows;
List<String> ids;
-
+ Long sizeBytes;
TableContainer(Table table) {
this.table = table;
this.rows = new ArrayList<>();
this.ids = new ArrayList<>();
+ this.sizeBytes = 0L;
}
- TableContainer addRow(TableRow row, String id) {
+ long addRow(TableRow row, String id) {
rows.add(row);
ids.add(id);
- return this;
+ long rowSize = row.toString().length();
+ Long tableSize = table.getNumBytes();
+ if (tableSize == null) {
+ table.setNumBytes(rowSize);
+ } else {
+ table.setNumBytes(tableSize + rowSize);
+ }
+ return rowSize;
}
Table getTable() {
[16/50] [abbrv] beam git commit: Refactor streaming write branch into
separate reusable components.
Posted by dh...@apache.org.
Refactor streaming write branch into separate reusable components.
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/58ed5c7e
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/58ed5c7e
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/58ed5c7e
Branch: refs/heads/DSL_SQL
Commit: 58ed5c7ecd247f9c5e5a15deff40ffa8c800af25
Parents: 67a5f82
Author: Reuven Lax <re...@google.com>
Authored: Tue Mar 28 19:34:56 2017 -0700
Committer: Eugene Kirpichov <ki...@google.com>
Committed: Tue Apr 18 21:12:50 2017 -0700
----------------------------------------------------------------------
.../beam/sdk/io/gcp/bigquery/BigQueryIO.java | 69 ++++++------
.../beam/sdk/io/gcp/bigquery/CreateTables.java | 100 +++++++++++++++++
.../io/gcp/bigquery/GenerateShardedTable.java | 48 ++++++++
.../beam/sdk/io/gcp/bigquery/PrepareWrite.java | 65 ++++++-----
.../sdk/io/gcp/bigquery/StreamWithDeDup.java | 90 ---------------
.../sdk/io/gcp/bigquery/StreamingInserts.java | 110 +++++++++++++++++++
.../sdk/io/gcp/bigquery/StreamingWriteFn.java | 82 +-------------
.../sdk/io/gcp/bigquery/TableDestination.java | 48 +++++++-
.../io/gcp/bigquery/TableDestinationCoder.java | 64 +++++++++++
.../sdk/io/gcp/bigquery/TagWithUniqueIds.java | 71 ++++++++++++
.../gcp/bigquery/TagWithUniqueIdsAndTable.java | 101 -----------------
.../sdk/io/gcp/bigquery/BigQueryIOTest.java | 18 +--
12 files changed, 521 insertions(+), 345 deletions(-)
----------------------------------------------------------------------
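
The central signature change in this refactor is that table routing now returns a TableDestination instead of a raw table-spec string. A hedged sketch of the new routing-function shape, modeled on the test changes further down (the element type and table-naming scheme are illustrative only):

    package org.apache.beam.sdk.io.gcp.bigquery;  // sketch placed alongside the classes below

    import com.google.api.services.bigquery.model.TableRow;
    import org.apache.beam.sdk.transforms.SerializableFunction;
    import org.apache.beam.sdk.values.ValueInSingleWindow;

    class PerWindowTableRoutingSketch {
      static SerializableFunction<ValueInSingleWindow<TableRow>, TableDestination> tableFunction() {
        return new SerializableFunction<ValueInSingleWindow<TableRow>, TableDestination>() {
          @Override
          public TableDestination apply(ValueInSingleWindow<TableRow> input) {
            // Route each element to a table derived from its window; empty table description.
            long windowEnd = input.getWindow().maxTimestamp().getMillis();
            return new TableDestination("project-id:dataset-id.table-" + windowEnd, "");
          }
        };
      }
    }
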
http://git-wip-us.apache.org/repos/asf/beam/blob/58ed5c7e/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
index af0d561..af19b83 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
@@ -40,6 +40,7 @@ import java.util.regex.Pattern;
import javax.annotation.Nullable;
import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.TableRowJsonCoder;
import org.apache.beam.sdk.coders.VoidCoder;
import org.apache.beam.sdk.io.BoundedSource;
@@ -60,6 +61,7 @@ import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
import org.apache.beam.sdk.runners.PipelineRunner;
import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.util.GcsUtil.GcsUtilFactory;
@@ -67,6 +69,7 @@ import org.apache.beam.sdk.util.IOChannelFactory;
import org.apache.beam.sdk.util.IOChannelUtils;
import org.apache.beam.sdk.util.Transport;
import org.apache.beam.sdk.util.gcsfs.GcsPath;
+import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollection.IsBounded;
@@ -681,8 +684,8 @@ public class BigQueryIO {
static final int LOAD_JOB_POLL_MAX_RETRIES = Integer.MAX_VALUE;
@Nullable abstract ValueProvider<String> getJsonTableRef();
- @Nullable abstract SerializableFunction<ValueInSingleWindow<T>, TableReference>
- getTableRefFunction();
+ @Nullable abstract SerializableFunction<ValueInSingleWindow<T>, TableDestination>
+ getTableFunction();
@Nullable abstract SerializableFunction<T, TableRow> getFormatFunction();
/** Table schema. The schema is required only if the table does not exist. */
@Nullable abstract ValueProvider<String> getJsonSchema();
@@ -783,7 +786,7 @@ public class BigQueryIO {
private void ensureToNotCalledYet() {
checkState(
getJsonTableRef() == null && getTable() == null
- && getTableRefFunction() == null, "to() already called");
+ && getTableFunction() == null, "to() already called");
}
/**
@@ -802,13 +805,16 @@ public class BigQueryIO {
/** Same as {@link #to(String)}, but with a {@link ValueProvider}. */
public Write<T> to(ValueProvider<String> tableSpec) {
ensureToNotCalledYet();
+ String tableDescription = getTableDescription();
+ if (tableDescription == null) {
+ tableDescription = "";
+ }
return toBuilder()
.setJsonTableRef(
NestedValueProvider.of(
NestedValueProvider.of(tableSpec, new TableSpecToTableRef()),
new TableRefToJson()))
- .setTableRefFunction(new TranslateTableSpecFunction<T>(
- new ConstantTableSpecFunction<T>(tableSpec)))
+ .setTableFunction(new ConstantTableFunction<T>(tableSpec, tableDescription))
.build();
}
@@ -819,6 +825,8 @@ public class BigQueryIO {
public Write<T> to(
SerializableFunction<ValueInSingleWindow<T>, String> tableSpecFunction) {
return toTableReference(new TranslateTableSpecFunction<T>(tableSpecFunction));
+ ensureToNotCalledYet();
+ return toBuilder().setTableFunction(tableFunction).build();
}
/**
@@ -828,7 +836,7 @@ public class BigQueryIO {
private Write<T> toTableReference(
SerializableFunction<ValueInSingleWindow<T>, TableReference> tableRefFunction) {
ensureToNotCalledYet();
- return toBuilder().setTableRefFunction(tableRefFunction).build();
+ return toBuilder().setTableFunction(tableFunction).build();
}
/**
@@ -838,32 +846,19 @@ public class BigQueryIO {
return toBuilder().setFormatFunction(formatFunction).build();
}
- private static class TranslateTableSpecFunction<T> implements
- SerializableFunction<ValueInSingleWindow<T>, TableReference> {
- private SerializableFunction<ValueInSingleWindow<T>, String> tableSpecFunction;
-
- TranslateTableSpecFunction(
- SerializableFunction<ValueInSingleWindow<T>, String> tableSpecFunction) {
- this.tableSpecFunction = tableSpecFunction;
- }
+ static class ConstantTableFunction<T> implements
+ SerializableFunction<ValueInSingleWindow<T>, TableDestination> {
+ private final ValueProvider<String> tableSpec;
+ private final String tableDescription;
- @Override
- public TableReference apply(ValueInSingleWindow<T> value) {
- return BigQueryHelpers.parseTableSpec(tableSpecFunction.apply(value));
- }
- }
-
- static class ConstantTableSpecFunction<T> implements
- SerializableFunction<ValueInSingleWindow<T>, String> {
- private ValueProvider<String> tableSpec;
-
- ConstantTableSpecFunction(ValueProvider<String> tableSpec) {
+ ConstantTableFunction(ValueProvider<String> tableSpec, String tableDescription) {
this.tableSpec = tableSpec;
+ this.tableDescription = tableDescription;
}
@Override
- public String apply(ValueInSingleWindow<T> value) {
- return tableSpec.get();
+ public TableDestination apply(ValueInSingleWindow<T> value) {
+ return new TableDestination(tableSpec.get(), tableDescription);
}
}
@@ -919,7 +914,7 @@ public class BigQueryIO {
BigQueryOptions options = input.getPipeline().getOptions().as(BigQueryOptions.class);
// Exactly one of the table and table reference can be configured.
- checkState(getTableRefFunction() != null,
+ checkState(getTableFunction() != null,
"must set the table reference of a BigQueryIO.Write transform");
checkArgument(getFormatFunction() != null,
@@ -978,10 +973,16 @@ public class BigQueryIO {
@Override
public WriteResult expand(PCollection<T> input) {
+ PCollection<KV<TableDestination, TableRow>> rowsWithDestination =
+ input.apply("PrepareWrite", ParDo.of(
+ new PrepareWrite<T>(getTableFunction(), getFormatFunction())))
+ .setCoder(KvCoder.of(TableDestinationCoder.of(), TableRowJsonCoder.of()));
+
+
// When writing an Unbounded PCollection, or when a tablespec function is defined, we use
- // StreamWithDeDup and BigQuery's streaming import API.
+ // StreamingInserts and BigQuery's streaming import API.
if (input.isBounded() == IsBounded.UNBOUNDED) {
- return input.apply(new StreamWithDeDup<T>(this));
+ return rowsWithDestination.apply(new StreamingInserts(this));
} else {
return input.apply(new BatchLoadBigQuery<T>(this));
}
@@ -1002,8 +1003,8 @@ public class BigQueryIO {
.addIfNotNull(DisplayData.item("schema", getJsonSchema())
.withLabel("Table Schema"));
- if (getTableRefFunction() != null) {
- builder.add(DisplayData.item("tableFn", getTableRefFunction().getClass())
+ if (getTableFunction() != null) {
+ builder.add(DisplayData.item("tableFn", getTableFunction().getClass())
.withLabel("Table Reference Function"));
}
@@ -1025,7 +1026,7 @@ public class BigQueryIO {
}
/**
- * Returns the table to write, or {@code null} if writing with {@code tableRefFunction}.
+ * Returns the table to write, or {@code null} if writing with {@code tableFunction}.
*
* <p>If the table's project is not specified, use the executing project.
*/
@@ -1066,7 +1067,7 @@ public class BigQueryIO {
*/
@VisibleForTesting
static void clearCreatedTables() {
- StreamingWriteFn.clearCreatedTables();
+ CreateTables.clearCreatedTables();
}
/////////////////////////////////////////////////////////////////////////////
http://git-wip-us.apache.org/repos/asf/beam/blob/58ed5c7e/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/CreateTables.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/CreateTables.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/CreateTables.java
new file mode 100644
index 0000000..e216553
--- /dev/null
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/CreateTables.java
@@ -0,0 +1,100 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.beam.sdk.io.gcp.bigquery;
+
+import com.google.api.services.bigquery.model.Table;
+import com.google.api.services.bigquery.model.TableReference;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.TableSchema;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService;
+import org.apache.beam.sdk.options.BigQueryOptions;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.SerializableFunction;
+import org.apache.beam.sdk.values.KV;
+
+
+/**
+ * Creates any tables needed before performing streaming writes to the tables. This is a
+ * side-effect {@link DoFn}, and returns the original collection unchanged.
+ */
+public class CreateTables extends DoFn<KV<TableDestination, TableRow>,
+ KV<TableDestination, TableRow>> {
+ private final CreateDisposition createDisposition;
+ private final BigQueryServices bqServices;
+ private final SerializableFunction<TableDestination, TableSchema> schemaFunction;
+
+
+ /** The set of tables created so far, so we don't try the creation each time.
+ * TODO: We should put a bound on memory usage of this. Use a Guava cache instead.
+ */
+ private static Set<String> createdTables =
+ Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
+
+ public CreateTables(CreateDisposition createDisposition, BigQueryServices bqServices,
+ SerializableFunction<TableDestination, TableSchema> schemaFunction) {
+ this.createDisposition = createDisposition;
+ this.bqServices = bqServices;
+ this.schemaFunction = schemaFunction;
+ }
+
+ @ProcessElement
+ public void processElement(ProcessContext context) throws InterruptedException, IOException {
+ BigQueryOptions options = context.getPipelineOptions().as(BigQueryOptions.class);
+ possibleCreateTable(options, context.element().getKey());
+ context.output(context.element());
+ }
+
+ private void possibleCreateTable(BigQueryOptions options, TableDestination tableDestination)
+ throws InterruptedException, IOException {
+ String tableSpec = tableDestination.getTableSpec();
+ TableReference tableReference = tableDestination.getTableReference();
+ String tableDescription = tableDestination.getTableDescription();
+ if (createDisposition != CreateDisposition.CREATE_NEVER
+ && !createdTables.contains(tableSpec)) {
+ synchronized (createdTables) {
+ // Another thread may have succeeded in creating the table in the meanwhile, so
+ // check again. This check isn't needed for correctness, but we add it to prevent
+ // every thread from attempting a create and overwhelming our BigQuery quota.
+ DatasetService datasetService = bqServices.getDatasetService(options);
+ if (!createdTables.contains(tableSpec)) {
+ TableSchema tableSchema = schemaFunction.apply(tableDestination);
+ if (datasetService.getTable(tableReference) == null) {
+ datasetService.createTable(
+ new Table()
+ .setTableReference(tableReference)
+ .setSchema(tableSchema)
+ .setDescription(tableDescription));
+ }
+ createdTables.add(tableSpec);
+ }
+ }
+ }
+ }
+
+ static void clearCreatedTables() {
+ synchronized (createdTables) {
+ createdTables.clear();
+ }
+ }
+}
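
A sketch of how CreateTables is meant to be wired in, mirroring its use in StreamingInserts below. The constant schema function is a stand-in, since per-table schemas are not supported at this point in the series; the field layout and names are placeholders:

    package org.apache.beam.sdk.io.gcp.bigquery;  // sketch placed alongside the classes above

    import com.google.api.services.bigquery.model.TableFieldSchema;
    import com.google.api.services.bigquery.model.TableRow;
    import com.google.api.services.bigquery.model.TableSchema;
    import com.google.common.collect.ImmutableList;
    import org.apache.beam.sdk.coders.KvCoder;
    import org.apache.beam.sdk.coders.TableRowJsonCoder;
    import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
    import org.apache.beam.sdk.transforms.ParDo;
    import org.apache.beam.sdk.transforms.SerializableFunction;
    import org.apache.beam.sdk.values.KV;
    import org.apache.beam.sdk.values.PCollection;

    class CreateTablesUsageSketch {
      static PCollection<KV<TableDestination, TableRow>> ensureTables(
          PCollection<KV<TableDestination, TableRow>> rows, BigQueryServices bqServices) {
        // One constant schema for every destination; a single STRING field as an example.
        SerializableFunction<TableDestination, TableSchema> schemaFunction =
            new SerializableFunction<TableDestination, TableSchema>() {
              @Override
              public TableSchema apply(TableDestination destination) {
                return new TableSchema().setFields(ImmutableList.<TableFieldSchema>of(
                    new TableFieldSchema().setName("name").setType("STRING")));
              }
            };
        return rows
            .apply("CreateTables", ParDo.of(
                new CreateTables(CreateDisposition.CREATE_IF_NEEDED, bqServices, schemaFunction)))
            .setCoder(KvCoder.of(TableDestinationCoder.of(), TableRowJsonCoder.of()));
      }
    }
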
http://git-wip-us.apache.org/repos/asf/beam/blob/58ed5c7e/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/GenerateShardedTable.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/GenerateShardedTable.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/GenerateShardedTable.java
new file mode 100644
index 0000000..da3a70a
--- /dev/null
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/GenerateShardedTable.java
@@ -0,0 +1,48 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.beam.sdk.io.gcp.bigquery;
+
+import com.google.api.services.bigquery.model.TableRow;
+import java.io.IOException;
+import java.util.concurrent.ThreadLocalRandom;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.values.KV;
+
+/**
+ * Given a write to a specific table, assigns it to one of the
+ * {@code numShards} random keys generated for that table.
+ */
+class GenerateShardedTable extends DoFn<KV<TableDestination, TableRow>,
+ KV<ShardedKey<String>, TableRow>> {
+ private final int numShards;
+
+ GenerateShardedTable(int numShards) {
+ this.numShards = numShards;
+ }
+
+ @ProcessElement
+ public void processElement(ProcessContext context, BoundedWindow window) throws IOException {
+ ThreadLocalRandom randomGenerator = ThreadLocalRandom.current();
+ // We output on keys 0 to numShards - 1 to ensure that there's enough batching for
+ // BigQuery.
+ String tableSpec = context.element().getKey().getTableSpec();
+ context.output(KV.of(ShardedKey.of(tableSpec, randomGenerator.nextInt(0, numShards)),
+ context.element().getValue()));
+ }
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/58ed5c7e/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/PrepareWrite.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/PrepareWrite.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/PrepareWrite.java
index 0c08e18..7712417 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/PrepareWrite.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/PrepareWrite.java
@@ -1,3 +1,20 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
package org.apache.beam.sdk.io.gcp.bigquery;
import com.google.api.services.bigquery.model.TableReference;
@@ -6,8 +23,6 @@ import com.google.common.base.Strings;
import java.io.IOException;
import org.apache.beam.sdk.options.BigQueryOptions;
import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.values.KV;
@@ -15,44 +30,38 @@ import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.ValueInSingleWindow;
/**
- * Prepare an input {@link PCollection<T>} for writing to BigQuery. Use the table-reference
+ * Prepare an input {@link PCollection} for writing to BigQuery. Use the table-reference
* function to determine which tables each element is written to, and format the element into a
* {@link TableRow} using the user-supplied format function.
*/
-public class PrepareWrite<T> extends PTransform<PCollection<T>, PCollection<KV<String, TableRow>>> {
- private static final String NAME = "PrepareWrite";
- private SerializableFunction<ValueInSingleWindow<T>, TableReference> tableRefFunction;
+public class PrepareWrite<T> extends DoFn<T, KV<TableDestination, TableRow>> {
+ private SerializableFunction<ValueInSingleWindow<T>, TableDestination> tableFunction;
private SerializableFunction<T, TableRow> formatFunction;
- public PrepareWrite(SerializableFunction<ValueInSingleWindow<T>, TableReference> tableRefFunction,
+ public PrepareWrite(SerializableFunction<ValueInSingleWindow<T>, TableDestination> tableFunction,
SerializableFunction<T, TableRow> formatFunction) {
- super(NAME);
- this.tableRefFunction = tableRefFunction;
+ this.tableFunction = tableFunction;
this.formatFunction = formatFunction;
}
- @Override
- public PCollection<KV<String, TableRow>> expand(PCollection<T> input) {
- PCollection<KV<String, TableRow>> elementsByTable =
- input.apply(ParDo.of(new DoFn<T, KV<String, TableRow>>() {
- @ProcessElement
- public void processElement(ProcessContext context, BoundedWindow window) throws IOException {
- String tableSpec = tableSpecFromWindowedValue(
- context.getPipelineOptions().as(BigQueryOptions.class),
- ValueInSingleWindow.of(context.element(), context.timestamp(), window, context.pane()));
- TableRow tableRow = formatFunction.apply(context.element());
- context.output(KV.of(tableSpec, tableRow));
- }
- }));
- return elementsByTable;
+ @ProcessElement
+ public void processElement(ProcessContext context, BoundedWindow window) throws IOException {
+ TableDestination tableDestination = tableSpecFromWindowedValue(
+ context.getPipelineOptions().as(BigQueryOptions.class),
+ ValueInSingleWindow.of(context.element(), context.timestamp(), window, context.pane()));
+ TableRow tableRow = formatFunction.apply(context.element());
+ context.output(KV.of(tableDestination, tableRow));
}
- private String tableSpecFromWindowedValue(BigQueryOptions options,
+ private TableDestination tableSpecFromWindowedValue(BigQueryOptions options,
ValueInSingleWindow<T> value) {
- TableReference table = tableRefFunction.apply(value);
- if (Strings.isNullOrEmpty(table.getProjectId())) {
- table.setProjectId(options.getProject());
+ TableDestination tableDestination = tableFunction.apply(value);
+ TableReference tableReference = tableDestination.getTableReference();
+ if (Strings.isNullOrEmpty(tableReference.getProjectId())) {
+ tableReference.setProjectId(options.getProject());
+ tableDestination = new TableDestination(tableReference,
+ tableDestination.getTableDescription());
}
- return BigQueryHelpers.toTableSpec(table);
+ return tableDestination;
}
}
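
A minimal sketch of the default-project behavior in tableSpecFromWindowedValue above: when the routed TableReference carries no project id, the destination is rebuilt with the pipeline's project (the project id below is a placeholder for what BigQueryOptions would supply):

    package org.apache.beam.sdk.io.gcp.bigquery;  // sketch placed alongside the classes above

    import com.google.api.services.bigquery.model.TableReference;
    import com.google.common.base.Strings;

    class DefaultProjectSketch {
      public static void main(String[] args) {
        TableDestination destination = new TableDestination("dataset-id.table-id", "");
        TableReference ref = destination.getTableReference();  // projectId is null here
        if (Strings.isNullOrEmpty(ref.getProjectId())) {
          ref.setProjectId("my-project");  // a real pipeline reads this from BigQueryOptions
          destination = new TableDestination(ref, destination.getTableDescription());
        }
        System.out.println(destination.getTableSpec());  // my-project:dataset-id.table-id
      }
    }
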
http://git-wip-us.apache.org/repos/asf/beam/blob/58ed5c7e/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamWithDeDup.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamWithDeDup.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamWithDeDup.java
deleted file mode 100644
index 506a564..0000000
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamWithDeDup.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.beam.sdk.io.gcp.bigquery;
-
-import com.google.api.services.bigquery.model.TableSchema;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.coders.KvCoder;
-import org.apache.beam.sdk.coders.StringUtf8Coder;
-import org.apache.beam.sdk.coders.VoidCoder;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write;
-import org.apache.beam.sdk.options.BigQueryOptions;
-import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.util.Reshuffle;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollection;
-
-/**
-* PTransform that performs streaming BigQuery write. To increase consistency,
-* it leverages BigQuery best effort de-dup mechanism.
- */
-class StreamWithDeDup<T> extends PTransform<PCollection<T>, WriteResult> {
- private final Write<T> write;
-
- /** Constructor. */
- StreamWithDeDup(Write<T> write) {
- this.write = write;
- }
-
- @Override
- protected Coder<Void> getDefaultOutputCoder() {
- return VoidCoder.of();
- }
-
- @Override
- public WriteResult expand(PCollection<T> input) {
- // A naive implementation would be to simply stream data directly to BigQuery.
- // However, this could occasionally lead to duplicated data, e.g., when
- // a VM that runs this code is restarted and the code is re-run.
-
- // The above risk is mitigated in this implementation by relying on
- // BigQuery built-in best effort de-dup mechanism.
-
- // To use this mechanism, each input TableRow is tagged with a generated
- // unique id, which is then passed to BigQuery and used to ignore duplicates.
-
- PCollection<KV<ShardedKey<String>, TableRowInfo>> tagged =
- input.apply(ParDo.of(new TagWithUniqueIdsAndTable<T>(
- input.getPipeline().getOptions().as(BigQueryOptions.class), write)));
-
- // To prevent having the same TableRow processed more than once with regenerated
- // different unique ids, this implementation relies on "checkpointing", which is
- // achieved as a side effect of having StreamingWriteFn immediately follow a GBK,
- // performed by Reshuffle.
- NestedValueProvider<TableSchema, String> schema =
- write.getJsonSchema() == null
- ? null
- : NestedValueProvider.of(write.getJsonSchema(), new JsonSchemaToTableSchema());
- tagged
- .setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowInfoCoder.of()))
- .apply(Reshuffle.<ShardedKey<String>, TableRowInfo>of())
- .apply(
- ParDo.of(
- new StreamingWriteFn(
- schema,
- write.getCreateDisposition(),
- write.getTableDescription(),
- write.getBigQueryServices())));
-
- return WriteResult.in(input.getPipeline());
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/58ed5c7e/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingInserts.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingInserts.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingInserts.java
new file mode 100644
index 0000000..37afbdf
--- /dev/null
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingInserts.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.beam.sdk.io.gcp.bigquery;
+
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.TableSchema;
+import javax.annotation.Nullable;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.KvCoder;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.coders.TableRowJsonCoder;
+import org.apache.beam.sdk.coders.VoidCoder;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.SerializableFunction;
+import org.apache.beam.sdk.util.Reshuffle;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+
+/**
+ * PTransform that performs a streaming BigQuery write. To increase consistency,
+ * it leverages BigQuery's best-effort de-duplication mechanism.
+ */
+
+class StreamingInserts
+ extends PTransform<PCollection<KV<TableDestination, TableRow>>, WriteResult> {
+ private final Write<?> write;
+
+ private static class ConstantSchemaFunction implements
+ SerializableFunction<TableDestination, TableSchema> {
+ private final @Nullable String jsonSchema;
+
+ ConstantSchemaFunction(TableSchema schema) {
+ this.jsonSchema = BigQueryHelpers.toJsonString(schema);
+ }
+
+ @Override
+ @Nullable
+ public TableSchema apply(TableDestination table) {
+ return BigQueryHelpers.fromJsonString(jsonSchema, TableSchema.class);
+ }
+ }
+
+ /** Constructor. */
+ StreamingInserts(Write<?> write) {
+ this.write = write;
+ }
+
+ @Override
+ protected Coder<Void> getDefaultOutputCoder() {
+ return VoidCoder.of();
+ }
+
+ @Override
+ public WriteResult expand(PCollection<KV<TableDestination, TableRow>> input) {
+ // Since BigQueryIO.java does not yet have support for per-table schemas, inject a constant
+ // schema function here. If no schema is specified, this function will return null.
+ SerializableFunction<TableDestination, TableSchema> schemaFunction =
+ new ConstantSchemaFunction(write.getSchema());
+
+ // A naive implementation would be to simply stream data directly to BigQuery.
+ // However, this could occasionally lead to duplicated data, e.g., when
+ // a VM that runs this code is restarted and the code is re-run.
+
+ // The above risk is mitigated in this implementation by relying on
+ // BigQuery built-in best effort de-dup mechanism.
+
+ // To use this mechanism, each input TableRow is tagged with a generated
+ // unique id, which is then passed to BigQuery and used to ignore duplicates.
+ PCollection<KV<ShardedKey<String>, TableRowInfo>> tagged = input
+ .apply("CreateTables", ParDo.of(new CreateTables(write.getCreateDisposition(),
+ write.getBigQueryServices(), schemaFunction)))
+ // We create 50 keys per BigQuery table to generate output on. This is few enough that we
+ // get good batching into BigQuery's insert calls, and enough that we can max out the
+ // streaming insert quota.
+ .apply("ShardTableWrites", ParDo.of(new GenerateShardedTable(50)))
+ .setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowJsonCoder.of()))
+ .apply("TagWithUniqueIds", ParDo.of(new TagWithUniqueIds()));
+
+ // To prevent having the same TableRow processed more than once with regenerated
+ // different unique ids, this implementation relies on "checkpointing", which is
+ // achieved as a side effect of having StreamingWriteFn immediately follow a GBK,
+ // performed by Reshuffle.
+ tagged
+ .setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowInfoCoder.of()))
+ .apply(Reshuffle.<ShardedKey<String>, TableRowInfo>of())
+ .apply("StreamingWrite",
+ ParDo.of(
+ new StreamingWriteFn(write.getBigQueryServices())));
+
+ return WriteResult.in(input.getPipeline());
+ }
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/58ed5c7e/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingWriteFn.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingWriteFn.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingWriteFn.java
index 1d93fa3..83ed3d2 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingWriteFn.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingWriteFn.java
@@ -18,28 +18,16 @@
package org.apache.beam.sdk.io.gcp.bigquery;
-import static com.google.common.base.Preconditions.checkNotNull;
-import com.google.api.services.bigquery.model.Table;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
-import com.google.api.services.bigquery.model.TableSchema;
import com.google.common.annotations.VisibleForTesting;
import java.io.IOException;
-import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import java.util.Set;
-import java.util.concurrent.ConcurrentHashMap;
-import javax.annotation.Nullable;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.TableSchemaToJsonSchema;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService;
import org.apache.beam.sdk.metrics.Counter;
import org.apache.beam.sdk.metrics.Metrics;
import org.apache.beam.sdk.options.BigQueryOptions;
-import org.apache.beam.sdk.options.ValueProvider;
-import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.util.SystemDoFnInternal;
@@ -52,48 +40,19 @@ import org.apache.beam.sdk.values.KV;
@VisibleForTesting
class StreamingWriteFn
extends DoFn<KV<ShardedKey<String>, TableRowInfo>, Void> {
- /** TableSchema in JSON. Use String to make the class Serializable. */
- @Nullable
- private final ValueProvider<String> jsonTableSchema;
-
- @Nullable private final String tableDescription;
-
private final BigQueryServices bqServices;
/** JsonTableRows to accumulate BigQuery rows in order to batch writes. */
private transient Map<String, List<TableRow>> tableRows;
- private final Write.CreateDisposition createDisposition;
-
/** The list of unique ids for each BigQuery table row. */
private transient Map<String, List<String>> uniqueIdsForTableRows;
- /** The list of tables created so far, so we don't try the creation
- each time. */
- private static Set<String> createdTables =
- Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
-
/** Tracks bytes written, exposed as "ByteCount" Counter. */
private Counter byteCounter = Metrics.counter(StreamingWriteFn.class, "ByteCount");
- /** Constructor. */
- StreamingWriteFn(@Nullable ValueProvider<TableSchema> schema,
- Write.CreateDisposition createDisposition,
- @Nullable String tableDescription, BigQueryServices bqServices) {
- this.jsonTableSchema = schema == null ? null :
- NestedValueProvider.of(schema, new TableSchemaToJsonSchema());
- this.createDisposition = createDisposition;
- this.bqServices = checkNotNull(bqServices, "bqServices");
- this.tableDescription = tableDescription;
- }
-
- /**
- * Clear the cached map of created tables. Used for testing.
- */
- static void clearCreatedTables() {
- synchronized (createdTables) {
- createdTables.clear();
- }
+ StreamingWriteFn(BigQueryServices bqServices) {
+ this.bqServices = bqServices;
}
/** Prepares a target BigQuery table. */
@@ -119,9 +78,8 @@ class StreamingWriteFn
@FinishBundle
public void finishBundle(Context context) throws Exception {
BigQueryOptions options = context.getPipelineOptions().as(BigQueryOptions.class);
-
for (Map.Entry<String, List<TableRow>> entry : tableRows.entrySet()) {
- TableReference tableReference = getOrCreateTable(options, entry.getKey());
+ TableReference tableReference = BigQueryHelpers.parseTableSpec(entry.getKey());
flushRows(tableReference, entry.getValue(),
uniqueIdsForTableRows.get(entry.getKey()), options);
}
@@ -132,39 +90,6 @@ class StreamingWriteFn
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
-
- builder
- .addIfNotNull(DisplayData.item("schema", jsonTableSchema)
- .withLabel("Table Schema"))
- .addIfNotNull(DisplayData.item("tableDescription", tableDescription)
- .withLabel("Table Description"));
- }
-
- public TableReference getOrCreateTable(BigQueryOptions options, String tableSpec)
- throws InterruptedException, IOException {
- TableReference tableReference = BigQueryHelpers.parseTableSpec(tableSpec);
- if (createDisposition != createDisposition.CREATE_NEVER
- && !createdTables.contains(tableSpec)) {
- synchronized (createdTables) {
- // Another thread may have succeeded in creating the table in the meanwhile, so
- // check again. This check isn't needed for correctness, but we add it to prevent
- // every thread from attempting a create and overwhelming our BigQuery quota.
- DatasetService datasetService = bqServices.getDatasetService(options);
- if (!createdTables.contains(tableSpec)) {
- if (datasetService.getTable(tableReference) == null) {
- TableSchema tableSchema = BigQueryIO.JSON_FACTORY.fromString(
- jsonTableSchema.get(), TableSchema.class);
- datasetService.createTable(
- new Table()
- .setTableReference(tableReference)
- .setSchema(tableSchema)
- .setDescription(tableDescription));
- }
- createdTables.add(tableSpec);
- }
- }
- }
- return tableReference;
}
/**
@@ -173,6 +98,7 @@ class StreamingWriteFn
private void flushRows(TableReference tableReference,
List<TableRow> tableRows, List<String> uniqueIds, BigQueryOptions options)
throws InterruptedException {
+ System.out.println("FlUSHING ROWS " + tableRows.size());
if (!tableRows.isEmpty()) {
try {
long totalBytes = bqServices.getDatasetService(options).insertAll(
http://git-wip-us.apache.org/repos/asf/beam/blob/58ed5c7e/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
index 3cbbf3b..631afeb 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
@@ -1,7 +1,53 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
package org.apache.beam.sdk.io.gcp.bigquery;
+import com.google.api.services.bigquery.model.TableReference;
+
/**
- * Created by relax on 3/28/17.
+ * Encapsulates a BigQuery table destination.
*/
public class TableDestination {
+ private final String tableSpec;
+ private final String tableDescription;
+
+
+ public TableDestination(String tableSpec, String tableDescription) {
+ this.tableSpec = tableSpec;
+ this.tableDescription = tableDescription;
+ }
+
+ public TableDestination(TableReference tableReference, String tableDescription) {
+ this.tableSpec = BigQueryHelpers.toTableSpec(tableReference);
+ this.tableDescription = tableDescription;
+ }
+
+ public String getTableSpec() {
+ return tableSpec;
+ }
+
+
+ public TableReference getTableReference() {
+ return BigQueryHelpers.parseTableSpec(tableSpec);
+ }
+
+ public String getTableDescription() {
+ return tableDescription;
+ }
}
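
Both constructors above normalize to the same table spec, so routing code can hand over either a spec string or a TableReference. A small sketch with placeholder names:

    package org.apache.beam.sdk.io.gcp.bigquery;  // sketch placed alongside the class above

    import com.google.api.services.bigquery.model.TableReference;

    class TableDestinationSketch {
      public static void main(String[] args) {
        TableDestination fromSpec =
            new TableDestination("project-id:dataset-id.table-id", "example description");
        TableDestination fromReference =
            new TableDestination(
                new TableReference()
                    .setProjectId("project-id")
                    .setDatasetId("dataset-id")
                    .setTableId("table-id"),
                "example description");
        // Both print "project-id:dataset-id.table-id".
        System.out.println(fromSpec.getTableSpec());
        System.out.println(fromReference.getTableSpec());
      }
    }
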
http://git-wip-us.apache.org/repos/asf/beam/blob/58ed5c7e/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestinationCoder.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestinationCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestinationCoder.java
new file mode 100644
index 0000000..fa24700
--- /dev/null
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestinationCoder.java
@@ -0,0 +1,64 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.beam.sdk.io.gcp.bigquery;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import org.apache.beam.sdk.coders.AtomicCoder;
+import org.apache.beam.sdk.coders.CoderException;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+
+/**
+ * A coder for {@link TableDestination} objects.
+ */
+public class TableDestinationCoder extends AtomicCoder<TableDestination> {
+ private static final TableDestinationCoder INSTANCE = new TableDestinationCoder();
+
+
+ @JsonCreator
+ public static TableDestinationCoder of() {
+ return INSTANCE;
+ }
+
+ @Override
+ public void encode(TableDestination value, OutputStream outStream, Context context)
+ throws IOException {
+ if (value == null) {
+ throw new CoderException("cannot encode a null value");
+ }
+ stringCoder.encode(value.getTableSpec(), outStream, context.nested());
+ stringCoder.encode(value.getTableDescription(), outStream, context);
+ }
+
+ @Override
+ public TableDestination decode(InputStream inStream, Context context) throws IOException {
+ return new TableDestination(
+ stringCoder.decode(inStream, context.nested()),
+ stringCoder.decode(inStream, context));
+ }
+
+ @Override
+ public void verifyDeterministic() throws NonDeterministicException {
+ return;
+ }
+
+ private static final StringUtf8Coder stringCoder = StringUtf8Coder.of();
+}
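
A quick round-trip check for the coder above, using the nested context as KvCoder would when TableDestination is the key; the spec and description values are arbitrary:

    package org.apache.beam.sdk.io.gcp.bigquery;  // sketch placed alongside the class above

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import org.apache.beam.sdk.coders.Coder;

    class TableDestinationCoderSketch {
      public static void main(String[] args) throws Exception {
        TableDestination original =
            new TableDestination("project-id:dataset-id.table-id", "example description");

        ByteArrayOutputStream out = new ByteArrayOutputStream();
        TableDestinationCoder.of().encode(original, out, Coder.Context.NESTED);

        TableDestination decoded = TableDestinationCoder.of().decode(
            new ByteArrayInputStream(out.toByteArray()), Coder.Context.NESTED);

        // Compare field by field; TableDestination does not define equals() at this point.
        System.out.println(decoded.getTableSpec().equals(original.getTableSpec()));
        System.out.println(decoded.getTableDescription().equals(original.getTableDescription()));
      }
    }
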
http://git-wip-us.apache.org/repos/asf/beam/blob/58ed5c7e/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIds.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIds.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIds.java
new file mode 100644
index 0000000..6f0186e
--- /dev/null
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIds.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.beam.sdk.io.gcp.bigquery;
+
+import com.google.api.services.bigquery.model.TableReference;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Strings;
+import java.io.IOException;
+import java.util.UUID;
+import java.util.concurrent.ThreadLocalRandom;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.TableRefToTableSpec;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write;
+import org.apache.beam.sdk.options.BigQueryOptions;
+import org.apache.beam.sdk.options.ValueProvider;
+import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.display.DisplayData;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.ValueInSingleWindow;
+
+/**
+ * Fn that tags each table row with a unique id.
+ * To avoid calling UUID.randomUUID() for each element, which can be costly,
+ * a randomUUID is generated only once per bucket of data. The actual unique
+ * id is created by concatenating this randomUUID with a sequential number.
+ */
+@VisibleForTesting
+class TagWithUniqueIds
+ extends DoFn<KV<ShardedKey<String>, TableRow>, KV<ShardedKey<String>, TableRowInfo>> {
+
+ private transient String randomUUID;
+ private transient long sequenceNo = 0L;
+
+ @StartBundle
+ public void startBundle(Context context) {
+ randomUUID = UUID.randomUUID().toString();
+ }
+
+ /** Tag the input with a unique id. */
+ @ProcessElement
+ public void processElement(ProcessContext context, BoundedWindow window) throws IOException {
+ String uniqueId = randomUUID + sequenceNo++;
+ // The incoming sharded key is passed through unchanged; only the value is
+ // re-wrapped with its unique id.
+ context.output(KV.of(context.element().getKey(),
+ new TableRowInfo(context.element().getValue(), uniqueId)));
+ }
+
+ @Override
+ public void populateDisplayData(DisplayData.Builder builder) {
+ super.populateDisplayData(builder);
+ }
+}
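
The id scheme above in isolation: one random UUID per bundle, extended with a per-element sequence number, so ids stay unique while the cost of UUID generation is paid once per bundle rather than once per row. A tiny standalone sketch (bundle boundaries are simulated by the outer loop):

    import java.util.UUID;

    class UniqueIdSchemeSketch {
      public static void main(String[] args) {
        for (int bundle = 0; bundle < 2; ++bundle) {         // two simulated bundles
          String randomUUID = UUID.randomUUID().toString();  // what startBundle does
          long sequenceNo = 0L;
          for (int element = 0; element < 3; ++element) {    // three elements per bundle
            String uniqueId = randomUUID + sequenceNo++;     // same concatenation as above
            System.out.println(uniqueId);
          }
        }
      }
    }
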
http://git-wip-us.apache.org/repos/asf/beam/blob/58ed5c7e/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIdsAndTable.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIdsAndTable.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIdsAndTable.java
deleted file mode 100644
index 4e50f7c..0000000
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIdsAndTable.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.beam.sdk.io.gcp.bigquery;
-
-import com.google.api.services.bigquery.model.TableReference;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Strings;
-import java.io.IOException;
-import java.util.UUID;
-import java.util.concurrent.ThreadLocalRandom;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.TableRefToTableSpec;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write;
-import org.apache.beam.sdk.options.BigQueryOptions;
-import org.apache.beam.sdk.options.ValueProvider;
-import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.display.DisplayData;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.ValueInSingleWindow;
-
-/**
- * Fn that tags each table row with a unique id and destination table.
- * To avoid calling UUID.randomUUID() for each element, which can be costly,
- * a randomUUID is generated only once per bucket of data. The actual unique
- * id is created by concatenating this randomUUID with a sequential number.
- */
-@VisibleForTesting
-class TagWithUniqueIdsAndTable<T>
- extends DoFn<T, KV<ShardedKey<String>, TableRowInfo>> {
- /** TableSpec to write to in the case of a single static destination. */
- private ValueProvider<String> tableSpec = null;
-
- private final Write<T, ?> write;
-
- private transient String randomUUID;
- private transient long sequenceNo = 0L;
-
- TagWithUniqueIdsAndTable(BigQueryOptions options,
- Write<T, ?> write) {
- ValueProvider<TableReference> table = write.getTableWithDefaultProject(
- options.as(BigQueryOptions.class));
- if (table != null) {
- this.tableSpec = NestedValueProvider.of(table, new TableRefToTableSpec());
- }
- this.write = write;
- }
-
-
- @StartBundle
- public void startBundle(Context context) {
- randomUUID = UUID.randomUUID().toString();
- }
-
- /** Tag the input with a unique id. */
- @ProcessElement
- public void processElement(ProcessContext context, BoundedWindow window) throws IOException {
- String uniqueId = randomUUID + sequenceNo++;
- ThreadLocalRandom randomGenerator = ThreadLocalRandom.current();
- String tableSpec = tableSpecFromWindowedValue(
- context.getPipelineOptions().as(BigQueryOptions.class),
- ValueInSingleWindow.of(context.element(), context.timestamp(), window, context.pane()));
- // We output on keys 0-50 to ensure that there's enough batching for
- // BigQuery.
- context.output(KV.of(ShardedKey.of(tableSpec, randomGenerator.nextInt(0, 50)),
- new TableRowInfo(write.getFormatFunction().apply(context.element()), uniqueId)));
- }
-
- @Override
- public void populateDisplayData(DisplayData.Builder builder) {
- super.populateDisplayData(builder);
-
- builder.addIfNotNull(DisplayData.item("table", tableSpec));
- builder.add(DisplayData.item("tableFn", write.getTableRefFunction().getClass())
- .withLabel("Table Reference Function"));
- }
-
- @VisibleForTesting
- ValueProvider<String> getTableSpec() {
- return tableSpec;
- }
-
-
-
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/58ed5c7e/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
index 499aa74..d953edd 100644
--- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
@@ -518,7 +518,6 @@ public class BigQueryIOTest implements Serializable {
/** A fake dataset service that can be serialized, for use in testReadFromTable. */
private static class FakeDatasetService implements DatasetService, Serializable {
-
@Override
public Table getTable(TableReference tableRef)
throws InterruptedException, IOException {
@@ -630,6 +629,7 @@ public class BigQueryIOTest implements Serializable {
TableContainer tableContainer = getTableContainer(
ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
for (int i = 0; i < rowList.size(); ++i) {
+ System.out.println("adding row " + rowList.get(i));
tableContainer.addRow(rowList.get(i), insertIdList.get(i));
dataSize += rowList.get(i).toString().length();
}
@@ -1121,15 +1121,15 @@ public class BigQueryIOTest implements Serializable {
}
);
- SerializableFunction<ValueInSingleWindow<Integer>, String> tableFunction =
- new SerializableFunction<ValueInSingleWindow<Integer>, String>() {
+ SerializableFunction<ValueInSingleWindow<Integer>, TableDestination> tableFunction =
+ new SerializableFunction<ValueInSingleWindow<Integer>, TableDestination>() {
@Override
- public String apply(ValueInSingleWindow<Integer> input) {
+ public TableDestination apply(ValueInSingleWindow<Integer> input) {
PartitionedGlobalWindow window = (PartitionedGlobalWindow) input.getWindow();
// Check that we can access the element as well here.
checkArgument(window.value.equals(Integer.toString(input.getValue() % 5)),
"Incorrect element");
- return "project-id:dataset-id.table-id-" + window.value;
+ return new TableDestination("project-id:dataset-id.table-id-" + window.value, "");
}
};
@@ -1559,14 +1559,6 @@ public class BigQueryIOTest implements Serializable {
}
@Test
- public void testStreamingWriteFnCreateNever() throws Exception {
- StreamingWriteFn fn = new StreamingWriteFn(
- null, CreateDisposition.CREATE_NEVER, null, new FakeBigQueryServices());
- assertEquals(BigQueryHelpers.parseTableSpec("dataset.table"),
- fn.getOrCreateTable(null, "dataset.table"));
- }
-
- @Test
public void testCreateNeverWithStreaming() throws Exception {
BigQueryOptions options = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
options.setProject("project");
[02/50] [abbrv] beam git commit: This closes #2572
Posted by dh...@apache.org.
This closes #2572
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/a9bcc8b1
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/a9bcc8b1
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/a9bcc8b1
Branch: refs/heads/DSL_SQL
Commit: a9bcc8b154190bc41a0225838409e2b9cd2e9c93
Parents: d988150 09e0f77
Author: Dan Halperin <dh...@google.com>
Authored: Tue Apr 18 17:58:04 2017 -0700
Committer: Dan Halperin <dh...@google.com>
Committed: Tue Apr 18 17:58:04 2017 -0700
----------------------------------------------------------------------
sdks/java/core/pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------
[14/50] [abbrv] beam git commit: Fix tests to properly fake out
BigQueryService, and add tests for dynamic-table functionality.
Posted by dh...@apache.org.
Fix tests to properly fake out BigQueryService, and add tests for dynamic-table functionality.
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/b486137d
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/b486137d
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/b486137d
Branch: refs/heads/DSL_SQL
Commit: b486137d2190db9212a92176f703e6ed7858fe59
Parents: 760a945
Author: Reuven Lax <re...@google.com>
Authored: Fri Mar 31 14:16:48 2017 -0700
Committer: Eugene Kirpichov <ki...@google.com>
Committed: Tue Apr 18 21:12:50 2017 -0700
----------------------------------------------------------------------
.../beam/sdk/io/gcp/bigquery/BatchLoads.java | 7 +-
.../beam/sdk/io/gcp/bigquery/BigQueryIO.java | 15 +-
.../beam/sdk/io/gcp/bigquery/ShardedKey.java | 2 +-
.../sdk/io/gcp/bigquery/StreamingInserts.java | 5 +-
.../sdk/io/gcp/bigquery/StreamingWriteFn.java | 1 -
.../sdk/io/gcp/bigquery/TableDestination.java | 3 +-
.../sdk/io/gcp/bigquery/TableRowWriter.java | 3 +-
.../sdk/io/gcp/bigquery/TagWithUniqueIds.java | 9 -
.../io/gcp/bigquery/WriteBundlesToFiles.java | 12 +-
.../sdk/io/gcp/bigquery/WritePartition.java | 13 +-
.../sdk/io/gcp/bigquery/BigQueryIOTest.java | 613 ++++++++++---------
.../io/gcp/bigquery/FakeBigQueryServices.java | 114 +++-
.../sdk/io/gcp/bigquery/FakeDatasetService.java | 138 +++--
.../sdk/io/gcp/bigquery/FakeJobService.java | 182 +++++-
.../sdk/io/gcp/bigquery/TableContainer.java | 33 +-
15 files changed, 703 insertions(+), 447 deletions(-)
----------------------------------------------------------------------
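For context on the commit message above: the change replaces Mockito mocks of the BigQuery services with hand-written, stateful fakes (FakeDatasetService, FakeJobService) that tests seed with data up front and inspect afterwards. A minimal sketch of that fake-over-mock pattern follows; the interface and class names are illustrative only and are not part of the Beam codebase.

// Sketch only: a hand-written fake implements the real service interface and
// keeps in-memory state, so tests exercise real call sequences instead of
// per-call stubbed answers. Names below are illustrative.
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

interface RowService {                               // stand-in for a remote service API
  void insert(String table, String row);
  List<String> read(String table);
}

class InMemoryRowService implements RowService {     // the fake used by tests
  private final Map<String, List<String>> tables = new HashMap<>();

  @Override
  public void insert(String table, String row) {
    List<String> rows = tables.get(table);
    if (rows == null) {
      rows = new ArrayList<>();
      tables.put(table, rows);
    }
    rows.add(row);
  }

  @Override
  public List<String> read(String table) {
    List<String> rows = tables.get(table);
    return rows == null ? new ArrayList<String>() : rows;
  }
}

A test seeds the fake, runs the code under test against it, and asserts on the fake's state afterwards, mirroring how the tests below call insertAll on FakeDatasetService before reading and check its contents after writing.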
http://git-wip-us.apache.org/repos/asf/beam/blob/b486137d/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java
index 5e80fae..06fdfce 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java
@@ -58,9 +58,8 @@ import org.apache.beam.sdk.values.TupleTagList;
/**
* PTransform that uses BigQuery batch-load jobs to write a PCollection to BigQuery.
*/
-class BatchLoads<T> extends
- PTransform<PCollection<KV<TableDestination, TableRow>>, WriteResult> {
- BigQueryIO.Write<T> write;
+class BatchLoads extends PTransform<PCollection<KV<TableDestination, TableRow>>, WriteResult> {
+ BigQueryIO.Write<?> write;
private static class ConstantSchemaFunction implements
SerializableFunction<TableDestination, TableSchema> {
@@ -79,7 +78,7 @@ class BatchLoads<T> extends
}
}
- BatchLoads(BigQueryIO.Write<T> write) {
+ BatchLoads(BigQueryIO.Write<?> write) {
this.write = write;
}
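The hunk above drops the unused type parameter from BatchLoads and holds the write spec as Write<?>. As a small stand-alone illustration of that pattern (names are made up, not Beam classes): when a class never touches the element type itself, a wildcard-typed field avoids carrying a type parameter it does not use.

// Illustrative only: the holder class needs no <T> of its own.
class Config<T> { }

class Loader {
  private final Config<?> config;   // wildcard reference, mirroring Write<?> above

  Loader(Config<?> config) {
    this.config = config;
  }
}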
http://git-wip-us.apache.org/repos/asf/beam/blob/b486137d/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
index f1baaf7..54a25c7 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
@@ -64,7 +64,6 @@ import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.display.DisplayData;
-import org.apache.beam.sdk.util.GcsUtil.GcsUtilFactory;
import org.apache.beam.sdk.util.IOChannelFactory;
import org.apache.beam.sdk.util.IOChannelUtils;
import org.apache.beam.sdk.util.Transport;
@@ -536,7 +535,7 @@ public class BigQueryIO {
}
}
if (extractFiles != null && !extractFiles.isEmpty()) {
- new GcsUtilFactory().create(options).remove(extractFiles);
+ IOChannelUtils.getFactory(extractFiles.iterator().next()).remove(extractFiles);
}
}
};
@@ -701,8 +700,8 @@ public class BigQueryIO {
@AutoValue.Builder
abstract static class Builder<T> {
abstract Builder<T> setJsonTableRef(ValueProvider<String> jsonTableRef);
- abstract Builder<T> setTableRefFunction(
- SerializableFunction<ValueInSingleWindow<T>, TableReference> tableRefFunction);
+ abstract Builder<T> setTableFunction(
+ SerializableFunction<ValueInSingleWindow<T>, TableDestination> tableFunction);
abstract Builder<T> setFormatFunction(
SerializableFunction<T, TableRow> formatFunction);
abstract Builder<T> setJsonSchema(ValueProvider<String> jsonSchema);
@@ -823,8 +822,7 @@ public class BigQueryIO {
* {@link ValueInSingleWindow}, so can be determined by the value or by the window.
*/
public Write<T> to(
- SerializableFunction<ValueInSingleWindow<T>, String> tableSpecFunction) {
- return toTableReference(new TranslateTableSpecFunction<T>(tableSpecFunction));
+ SerializableFunction<ValueInSingleWindow<T>, TableDestination> tableFunction) {
ensureToNotCalledYet();
return toBuilder().setTableFunction(tableFunction).build();
}
@@ -834,7 +832,7 @@ public class BigQueryIO {
* {@link TableReference} instead of a string table specification.
*/
private Write<T> toTableReference(
- SerializableFunction<ValueInSingleWindow<T>, TableReference> tableRefFunction) {
+ SerializableFunction<ValueInSingleWindow<T>, TableDestination> tableFunction) {
ensureToNotCalledYet();
return toBuilder().setTableFunction(tableFunction).build();
}
@@ -984,8 +982,7 @@ public class BigQueryIO {
if (input.isBounded() == IsBounded.UNBOUNDED) {
return rowsWithDestination.apply(new StreamingInserts(this));
} else {
-
- return rowsWithDestination.apply(new BatchLoads<T>(this));
+ return rowsWithDestination.apply(new BatchLoads(this));
}
}
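The new to(tableFunction) overload above lets the destination table be computed per element and per window. A hedged sketch of such a function follows; it assumes the Beam GCP IO module on the classpath, and the project, dataset, and sharding rule are made up for illustration.

import org.apache.beam.sdk.io.gcp.bigquery.TableDestination;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.values.ValueInSingleWindow;

class ShardedDestinations {
  // Illustrative only: route each element to one of ten tables based on its value.
  static final SerializableFunction<ValueInSingleWindow<Long>, TableDestination> BY_SHARD =
      new SerializableFunction<ValueInSingleWindow<Long>, TableDestination>() {
        @Override
        public TableDestination apply(ValueInSingleWindow<Long> input) {
          long shard = input.getValue() % 10;                // assumed sharding rule
          return new TableDestination(
              "my-project:my_dataset.events_" + shard,       // assumed table spec
              "events shard " + shard);                      // table description
        }
      };
}

The same overload also permits window-based routing, as the PartitionedGlobalWindow test earlier in this digest demonstrates.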
http://git-wip-us.apache.org/repos/asf/beam/blob/b486137d/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/ShardedKey.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/ShardedKey.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/ShardedKey.java
index ab57446..09b4fbf 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/ShardedKey.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/ShardedKey.java
@@ -56,7 +56,7 @@ class ShardedKey<K> implements Serializable {
return false;
}
ShardedKey<K> other = (ShardedKey<K>) o;
- return (key == other.key) && (shardNumber == other.shardNumber);
+ return Objects.equals(key, other.key) && Objects.equals(shardNumber, other.shardNumber);
}
@Override
http://git-wip-us.apache.org/repos/asf/beam/blob/b486137d/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingInserts.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingInserts.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingInserts.java
index 37afbdf..ced1d66 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingInserts.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingInserts.java
@@ -38,9 +38,8 @@ import org.apache.beam.sdk.values.PCollection;
* PTransform that performs streaming BigQuery write. To increase consistency,
* it leverages BigQuery best effort de-dup mechanism.
*/
-
-class StreamingInserts
- extends PTransform<PCollection<KV<TableDestination, TableRow>>, WriteResult> {
+class StreamingInserts extends PTransform<PCollection<KV<TableDestination, TableRow>>,
+ WriteResult> {
private final Write<?> write;
private static class ConstantSchemaFunction implements
http://git-wip-us.apache.org/repos/asf/beam/blob/b486137d/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingWriteFn.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingWriteFn.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingWriteFn.java
index 83ed3d2..22b2078 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingWriteFn.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamingWriteFn.java
@@ -98,7 +98,6 @@ class StreamingWriteFn
private void flushRows(TableReference tableReference,
List<TableRow> tableRows, List<String> uniqueIds, BigQueryOptions options)
throws InterruptedException {
- System.out.println("FlUSHING ROWS " + tableRows.size());
if (!tableRows.isEmpty()) {
try {
long totalBytes = bqServices.getDatasetService(options).insertAll(
http://git-wip-us.apache.org/repos/asf/beam/blob/b486137d/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
index e8538e0..36e1401 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
@@ -64,7 +64,8 @@ public class TableDestination implements Serializable {
return false;
}
TableDestination other = (TableDestination) o;
- return (tableSpec == other.tableSpec) && (tableDescription == other.tableDescription);
+ return Objects.equals(this.tableSpec, other.tableSpec)
+ && Objects.equals(this.tableDescription, other.tableDescription);
}
@Override
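Both equals implementations above (ShardedKey and TableDestination) move from == to Objects.equals, because == compares object references rather than contents for fields such as String table specs. A stand-alone illustration with made-up field names:

import java.util.Objects;

final class Destination {
  private final String tableSpec;        // illustrative fields mirroring the pattern above
  private final String description;

  Destination(String tableSpec, String description) {
    this.tableSpec = tableSpec;
    this.description = description;
  }

  @Override
  public boolean equals(Object o) {
    if (!(o instanceof Destination)) {
      return false;
    }
    Destination other = (Destination) o;
    // Objects.equals is null-safe value equality; "==" would only match when
    // both fields point at the very same object instance.
    return Objects.equals(tableSpec, other.tableSpec)
        && Objects.equals(description, other.description);
  }

  @Override
  public int hashCode() {
    // Keep hashCode consistent with the value-based equals.
    return Objects.hash(tableSpec, description);
  }
}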
http://git-wip-us.apache.org/repos/asf/beam/blob/b486137d/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowWriter.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowWriter.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowWriter.java
index a1f6153..ee8f466 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowWriter.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowWriter.java
@@ -29,7 +29,6 @@ import org.apache.beam.sdk.coders.Coder.Context;
import org.apache.beam.sdk.coders.TableRowJsonCoder;
import org.apache.beam.sdk.util.IOChannelUtils;
import org.apache.beam.sdk.util.MimeTypes;
-import org.apache.beam.sdk.values.KV;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -57,7 +56,7 @@ class TableRowWriter {
}
}
TableRowWriter(String basename) {
- this.tempFilePrefix = basename;
+ this.tempFilePrefix = basename;
}
public final void open(String uId) throws Exception {
http://git-wip-us.apache.org/repos/asf/beam/blob/b486137d/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIds.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIds.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIds.java
index 6f0186e..7379784 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIds.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIds.java
@@ -18,23 +18,14 @@
package org.apache.beam.sdk.io.gcp.bigquery;
-import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Strings;
import java.io.IOException;
import java.util.UUID;
-import java.util.concurrent.ThreadLocalRandom;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.TableRefToTableSpec;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write;
-import org.apache.beam.sdk.options.BigQueryOptions;
-import org.apache.beam.sdk.options.ValueProvider;
-import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.ValueInSingleWindow;
/**
* Fn that tags each table row with a unique id and destination table.
http://git-wip-us.apache.org/repos/asf/beam/blob/b486137d/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java
index b8069f6..869e68a 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java
@@ -19,19 +19,16 @@
package org.apache.beam.sdk.io.gcp.bigquery;
import com.google.api.services.bigquery.model.TableRow;
-
+import com.google.common.collect.Maps;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.util.Map;
import java.util.UUID;
-
-import com.google.common.collect.Maps;
import org.apache.beam.sdk.coders.AtomicCoder;
import org.apache.beam.sdk.coders.CoderException;
import org.apache.beam.sdk.coders.StringUtf8Coder;
-import org.apache.beam.sdk.coders.TableRowJsonCoder;
import org.apache.beam.sdk.coders.VarLongCoder;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.display.DisplayData;
@@ -50,6 +47,10 @@ class WriteBundlesToFiles extends DoFn<KV<TableDestination, TableRow>, WriteBund
private transient Map<TableDestination, TableRowWriter> writers;
private final String tempFilePrefix;
+ /**
+ * The result of the {@link WriteBundlesToFiles} transform. Corresponds to a single output file,
+ * and encapsulates the table it is destined to as well as the file byte size.
+ */
public static class Result implements Serializable {
public String filename;
public Long fileByteSize;
@@ -62,6 +63,9 @@ class WriteBundlesToFiles extends DoFn<KV<TableDestination, TableRow>, WriteBund
}
}
+ /**
+ * A coder for the {@link Result} class.
+ * A coder for the {@link Result} class.
+ */
public static class ResultCoder extends AtomicCoder<Result> {
private static final ResultCoder INSTANCE = new ResultCoder();
http://git-wip-us.apache.org/repos/asf/beam/blob/b486137d/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java
index c48955b..9c48b82 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java
@@ -89,8 +89,8 @@ class WritePartition extends DoFn<String, KV<ShardedKey<TableDestination>, List<
partitions.add(Lists.<String>newArrayList());
currResultsMap.put(tableDestination, partitions);
}
- int currNumFiles = currNumFilesMap.getOrDefault(tableDestination, 0);
- long currSizeBytes = currSizeBytesMap.getOrDefault(tableDestination, 0L);
+ int currNumFiles = getOrDefault(currNumFilesMap, tableDestination, 0);
+ long currSizeBytes = getOrDefault(currSizeBytesMap, tableDestination, 0L);
if (currNumFiles + 1 > Write.MAX_NUM_FILES
|| currSizeBytes + fileResult.fileByteSize > Write.MAX_SIZE_BYTES) {
// Add a new partition for this table.
@@ -117,4 +117,13 @@ class WritePartition extends DoFn<String, KV<ShardedKey<TableDestination>, List<
}
}
}
+
+ private <T> T getOrDefault(Map<TableDestination, T> map, TableDestination tableDestination,
+ T defaultValue) {
+ if (map.containsKey(tableDestination)) {
+ return map.get(tableDestination);
+ } else {
+ return defaultValue;
+ }
+ }
}
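The private getOrDefault helper added above replaces Map.getOrDefault, presumably to stay off Java 8-only library methods; the same lookup-with-fallback can be written once, generically. A minimal sketch (class and variable names are illustrative):

import java.util.HashMap;
import java.util.Map;

final class Maps7 {
  // Java 7-friendly equivalent of Map.getOrDefault, mirroring the helper above.
  // Checking containsKey first means an explicitly stored null is returned as-is
  // rather than being replaced by the default.
  static <K, V> V getOrDefault(Map<K, V> map, K key, V defaultValue) {
    return map.containsKey(key) ? map.get(key) : defaultValue;
  }

  public static void main(String[] args) {
    Map<String, Integer> filesPerTable = new HashMap<>();
    filesPerTable.put("table_a", 3);
    System.out.println(getOrDefault(filesPerTable, "table_a", 0));   // prints 3
    System.out.println(getOrDefault(filesPerTable, "table_b", 0));   // prints 0
  }
}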
http://git-wip-us.apache.org/repos/asf/beam/blob/b486137d/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
index d1ef8e2..f10be13 100644
--- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
@@ -18,6 +18,8 @@
package org.apache.beam.sdk.io.gcp.bigquery;
import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString;
import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem;
import static org.hamcrest.Matchers.containsInAnyOrder;
@@ -26,17 +28,9 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
-import static org.mockito.Matchers.any;
-import static org.mockito.Matchers.anyString;
-import static org.mockito.Matchers.eq;
-import static org.mockito.Mockito.doNothing;
-import static org.mockito.Mockito.doThrow;
-import static org.mockito.Mockito.when;
-
-import com.google.api.client.json.GenericJson;
+
import com.google.api.client.util.Data;
import com.google.api.services.bigquery.model.Job;
-import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.JobStatistics;
import com.google.api.services.bigquery.model.JobStatistics2;
import com.google.api.services.bigquery.model.JobStatistics4;
@@ -48,7 +42,7 @@ import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
@@ -58,9 +52,12 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
+import java.math.BigDecimal;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
+import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
+import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
@@ -69,14 +66,10 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
-import org.apache.avro.Schema;
-import org.apache.avro.file.DataFileWriter;
-import org.apache.avro.generic.GenericDatumWriter;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.AtomicCoder;
import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.Coder.Context;
import org.apache.beam.sdk.coders.CoderException;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
@@ -88,7 +81,6 @@ import org.apache.beam.sdk.io.CountingSource;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService;
import org.apache.beam.sdk.io.gcp.bigquery.PassThroughThenCleanup.CleanupOperation;
import org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result;
import org.apache.beam.sdk.options.BigQueryOptions;
@@ -122,7 +114,6 @@ import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.transforms.windowing.WindowFn;
import org.apache.beam.sdk.transforms.windowing.WindowMappingFn;
import org.apache.beam.sdk.util.CoderUtils;
-import org.apache.beam.sdk.util.IOChannelFactory;
import org.apache.beam.sdk.util.IOChannelUtils;
import org.apache.beam.sdk.util.MimeTypes;
import org.apache.beam.sdk.util.PCollectionViews;
@@ -140,6 +131,7 @@ import org.hamcrest.Matchers;
import org.joda.time.Instant;
import org.junit.Assert;
import org.junit.Before;
+import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
@@ -147,10 +139,6 @@ import org.junit.rules.ExpectedException;
import org.junit.rules.TemporaryFolder;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
-import org.mockito.ArgumentCaptor;
-import org.mockito.Mock;
-import org.mockito.Mockito;
-import org.mockito.MockitoAnnotations;
/**
* Tests for BigQueryIO.
@@ -158,6 +146,8 @@ import org.mockito.MockitoAnnotations;
@RunWith(JUnit4.class)
public class BigQueryIOTest implements Serializable {
+ private static Path tempFolder;
+
// Table information must be static, as each ParDo will get a separate instance of
// FakeDatasetServices, and they must all modify the same storage.
static com.google.common.collect.Table<String, String, Map<String, TableContainer>>
@@ -169,8 +159,6 @@ public class BigQueryIOTest implements Serializable {
@Rule public transient ExpectedLogs loggedWriteRename = ExpectedLogs.none(WriteRename.class);
@Rule public transient ExpectedLogs loggedWriteTables = ExpectedLogs.none(WriteTables.class);
@Rule public transient TemporaryFolder testFolder = new TemporaryFolder();
- @Mock private transient IOChannelFactory mockIOChannelFactory;
- @Mock(extraInterfaces = Serializable.class) private transient DatasetService mockDatasetService;
private void checkReadTableObject(
BigQueryIO.Read read, String project, String dataset, String table) {
@@ -227,9 +215,13 @@ public class BigQueryIOTest implements Serializable {
assertEquals(validate, write.getValidate());
}
+ @BeforeClass
+ public static void setupClass() throws IOException {
+ tempFolder = Files.createTempDirectory("BigQueryIOTest");
+ }
+
@Before
public void setUp() throws IOException {
- MockitoAnnotations.initMocks(this);
tables = HashBasedTable.create();
BigQueryIO.clearCreatedTables();
}
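The hunk above swaps Mockito initialization for a class-level temporary directory; later tests then carve out one subdirectory per test case with Files.createTempDirectory(tempFolder, ...). A small sketch of that layout outside JUnit (names are illustrative):

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

class TempDirLayout {
  private static Path baseDir;

  static void setupClass() throws IOException {          // run once, e.g. from @BeforeClass
    baseDir = Files.createTempDirectory("BigQueryIOTest");
  }

  static Path newCaseDir(String testName) throws IOException {
    // Each test case gets its own isolated directory under the class-level one.
    return Files.createTempDirectory(baseDir, testName);
  }

  public static void main(String[] args) throws IOException {
    setupClass();
    System.out.println(newCaseDir("testValidateReadSetsDefaultProject"));
  }
}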
@@ -289,29 +281,53 @@ public class BigQueryIOTest implements Serializable {
String tableId = "sometable";
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
bqOptions.setProject(projectId);
- bqOptions.setTempLocation("gs://testbucket/testdir");
+
+ Path baseDir = Files.createTempDirectory(tempFolder, "testValidateReadSetsDefaultProject");
+ bqOptions.setTempLocation(baseDir.toString());
FakeDatasetService fakeDatasetService = new FakeDatasetService();
fakeDatasetService.createDataset(projectId, datasetId, "", "");
TableReference tableReference =
new TableReference().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId);
- fakeDatasetService.createTable(new Table().setTableReference(tableReference));
+ fakeDatasetService.createTable(new Table()
+ .setTableReference(tableReference)
+ .setSchema(new TableSchema()
+ .setFields(
+ ImmutableList.of(
+ new TableFieldSchema().setName("name").setType("STRING"),
+ new TableFieldSchema().setName("number").setType("INTEGER")))));
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
.withDatasetService(fakeDatasetService);
+ List<TableRow> expected = ImmutableList.of(
+ new TableRow().set("name", "a").set("number", 1L),
+ new TableRow().set("name", "b").set("number", 2L),
+ new TableRow().set("name", "c").set("number", 3L),
+ new TableRow().set("name", "d").set("number", 4L),
+ new TableRow().set("name", "e").set("number", 5L),
+ new TableRow().set("name", "f").set("number", 6L));
+ fakeDatasetService.insertAll(tableReference, expected, null);
+
Pipeline p = TestPipeline.create(bqOptions);
TableReference tableRef = new TableReference();
tableRef.setDatasetId(datasetId);
tableRef.setTableId(tableId);
- thrown.expect(RuntimeException.class);
- // Message will be one of following depending on the execution environment.
- thrown.expectMessage(Matchers.containsString("Unsupported"));
- p.apply(BigQueryIO.read().from(tableRef)
- .withTestServices(fakeBqServices));
+ PCollection<KV<String, Long>> output =
+ p.apply(BigQueryIO.read().from(tableRef).withTestServices(fakeBqServices))
+ .apply(ParDo.of(new DoFn<TableRow, KV<String, Long>>() {
+ @ProcessElement
+ public void processElement(ProcessContext c) throws Exception {
+ c.output(KV.of((String) c.element().get("name"),
+ Long.valueOf((String) c.element().get("number"))));
+ }
+ }));
+ PAssert.that(output).containsInAnyOrder(ImmutableList.of(KV.of("a", 1L), KV.of("b", 2L),
+ KV.of("c", 3L), KV.of("d", 4L), KV.of("e", 5L), KV.of("f", 6L)));
+ p.run();
}
@Test
@@ -400,54 +416,32 @@ public class BigQueryIOTest implements Serializable {
FakeDatasetService fakeDatasetService = new FakeDatasetService();
fakeDatasetService.createDataset("non-executing-project", "somedataset", "", "");
fakeDatasetService.createTable(sometable);
- SerializableFunction<Void, Schema> schemaGenerator =
- new SerializableFunction<Void, Schema>() {
- @Override
- public Schema apply(Void input) {
- return BigQueryAvroUtils.toGenericAvroSchema(
- "sometable",
- ImmutableList.of(
- new TableFieldSchema().setName("name").setType("STRING"),
- new TableFieldSchema().setName("number").setType("INTEGER")));
- }
- };
- Collection<Map<String, Object>> records =
- ImmutableList.<Map<String, Object>>builder()
- .add(ImmutableMap.<String, Object>builder().put("name", "a").put("number", 1L).build())
- .add(ImmutableMap.<String, Object>builder().put("name", "b").put("number", 2L).build())
- .add(ImmutableMap.<String, Object>builder().put("name", "c").put("number", 3L).build())
- .build();
- SerializableFunction<GenericJson, Void> onStartJob =
- new WriteExtractFiles(schemaGenerator, records);
+ List<TableRow> records = Lists.newArrayList(
+ new TableRow().set("name", "a").set("number", 1L),
+ new TableRow().set("name", "b").set("number", 2L),
+ new TableRow().set("name", "c").set("number", 3L));
+ fakeDatasetService.insertAll(sometable.getTableReference(), records, null);
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
- // .startJobReturns(onStartJob, "done")
- // .pollJobReturns(job)
- // .getJobReturns((Job) null)
- // .verifyExecutingProject(bqOptions.getProject()))
- .withDatasetService(fakeDatasetService)
- .readerReturns(
- toJsonString(new TableRow().set("name", "a").set("number", 1)),
- toJsonString(new TableRow().set("name", "b").set("number", 2)),
- toJsonString(new TableRow().set("name", "c").set("number", 3)));
+ .withDatasetService(fakeDatasetService);
Pipeline p = TestPipeline.create(bqOptions);
- PCollection<String> output = p
+ PCollection<KV<String, Long>> output = p
.apply(BigQueryIO.read().from("non-executing-project:somedataset.sometable")
.withTestServices(fakeBqServices)
.withoutValidation())
- .apply(ParDo.of(new DoFn<TableRow, String>() {
+ .apply(ParDo.of(new DoFn<TableRow, KV<String, Long>>() {
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
- c.output((String) c.element().get("name"));
+ c.output(KV.of((String) c.element().get("name"),
+ Long.valueOf((String) c.element().get("number"))));
}
}));
PAssert.that(output)
- .containsInAnyOrder(ImmutableList.of("a", "b", "c"));
-
+ .containsInAnyOrder(ImmutableList.of(KV.of("a", 1L), KV.of("b", 2L), KV.of("c", 3L)));
p.run();
}
@@ -457,13 +451,12 @@ public class BigQueryIOTest implements Serializable {
bqOptions.setProject("defaultproject");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
+ FakeDatasetService datasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
- // .startJobReturns("done", "done", "done")
- // .pollJobReturns(Status.FAILED, Status.FAILED, Status.SUCCEEDED))
- .withDatasetService(mockDatasetService);
+ .withDatasetService(datasetService);
- mockDatasetService.createDataset("defaultproject", "dataset-id", "", "");
+ datasetService.createDataset("defaultproject", "dataset-id", "", "");
Pipeline p = TestPipeline.create(bqOptions);
p.apply(Create.of(
@@ -715,11 +708,11 @@ public class BigQueryIOTest implements Serializable {
bqOptions.setProject("defaultproject");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
+ FakeDatasetService datasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
- .withJobService(new FakeJobService());
- // .startJobReturns("done", "done")
- // .pollJobReturns(Status.FAILED, Status.UNKNOWN));
-
+ .withJobService(new FakeJobService())
+ .withDatasetService(datasetService);
+ datasetService.createDataset("project-id", "dataset-id", "", "");
Pipeline p = TestPipeline.create(bqOptions);
p.apply(Create.of(
new TableRow().set("name", "a").set("number", 1),
@@ -732,7 +725,7 @@ public class BigQueryIOTest implements Serializable {
.withoutValidation());
thrown.expect(RuntimeException.class);
- thrown.expectMessage("UNKNOWN status of load job");
+ thrown.expectMessage("Failed to create load job");
try {
p.run();
} finally {
@@ -747,10 +740,10 @@ public class BigQueryIOTest implements Serializable {
bqOptions.setProject("defaultproject");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
+ FakeDatasetService datasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
- .withJobService(new FakeJobService());
- // .startJobReturns("done", "done", "done")
- // .pollJobReturns(Status.FAILED, Status.FAILED, Status.FAILED));
+ .withJobService(new FakeJobService())
+ .withDatasetService(datasetService);
Pipeline p = TestPipeline.create(bqOptions);
p.apply(Create.of(
@@ -817,7 +810,7 @@ public class BigQueryIOTest implements Serializable {
BigQueryIO.Read read = BigQueryIO.read()
.from("project:dataset.tableId")
.withTestServices(new FakeBigQueryServices()
- .withDatasetService(mockDatasetService)
+ .withDatasetService(new FakeDatasetService())
.withJobService(new FakeJobService()))
.withoutValidation();
@@ -833,7 +826,7 @@ public class BigQueryIOTest implements Serializable {
BigQueryIO.Read read = BigQueryIO.read()
.fromQuery("foobar")
.withTestServices(new FakeBigQueryServices()
- .withDatasetService(mockDatasetService)
+ .withDatasetService(new FakeDatasetService())
.withJobService(new FakeJobService()))
.withoutValidation();
@@ -874,7 +867,7 @@ public class BigQueryIOTest implements Serializable {
.to("project:dataset.table")
.withSchema(new TableSchema().set("col1", "type1").set("col2", "type2"))
.withTestServices(new FakeBigQueryServices()
- .withDatasetService(mockDatasetService)
+ .withDatasetService(new FakeDatasetService())
.withJobService(new FakeJobService()))
.withoutValidation();
@@ -1040,9 +1033,7 @@ public class BigQueryIOTest implements Serializable {
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
- .withDatasetService(mockDatasetService);
- when(mockDatasetService.getDataset(projectId, datasetId)).thenThrow(
- new RuntimeException("Unable to confirm BigQuery dataset presence"));
+ .withDatasetService(new FakeDatasetService());
Pipeline p = TestPipeline.create(options);
@@ -1206,26 +1197,31 @@ public class BigQueryIOTest implements Serializable {
@Test
public void testBigQueryTableSourceThroughJsonAPI() throws Exception {
+ FakeDatasetService datasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
- .readerReturns(
- toJsonString(new TableRow().set("name", "a").set("number", "1")),
- toJsonString(new TableRow().set("name", "b").set("number", "2")),
- toJsonString(new TableRow().set("name", "c").set("number", "3")));
+ .withDatasetService(datasetService);
+ List<TableRow> expected = ImmutableList.of(
+ new TableRow().set("name", "a").set("number", "1"),
+ new TableRow().set("name", "b").set("number", "2"),
+ new TableRow().set("name", "c").set("number", "3"),
+ new TableRow().set("name", "d").set("number", "4"),
+ new TableRow().set("name", "e").set("number", "5"),
+ new TableRow().set("name", "f").set("number", "6"));
+
+ TableReference table = BigQueryHelpers.parseTableSpec("project:data_set.table_name");
+ datasetService.createDataset(table.getProjectId(), table.getDatasetId(), "", "");
+ datasetService.createTable(new Table().setTableReference(table));
+ datasetService.insertAll(table, expected, null);
+
+ Path baseDir = Files.createTempDirectory(tempFolder, "testBigQueryTableSourceThroughJsonAPI");
String jobIdToken = "testJobIdToken";
- TableReference table = BigQueryHelpers.parseTableSpec("project.data_set.table_name");
- String extractDestinationDir = "mock://tempLocation";
BoundedSource<TableRow> bqSource = BigQueryTableSource.create(
StaticValueProvider.of(jobIdToken), StaticValueProvider.of(table),
- extractDestinationDir, fakeBqServices,
+ baseDir.toString(), fakeBqServices,
StaticValueProvider.of("project"));
- List<TableRow> expected = ImmutableList.of(
- new TableRow().set("name", "a").set("number", "1"),
- new TableRow().set("name", "b").set("number", "2"),
- new TableRow().set("name", "c").set("number", "3"));
-
PipelineOptions options = PipelineOptionsFactory.create();
Assert.assertThat(
SourceTestUtils.readFromSource(bqSource, options),
@@ -1244,43 +1240,48 @@ public class BigQueryIOTest implements Serializable {
extractJob.setStatus(new JobStatus())
.setStatistics(jobStats);
+ FakeDatasetService fakeDatasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
- .withDatasetService(mockDatasetService)
- .readerReturns(
- toJsonString(new TableRow().set("name", "a").set("number", "1")),
- toJsonString(new TableRow().set("name", "b").set("number", "2")),
- toJsonString(new TableRow().set("name", "c").set("number", "3")));
+ .withDatasetService(fakeDatasetService);
+
+ List<TableRow> expected = ImmutableList.of(
+ new TableRow().set("name", "a").set("number", 1L),
+ new TableRow().set("name", "b").set("number", 2L),
+ new TableRow().set("name", "c").set("number", 3L),
+ new TableRow().set("name", "d").set("number", 4L),
+ new TableRow().set("name", "e").set("number", 5L),
+ new TableRow().set("name", "f").set("number", 6L));
- String jobIdToken = "testJobIdToken";
TableReference table = BigQueryHelpers.parseTableSpec("project:data_set.table_name");
- String extractDestinationDir = "mock://tempLocation";
+ fakeDatasetService.createDataset("project", "data_set", "", "");
+ fakeDatasetService.createTable(new Table().setTableReference(table)
+ .setSchema(new TableSchema()
+ .setFields(
+ ImmutableList.of(
+ new TableFieldSchema().setName("name").setType("STRING"),
+ new TableFieldSchema().setName("number").setType("INTEGER")))));
+ fakeDatasetService.insertAll(table, expected, null);
+
+ Path baseDir = Files.createTempDirectory(tempFolder, "testBigQueryTableSourceInitSplit");
+
+ String jobIdToken = "testJobIdToken";
+ String extractDestinationDir = baseDir.toString();
BoundedSource<TableRow> bqSource = BigQueryTableSource.create(
StaticValueProvider.of(jobIdToken), StaticValueProvider.of(table),
extractDestinationDir, fakeBqServices, StaticValueProvider.of("project"));
- List<TableRow> expected = ImmutableList.of(
- new TableRow().set("name", "a").set("number", "1"),
- new TableRow().set("name", "b").set("number", "2"),
- new TableRow().set("name", "c").set("number", "3"));
PipelineOptions options = PipelineOptionsFactory.create();
- options.setTempLocation("mock://tempLocation");
-
- IOChannelUtils.setIOFactoryInternal("mock", mockIOChannelFactory, true /* override */);
- when(mockIOChannelFactory.resolve(anyString(), anyString()))
- .thenReturn("mock://tempLocation/output");
- when(mockDatasetService.getTable(any(TableReference.class)))
- .thenReturn(new Table().setSchema(new TableSchema()));
+ options.setTempLocation(baseDir.toString());
- Assert.assertThat(
- SourceTestUtils.readFromSource(bqSource, options),
- CoreMatchers.is(expected));
+ List<TableRow> read = SourceTestUtils.readFromSource(bqSource, options);
+ assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));
SourceTestUtils.assertSplitAtFractionBehavior(
bqSource, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
- assertEquals(1, sources.size());
+ assertEquals(2, sources.size());
BoundedSource<TableRow> actual = sources.get(0);
assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
}
@@ -1306,80 +1307,63 @@ public class BigQueryIOTest implements Serializable {
.setStatistics(extractJobStats);
FakeJobService fakeJobService = new FakeJobService();
+ FakeDatasetService fakeDatasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(fakeJobService)
- .withDatasetService(mockDatasetService)
- .readerReturns(
- toJsonString(new TableRow().set("name", "a").set("number", "1")),
- toJsonString(new TableRow().set("name", "b").set("number", "2")),
- toJsonString(new TableRow().set("name", "c").set("number", "3")));
+ .withDatasetService(fakeDatasetService);
+
+ List<TableRow> expected = ImmutableList.of(
+ new TableRow().set("name", "a").set("number", 1L),
+ new TableRow().set("name", "b").set("number", 2L),
+ new TableRow().set("name", "c").set("number", 3L),
+ new TableRow().set("name", "d").set("number", 4L),
+ new TableRow().set("name", "e").set("number", 5L),
+ new TableRow().set("name", "f").set("number", 6L));
- String jobIdToken = "testJobIdToken";
- String extractDestinationDir = "mock://tempLocation";
TableReference destinationTable = BigQueryHelpers.parseTableSpec("project:data_set.table_name");
+ fakeDatasetService.createDataset("project", "data_set", "", "");
+ fakeDatasetService.createTable(new Table()
+ .setTableReference(destinationTable)
+ .setSchema(new TableSchema()
+ .setFields(
+ ImmutableList.of(
+ new TableFieldSchema().setName("name").setType("STRING"),
+ new TableFieldSchema().setName("number").setType("INTEGER")))));
+ Path baseDir = Files.createTempDirectory(tempFolder, "testBigQueryQuerySourceInitSplit");
+
+ String jobIdToken = "testJobIdToken";
+ String query = FakeBigQueryServices.encodeQuery(expected);
+ String extractDestinationDir = baseDir.toString();
BoundedSource<TableRow> bqSource = BigQueryQuerySource.create(
- StaticValueProvider.of(jobIdToken), StaticValueProvider.of("query"),
+ StaticValueProvider.of(jobIdToken), StaticValueProvider.of(query),
StaticValueProvider.of(destinationTable),
true /* flattenResults */, true /* useLegacySql */,
extractDestinationDir, fakeBqServices);
- List<TableRow> expected = ImmutableList.of(
- new TableRow().set("name", "a").set("number", "1"),
- new TableRow().set("name", "b").set("number", "2"),
- new TableRow().set("name", "c").set("number", "3"));
-
PipelineOptions options = PipelineOptionsFactory.create();
options.setTempLocation(extractDestinationDir);
TableReference queryTable = new TableReference()
- .setProjectId("testproject")
- .setDatasetId("testDataset")
- .setTableId("testTable");
- // when(mockJobService.dryRunQuery(anyString(), Mockito.<JobConfigurationQuery>any()))
- // .thenReturn(new JobStatistics().setQuery(
- // new JobStatistics2()
- // .setTotalBytesProcessed(100L)
- // .setReferencedTables(ImmutableList.of(queryTable))));
- fakeJobService.expectDryRunQuery("testproject", "query",
+ .setProjectId("project")
+ .setDatasetId("data_set")
+ .setTableId("table_name");
+
+ fakeJobService.expectDryRunQuery("project", query,
new JobStatistics().setQuery(
new JobStatistics2()
.setTotalBytesProcessed(100L)
.setReferencedTables(ImmutableList.of(queryTable))));
- // when(mockDatasetService.getTable(eq(queryTable)))
- // .thenReturn(new Table().setSchema(new TableSchema()));
- // when(mockDatasetService.getTable(eq(destinationTable)))
- // .thenReturn(new Table().setSchema(new TableSchema()));
- IOChannelUtils.setIOFactoryInternal("mock", mockIOChannelFactory, true /* override */);
- when(mockIOChannelFactory.resolve(anyString(), anyString()))
- .thenReturn("mock://tempLocation/output");
- //when(mockJobService.pollJob(Mockito.<JobReference>any(), Mockito.anyInt()))
- // .thenReturn(extractJob);
-
- Assert.assertThat(
- SourceTestUtils.readFromSource(bqSource, options),
- CoreMatchers.is(expected));
+ List<TableRow> read = SourceTestUtils.readFromSource(bqSource, options);
+ assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));
SourceTestUtils.assertSplitAtFractionBehavior(
bqSource, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
+
List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
- assertEquals(1, sources.size());
+ assertEquals(2, sources.size());
BoundedSource<TableRow> actual = sources.get(0);
assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
-
- /*
- Mockito.verify(mockJobService)
- .startQueryJob(
- Mockito.<JobReference>any(), Mockito.<JobConfigurationQuery>any());
- Mockito.verify(mockJobService)
- .startExtractJob(Mockito.<JobReference>any(), Mockito.<JobConfigurationExtract>any());
- Mockito.verify(mockDatasetService)
- .createDataset(anyString(), anyString(), anyString(), anyString());
- ArgumentCaptor<JobConfigurationQuery> queryConfigArg =
- ArgumentCaptor.forClass(JobConfigurationQuery.class);
- Mockito.verify(mockJobService).dryRunQuery(anyString(), queryConfigArg.capture());
- assertEquals(true, queryConfigArg.getValue().getFlattenResults());
- assertEquals(true, queryConfigArg.getValue().getUseLegacySql());*/
}
@Test
@@ -1402,68 +1386,60 @@ public class BigQueryIOTest implements Serializable {
extractJob.setStatus(new JobStatus())
.setStatistics(extractJobStats);
+ FakeDatasetService datasetService = new FakeDatasetService();
+ FakeJobService jobService = new FakeJobService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
- .withJobService(new FakeJobService())
- .withDatasetService(mockDatasetService)
- .readerReturns(
- toJsonString(new TableRow().set("name", "a").set("number", "1")),
- toJsonString(new TableRow().set("name", "b").set("number", "2")),
- toJsonString(new TableRow().set("name", "c").set("number", "3")));
+ .withJobService(jobService)
+ .withDatasetService(datasetService);
- String jobIdToken = "testJobIdToken";
- String extractDestinationDir = "mock://tempLocation";
TableReference destinationTable = BigQueryHelpers.parseTableSpec("project:data_set.table_name");
+ List<TableRow> expected = ImmutableList.of(
+ new TableRow().set("name", "a").set("number", 1L),
+ new TableRow().set("name", "b").set("number", 2L),
+ new TableRow().set("name", "c").set("number", 3L),
+ new TableRow().set("name", "d").set("number", 4L),
+ new TableRow().set("name", "e").set("number", 5L),
+ new TableRow().set("name", "f").set("number", 6L));
+ datasetService.createDataset(destinationTable.getProjectId(), destinationTable.getDatasetId(),
+ "", "");
+ Table table = new Table()
+ .setTableReference(destinationTable)
+ .setSchema(new TableSchema()
+ .setFields(
+ ImmutableList.of(
+ new TableFieldSchema().setName("name").setType("STRING"),
+ new TableFieldSchema().setName("number").setType("INTEGER"))));
+ datasetService.createTable(table);
+
+ String query = FakeBigQueryServices.encodeQuery(expected);
+ jobService.expectDryRunQuery("project", query,
+ new JobStatistics().setQuery(
+ new JobStatistics2()
+ .setTotalBytesProcessed(100L)
+ .setReferencedTables(ImmutableList.of(table.getTableReference()))));
+
+ Path baseDir = Files.createTempDirectory(tempFolder, "testBigQueryNoTableQuerySourceInitSplit");
+ String jobIdToken = "testJobIdToken";
BoundedSource<TableRow> bqSource = BigQueryQuerySource.create(
- StaticValueProvider.of(jobIdToken), StaticValueProvider.of("query"),
+ StaticValueProvider.of(jobIdToken),
+ StaticValueProvider.of(query),
StaticValueProvider.of(destinationTable),
- true /* flattenResults */, true /* useLegacySql */,
- extractDestinationDir, fakeBqServices);
+ true /* flattenResults */, true /* useLegacySql */, baseDir.toString(), fakeBqServices);
- List<TableRow> expected = ImmutableList.of(
- new TableRow().set("name", "a").set("number", "1"),
- new TableRow().set("name", "b").set("number", "2"),
- new TableRow().set("name", "c").set("number", "3"));
- PipelineOptions options = PipelineOptionsFactory.create();
- options.setTempLocation(extractDestinationDir);
-
- /*
- when(mockJobService.dryRunQuery(anyString(), Mockito.<JobConfigurationQuery>any()))
- .thenReturn(new JobStatistics().setQuery(
- new JobStatistics2()
- .setTotalBytesProcessed(100L)));
- when(mockDatasetService.getTable(eq(destinationTable)))
- .thenReturn(new Table().setSchema(new TableSchema()));
- IOChannelUtils.setIOFactoryInternal("mock", mockIOChannelFactory, true);
- when(mockIOChannelFactory.resolve(anyString(), anyString()))
- .thenReturn("mock://tempLocation/output");
- when(mockJobService.pollJob(Mockito.<JobReference>any(), Mockito.anyInt()))
- .thenReturn(extractJob);*/
- Assert.assertThat(
- SourceTestUtils.readFromSource(bqSource, options),
- CoreMatchers.is(expected));
+ PipelineOptions options = PipelineOptionsFactory.create();
+ options.setTempLocation(baseDir.toString());
+ List<TableRow> read = convertBigDecimaslToLong(
+ SourceTestUtils.readFromSource(bqSource, options));
+ assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));
SourceTestUtils.assertSplitAtFractionBehavior(
bqSource, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
- assertEquals(1, sources.size());
+ assertEquals(2, sources.size());
BoundedSource<TableRow> actual = sources.get(0);
assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
-
- /*
- Mockito.verify(Service)
- .startQueryJob(
- Mockito.<JobReference>any(), Mockito.<JobConfigurationQuery>any());
- Mockito.verify(mockJobService)
- .startExtractJob(Mockito.<JobReference>any(), Mockito.<JobConfigurationExtract>any());
- Mockito.verify(mockDatasetService)
- .createDataset(anyString(), anyString(), anyString(), anyString());
- ArgumentCaptor<JobConfigurationQuery> queryConfigArg =
- ArgumentCaptor.forClass(JobConfigurationQuery.class);
- Mockito.verify(mockJobService).dryRunQuery(anyString(), queryConfigArg.capture());
- assertEquals(true, queryConfigArg.getValue().getFlattenResults());
- assertEquals(true, queryConfigArg.getValue().getUseLegacySql());*/
}
@Test
@@ -1604,12 +1580,27 @@ public class BigQueryIOTest implements Serializable {
throws Exception {
p.enableAbandonedNodeEnforcement(false);
+ // In the case where a static destination is specified (i.e. not through a dynamic table
+ // function) and there is no input data, WritePartition will generate an empty table. This
+ // code is to test that path.
+ TableReference singletonReference = new TableReference()
+ .setProjectId("projectid")
+ .setDatasetId("dataset")
+ .setTableId("table");
+ String singletonDescription = "singleton";
+ boolean isSingleton = numTables == 1 && numFilesPerTable == 0;
+
List<ShardedKey<TableDestination>> expectedPartitions = Lists.newArrayList();
- for (int i = 0; i < numTables; ++i) {
- for (int j = 1; j <= expectedNumPartitionsPerTable; ++j) {
- String tableName = String.format("project-id:dataset-id.tables%05d", i);
- TableDestination destination = new TableDestination(tableName, tableName);
- expectedPartitions.add(ShardedKey.of(destination, j));
+ if (isSingleton) {
+ expectedPartitions.add(ShardedKey.of(
+ new TableDestination(singletonReference, singletonDescription), 1));
+ } else {
+ for (int i = 0; i < numTables; ++i) {
+ for (int j = 1; j <= expectedNumPartitionsPerTable; ++j) {
+ String tableName = String.format("project-id:dataset-id.tables%05d", i);
+ TableDestination destination = new TableDestination(tableName, tableName);
+ expectedPartitions.add(ShardedKey.of(destination, j));
+ }
}
}
@@ -1642,11 +1633,7 @@ public class BigQueryIOTest implements Serializable {
WriteBundlesToFiles.ResultCoder.of());
ValueProvider<String> singletonTable = null;
- if (numFilesPerTable == 0 && numTables == 1) {
- TableReference singletonReference = new TableReference()
- .setProjectId("projectid")
- .setDatasetId("dataset")
- .setTableId("table");
+ if (isSingleton) {
singletonTable = StaticValueProvider.of(BigQueryHelpers.toJsonString(singletonReference));
}
WritePartition writePartition =
@@ -1680,12 +1667,10 @@ public class BigQueryIOTest implements Serializable {
tableFilesResult.addAll(partition.getValue());
}
- assertEquals(expectedPartitions.size(), partitionsResult.size());
+ assertThat(partitionsResult,
+ containsInAnyOrder(Iterables.toArray(expectedPartitions, ShardedKey.class)));
- // assertThat(partitionsResult,
- // containsInAnyOrder(Iterables.toArray(expectedPartitions, ShardedKey.class)));
-
- if (numFilesPerTable == 0 && numTables == 1) {
+ if (isSingleton) {
assertEquals(1, filesPerTableResult.size());
List<String> singletonFiles = filesPerTableResult.values().iterator().next();
assertTrue(Files.exists(Paths.get(singletonFiles.get(0))));
@@ -1700,15 +1685,11 @@ public class BigQueryIOTest implements Serializable {
public void testWriteTables() throws Exception {
p.enableAbandonedNodeEnforcement(false);
+ FakeDatasetService datasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
- // .startJobReturns("done", "done", "done", "done", "done", "done", "done", "done",
- // "done", "done")
- // .pollJobReturns(Status.FAILED, Status.SUCCEEDED, Status.SUCCEEDED, Status.SUCCEEDED,
- // Status.SUCCEEDED, Status.SUCCEEDED, Status.SUCCEEDED, Status.SUCCEEDED,
- // Status.SUCCEEDED, Status.SUCCEEDED))
- .withDatasetService(mockDatasetService);
-
+ .withDatasetService(datasetService);
+ datasetService.createDataset("project-id", "dataset-id", "", "");
long numTables = 3;
long numPartitions = 3;
long numFilesPerPartition = 10;
@@ -1716,6 +1697,8 @@ public class BigQueryIOTest implements Serializable {
String tempFilePrefix = "tempFilePrefix";
Map<TableDestination, List<String>> expectedTempTables = Maps.newHashMap();
+ Path baseDir = Files.createTempDirectory(tempFolder, "testWriteTables");
+
List<KV<ShardedKey<TableDestination>, Iterable<List<String>>>> partitions =
Lists.newArrayList();
for (int i = 0; i < numTables; ++i) {
@@ -1726,7 +1709,16 @@ public class BigQueryIOTest implements Serializable {
jobIdToken + "_0x%08x_%05d", tableDestination.hashCode(), j);
List<String> filesPerPartition = Lists.newArrayList();
for (int k = 0; k < numFilesPerPartition; ++k) {
- filesPerPartition.add(String.format("files0x%08x_%05d", tableDestination.hashCode(), k));
+ String filename = Paths.get(baseDir.toString(),
+ String.format("files0x%08x_%05d", tempTableId.hashCode(), k)).toString();
+ try (WritableByteChannel channel = IOChannelUtils.create(filename, MimeTypes.TEXT)) {
+ try (OutputStream output = Channels.newOutputStream(channel)) {
+ TableRow tableRow = new TableRow().set("name", tableName);
+ TableRowJsonCoder.of().encode(tableRow, output, Context.OUTER);
+ output.write("\n".getBytes(StandardCharsets.UTF_8));
+ }
+ }
+ filesPerPartition.add(filename);
}
partitions.add(KV.of(ShardedKey.of(tableDestination, j),
(Iterable<List<String>>) Collections.singleton(filesPerPartition)));
@@ -1814,25 +1806,45 @@ public class BigQueryIOTest implements Serializable {
public void testWriteRename() throws Exception {
p.enableAbandonedNodeEnforcement(false);
+ FakeDatasetService datasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
.withJobService(new FakeJobService())
- // .startJobReturns("done", "done")
- // .pollJobReturns(Status.FAILED, Status.SUCCEEDED))
- .withDatasetService(mockDatasetService);
+ .withDatasetService(datasetService);
+ datasetService.createDataset("project-id", "dataset-id", "", "");
- int numFinalTables = 3;
- int numTempTables = 3;
+ final int numFinalTables = 3;
+ final int numTempTablesPerFinalTable = 3;
+ final int numRecordsPerTempTable = 10;
+
+ Map<TableDestination, List<TableRow>> expectedRowsPerTable = Maps.newHashMap();
String jobIdToken = "jobIdToken";
- String jsonTable = "{}";
Map<TableDestination, Iterable<String>> tempTables = Maps.newHashMap();
for (int i = 0; i < numFinalTables; ++i) {
String tableName = "project-id:dataset-id.table_" + i;
- TableDestination tableDestination = new TableDestination(tableName, tableName);
+ TableDestination tableDestination = new TableDestination(
+ tableName, "table_" + i + "_desc");
List<String> tables = Lists.newArrayList();
tempTables.put(tableDestination, tables);
- for (int j = 0; i < numTempTables; ++i) {
- tables.add(String.format(
- "{\"project-id:dataset-id.tableId\":\"%s_%05d_%05d\"}", jobIdToken, i, j));
+
+ List<TableRow> expectedRows = expectedRowsPerTable.get(tableDestination);
+ if (expectedRows == null) {
+ expectedRows = Lists.newArrayList();
+ expectedRowsPerTable.put(tableDestination, expectedRows);
+ }
+ for (int j = 0; i < numTempTablesPerFinalTable; ++i) {
+ TableReference tempTable = new TableReference()
+ .setProjectId("project-id")
+ .setDatasetId("dataset-id")
+ .setTableId(String.format("%s_%05d_%05d", jobIdToken, i, j));
+ datasetService.createTable(new Table().setTableReference(tempTable));
+
+ List<TableRow> rows = Lists.newArrayList();
+ for (int k = 0; k < numRecordsPerTempTable; ++k) {
+ rows.add(new TableRow().set("number", j * numTempTablesPerFinalTable + k));
+ }
+ datasetService.insertAll(tempTable, rows, null);
+ expectedRows.addAll(rows);
+ tables.add(BigQueryHelpers.toJsonString(tempTable));
}
}
@@ -1857,37 +1869,52 @@ public class BigQueryIOTest implements Serializable {
tester.setSideInput(tempTablesView, GlobalWindow.INSTANCE, tempTables);
tester.setSideInput(jobIdTokenView, GlobalWindow.INSTANCE, jobIdToken);
tester.processElement(null);
+
+ for (Map.Entry<TableDestination, Iterable<String>> entry : tempTables.entrySet()) {
+ TableDestination tableDestination = entry.getKey();
+ TableReference tableReference = tableDestination.getTableReference();
+ Table table = checkNotNull(datasetService.getTable(tableReference));
+ assertEquals(tableReference.getTableId() + "_desc", tableDestination.getTableDescription());
+
+ List<TableRow> expectedRows = expectedRowsPerTable.get(tableDestination);
+ assertThat(datasetService.getAllRows(tableReference.getProjectId(),
+ tableReference.getDatasetId(), tableReference.getTableId()),
+ containsInAnyOrder(Iterables.toArray(expectedRows, TableRow.class)));
+
+ // Temp tables should be deleted.
+ for (String tempTableJson : entry.getValue()) {
+ TableReference tempTable = BigQueryHelpers.fromJsonString(
+ tempTableJson, TableReference.class);
+ assertEquals(null, datasetService.getTable(tempTable));
+ }
+ }
}
@Test
public void testRemoveTemporaryTables() throws Exception {
- String projectId = "someproject";
- String datasetId = "somedataset";
- List<String> tables = Lists.newArrayList("table1", "table2", "table3");
+ FakeDatasetService datasetService = new FakeDatasetService();
+ String projectId = "project";
+ String datasetId = "dataset";
+ datasetService.createDataset(projectId, datasetId, "", "");
List<TableReference> tableRefs = Lists.newArrayList(
- BigQueryHelpers.parseTableSpec(String.format("%s:%s.%s", projectId, datasetId,
- tables.get(0))),
- BigQueryHelpers.parseTableSpec(String.format("%s:%s.%s", projectId, datasetId,
- tables.get(1))),
- BigQueryHelpers.parseTableSpec(String.format("%s:%s.%s", projectId, datasetId,
- tables.get(2))));
+ BigQueryHelpers.parseTableSpec(String.format("%s:%s.%s", projectId, datasetId, "table1")),
+ BigQueryHelpers.parseTableSpec(String.format("%s:%s.%s", projectId, datasetId, "table2")),
+ BigQueryHelpers.parseTableSpec(String.format("%s:%s.%s", projectId, datasetId, "table3")));
+ for (TableReference tableRef : tableRefs) {
+ datasetService.createTable(new Table().setTableReference(tableRef));
+ }
- doThrow(new IOException("Unable to delete table"))
- .when(mockDatasetService).deleteTable(tableRefs.get(0));
- doNothing().when(mockDatasetService).deleteTable(tableRefs.get(1));
- doNothing().when(mockDatasetService).deleteTable(tableRefs.get(2));
+ // Add one more table to delete that does not actually exist.
+ tableRefs.add(
+ BigQueryHelpers.parseTableSpec(String.format("%s:%s.%s", projectId, datasetId, "table4")));
- WriteRename.removeTemporaryTables(mockDatasetService, tableRefs);
+ WriteRename.removeTemporaryTables(datasetService, tableRefs);
for (TableReference ref : tableRefs) {
loggedWriteRename.verifyDebug("Deleting table " + toJsonString(ref));
+ checkState(datasetService.getTable(ref) == null,
+ "Table " + ref + " was not deleted!");
}
- loggedWriteRename.verifyWarn("Failed to delete the table "
- + toJsonString(tableRefs.get(0)));
- loggedWriteRename.verifyNotLogged("Failed to delete the table "
- + toJsonString(tableRefs.get(1)));
- loggedWriteRename.verifyNotLogged("Failed to delete the table "
- + toJsonString(tableRefs.get(2)));
}
/** Test options. **/
@@ -1957,43 +1984,6 @@ public class BigQueryIOTest implements Serializable {
}}).length);
}
- private class WriteExtractFiles implements SerializableFunction<GenericJson, Void> {
- private final SerializableFunction<Void, Schema> schemaGenerator;
- private final Collection<Map<String, Object>> records;
-
- private WriteExtractFiles(
- SerializableFunction<Void, Schema> schemaGenerator,
- Collection<Map<String, Object>> records) {
- this.schemaGenerator = schemaGenerator;
- this.records = records;
- }
-
- @Override
- public Void apply(GenericJson input) {
- List<String> destinations = (List<String>) input.get("destinationUris");
- for (String destination : destinations) {
- String newDest = destination.replace("*", "000000000000");
- Schema schema = schemaGenerator.apply(null);
- try (WritableByteChannel channel = IOChannelUtils.create(newDest, MimeTypes.BINARY);
- DataFileWriter<GenericRecord> tableRowWriter =
- new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))
- .create(schema, Channels.newOutputStream(channel))) {
- for (Map<String, Object> record : records) {
- GenericRecordBuilder genericRecordBuilder = new GenericRecordBuilder(schema);
- for (Map.Entry<String, Object> field : record.entrySet()) {
- genericRecordBuilder.set(field.getKey(), field.getValue());
- }
- tableRowWriter.append(genericRecordBuilder.build());
- }
- } catch (IOException e) {
- throw new IllegalStateException(
- String.format("Could not create destination for extract job %s", destination), e);
- }
- }
- return null;
- }
- }
-
@Test
public void testShardedKeyCoderIsSerializableWithWellKnownCoderType() {
CoderProperties.coderSerializable(ShardedKeyCoder.of(GlobalWindow.Coder.INSTANCE));
@@ -2013,4 +2003,19 @@ public class BigQueryIOTest implements Serializable {
TableRowInfoCoder.of()),
IntervalWindow.getCoder()));
}
+
+ List<TableRow> convertBigDecimalsToLong(List<TableRow> toConvert) {
+ // The numbers come back as BigDecimal objects after JSON serialization. Change them back to
+ // longs so that we can assert the output.
+ List<TableRow> converted = Lists.newArrayList();
+ for (TableRow entry : toConvert) {
+ TableRow convertedEntry = entry.clone();
+ Object num = convertedEntry.get("number");
+ if (num instanceof BigDecimal) {
+ convertedEntry.set("number", ((BigDecimal) num).longValue());
+ }
+ converted.add(convertedEntry);
+ }
+ return converted;
+ }
}
http://git-wip-us.apache.org/repos/asf/beam/blob/b486137d/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeBigQueryServices.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeBigQueryServices.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeBigQueryServices.java
index ed3ab37..6dfd9d7 100644
--- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeBigQueryServices.java
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeBigQueryServices.java
@@ -1,39 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.beam.sdk.io.gcp.bigquery;
-import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.fromJsonString;
import static org.junit.Assert.assertEquals;
+import com.google.api.client.util.Base64;
import com.google.api.services.bigquery.model.JobConfigurationQuery;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
+import com.google.common.collect.Lists;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
import java.io.IOException;
+import java.util.List;
import java.util.NoSuchElementException;
+
+import org.apache.beam.sdk.coders.Coder.Context;
+import org.apache.beam.sdk.coders.ListCoder;
+import org.apache.beam.sdk.coders.TableRowJsonCoder;
import org.apache.beam.sdk.options.BigQueryOptions;
/**
- * Created by relax on 3/30/17.
+ * A fake implementation of BigQuery services.
*/
class FakeBigQueryServices implements BigQueryServices {
- private String[] jsonTableRowReturns = new String[0];
private JobService jobService;
- private DatasetService datasetService;
+ private FakeDatasetService datasetService;
- public FakeBigQueryServices withJobService(JobService jobService) {
+ FakeBigQueryServices withJobService(JobService jobService) {
this.jobService = jobService;
return this;
}
- public FakeBigQueryServices withDatasetService(DatasetService datasetService) {
+ FakeBigQueryServices withDatasetService(FakeDatasetService datasetService) {
this.datasetService = datasetService;
return this;
}
- public FakeBigQueryServices readerReturns(String... jsonTableRowReturns) {
- this.jsonTableRowReturns = jsonTableRowReturns;
- return this;
- }
-
@Override
public JobService getJobService(BigQueryOptions bqOptions) {
return jobService;
@@ -45,26 +65,58 @@ class FakeBigQueryServices implements BigQueryServices {
}
@Override
- public BigQueryJsonReader getReaderFromTable(
- BigQueryOptions bqOptions, TableReference tableRef) {
- return new FakeBigQueryReader(jsonTableRowReturns);
+ public BigQueryJsonReader getReaderFromTable(BigQueryOptions bqOptions, TableReference tableRef) {
+ try {
+ List<TableRow> rows = datasetService.getAllRows(
+ tableRef.getProjectId(), tableRef.getDatasetId(), tableRef.getTableId());
+ return new FakeBigQueryReader(rows);
+ } catch (Exception e) {
+ return null;
+ }
}
@Override
public BigQueryJsonReader getReaderFromQuery(
BigQueryOptions bqOptions, String projectId, JobConfigurationQuery queryConfig) {
- return new FakeBigQueryReader(jsonTableRowReturns);
+ try {
+ List<TableRow> rows = rowsFromEncodedQuery(queryConfig.getQuery());
+ return new FakeBigQueryReader(rows);
+ } catch (IOException e) {
+ return null;
+ }
+ }
+
+ static List<TableRow> rowsFromEncodedQuery(String query) throws IOException {
+ ListCoder<TableRow> listCoder = ListCoder.of(TableRowJsonCoder.of());
+ ByteArrayInputStream input = new ByteArrayInputStream(Base64.decodeBase64(query));
+ List<TableRow> rows = listCoder.decode(input, Context.OUTER);
+ for (TableRow row : rows) {
+ convertNumbers(row);
+ }
+ return rows;
+ }
+
+ static String encodeQuery(List<TableRow> rows) throws IOException {
+ ListCoder<TableRow> listCoder = ListCoder.of(TableRowJsonCoder.of());
+ ByteArrayOutputStream output = new ByteArrayOutputStream();
+ listCoder.encode(rows, output, Context.OUTER);
+ return Base64.encodeBase64String(output.toByteArray());
}
private static class FakeBigQueryReader implements BigQueryJsonReader {
private static final int UNSTARTED = -1;
private static final int CLOSED = Integer.MAX_VALUE;
- private String[] jsonTableRowReturns;
+ private List<byte[]> serializedTableRowReturns;
private int currIndex;
- FakeBigQueryReader(String[] jsonTableRowReturns) {
- this.jsonTableRowReturns = jsonTableRowReturns;
+ FakeBigQueryReader(List<TableRow> tableRowReturns) throws IOException {
+ this.serializedTableRowReturns = Lists.newArrayListWithExpectedSize(tableRowReturns.size());
+ for (TableRow tableRow : tableRowReturns) {
+ ByteArrayOutputStream output = new ByteArrayOutputStream();
+ TableRowJsonCoder.of().encode(tableRow, output, Context.OUTER);
+ serializedTableRowReturns.add(output.toByteArray());
+ }
this.currIndex = UNSTARTED;
}
@@ -72,20 +124,27 @@ class FakeBigQueryServices implements BigQueryServices {
public boolean start() throws IOException {
assertEquals(UNSTARTED, currIndex);
currIndex = 0;
- return currIndex < jsonTableRowReturns.length;
+ return currIndex < serializedTableRowReturns.size();
}
@Override
public boolean advance() throws IOException {
- return ++currIndex < jsonTableRowReturns.length;
+ return ++currIndex < serializedTableRowReturns.size();
}
@Override
public TableRow getCurrent() throws NoSuchElementException {
- if (currIndex >= jsonTableRowReturns.length) {
+ if (currIndex >= serializedTableRowReturns.size()) {
throw new NoSuchElementException();
}
- return fromJsonString(jsonTableRowReturns[currIndex], TableRow.class);
+
+ ByteArrayInputStream input = new ByteArrayInputStream(
+ serializedTableRowReturns.get(currIndex));
+ try {
+ return convertNumbers(TableRowJsonCoder.of().decode(input, Context.OUTER));
+ } catch (IOException e) {
+ return null;
+ }
}
@Override
@@ -93,4 +152,15 @@ class FakeBigQueryServices implements BigQueryServices {
currIndex = CLOSED;
}
}
+
+
+ // Longs tend to get converted back to Integers due to JSON serialization. Convert them back.
+ static TableRow convertNumbers(TableRow tableRow) {
+ for (TableRow.Entry entry : tableRow.entrySet()) {
+ if (entry.getValue() instanceof Integer) {
+ entry.setValue(new Long((Integer) entry.getValue()));
+ }
+ }
+ return tableRow;
+ }
}
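A quick sketch of how a test might use the Base64 query helpers above (encodeQuery / rowsFromEncodedQuery) to feed rows through the fake query path. This is only an illustration under a few assumptions: the sketch lives in the same org.apache.beam.sdk.io.gcp.bigquery test package (the helpers are package-private), JUnit is on the classpath, and the class name QueryEncodingSketch is made up for the example.
package org.apache.beam.sdk.io.gcp.bigquery;
import static org.junit.Assert.assertEquals;
import com.google.api.services.bigquery.model.TableRow;
import com.google.common.collect.Lists;
import java.util.List;
import org.junit.Test;
/** Illustrative sketch exercising the fake query encoding round trip. */
public class QueryEncodingSketch {
  @Test
  public void queryEncodingRoundTrips() throws Exception {
    // Rows that the fake query reader should hand back to the pipeline under test.
    List<TableRow> rows = Lists.newArrayList(
        new TableRow().set("number", 1L),
        new TableRow().set("number", 2L));
    // encodeQuery packs the rows into an opaque Base64 string; a test passes this
    // string as the query of a JobConfigurationQuery handled by FakeBigQueryServices.
    String encodedQuery = FakeBigQueryServices.encodeQuery(rows);
    // rowsFromEncodedQuery decodes the same string back into TableRows.
    List<TableRow> decoded = FakeBigQueryServices.rowsFromEncodedQuery(encodedQuery);
    assertEquals(rows.size(), decoded.size());
    // Compare as strings because JSON decoding may change the numeric boxing type.
    assertEquals("1", String.valueOf(decoded.get(0).get("number")));
    assertEquals("2", String.valueOf(decoded.get(1).get("number")));
  }
}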
http://git-wip-us.apache.org/repos/asf/beam/blob/b486137d/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeDatasetService.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeDatasetService.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeDatasetService.java
index 9b2cf63..5103adb 100644
--- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeDatasetService.java
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeDatasetService.java
@@ -1,9 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.beam.sdk.io.gcp.bigquery;
-import static com.google.common.base.Preconditions.checkNotNull;
import static org.junit.Assert.assertEquals;
+import com.google.api.client.googleapis.json.GoogleJsonResponseException;
+import com.google.api.client.http.HttpHeaders;
import com.google.api.services.bigquery.model.Dataset;
+import com.google.api.services.bigquery.model.DatasetReference;
import com.google.api.services.bigquery.model.Table;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
@@ -24,13 +44,13 @@ class FakeDatasetService implements DatasetService, Serializable {
throws InterruptedException, IOException {
synchronized (BigQueryIOTest.tables) {
Map<String, TableContainer> dataset =
- checkNotNull(
- BigQueryIOTest.tables.get(tableRef.getProjectId(), tableRef.getDatasetId()),
- "Tried to get a dataset %s:%s from %s, but no such dataset was set",
- tableRef.getProjectId(),
- tableRef.getDatasetId(),
- tableRef.getTableId(),
- FakeDatasetService.class.getSimpleName());
+ BigQueryIOTest.tables.get(tableRef.getProjectId(), tableRef.getDatasetId());
+ if (dataset == null) {
+ throwNotFound(
+ "Tried to get a dataset %s:%s from, but no such dataset was set",
+ tableRef.getProjectId(),
+ tableRef.getDatasetId());
+ }
TableContainer tableContainer = dataset.get(tableRef.getTableId());
return tableContainer == null ? null : tableContainer.getTable();
}
@@ -44,27 +64,40 @@ class FakeDatasetService implements DatasetService, Serializable {
}
private TableContainer getTableContainer(String projectId, String datasetId, String tableId)
- throws InterruptedException, IOException {
- synchronized (BigQueryIOTest.tables) {
- Map<String, TableContainer> dataset =
- checkNotNull(
- BigQueryIOTest.tables.get(projectId, datasetId),
- "Tried to get a dataset %s:%s from %s, but no such dataset was set",
- projectId,
- datasetId,
- FakeDatasetService.class.getSimpleName());
- return checkNotNull(dataset.get(tableId),
- "Tried to get a table %s:%s.%s from %s, but no such table was set",
- projectId,
- datasetId,
- tableId,
- FakeDatasetService.class.getSimpleName());
- }
+ throws InterruptedException, IOException {
+ synchronized (BigQueryIOTest.tables) {
+ Map<String, TableContainer> dataset = BigQueryIOTest.tables.get(projectId, datasetId);
+ if (dataset == null) {
+ throwNotFound(
+ "Tried to get a dataset %s:%s, but no such dataset was set",
+ projectId,
+ datasetId);
+ }
+ TableContainer tableContainer = dataset.get(tableId);
+ if (tableContainer == null) {
+ throwNotFound(
+ "Tried to get a table %s:%s.%s, but no such table was set",
+ projectId,
+ datasetId,
+ tableId);
+ }
+ return tableContainer;
+ }
}
@Override
public void deleteTable(TableReference tableRef) throws IOException, InterruptedException {
- throw new UnsupportedOperationException("Unsupported");
+ synchronized (BigQueryIOTest.tables) {
+ Map<String, TableContainer> dataset =
+ BigQueryIOTest.tables.get(tableRef.getProjectId(), tableRef.getDatasetId());
+ if (dataset == null) {
+ throwNotFound(
+ "Tried to get a dataset %s:%s, but no such table was set",
+ tableRef.getProjectId(),
+ tableRef.getDatasetId());
+ }
+ dataset.remove(tableRef.getTableId());
+ }
}
@@ -73,13 +106,13 @@ class FakeDatasetService implements DatasetService, Serializable {
TableReference tableReference = table.getTableReference();
synchronized (BigQueryIOTest.tables) {
Map<String, TableContainer> dataset =
- checkNotNull(
- BigQueryIOTest.tables.get(tableReference.getProjectId(),
- tableReference.getDatasetId()),
- "Tried to get a dataset %s:%s from %s, but no such table was set",
- tableReference.getProjectId(),
- tableReference.getDatasetId(),
- FakeDatasetService.class.getSimpleName());
+ BigQueryIOTest.tables.get(tableReference.getProjectId(), tableReference.getDatasetId());
+ if (dataset == null) {
+ throwNotFound(
+ "Tried to get a dataset %s:%s, but no such table was set",
+ tableReference.getProjectId(),
+ tableReference.getDatasetId());
+ }
TableContainer tableContainer = dataset.get(tableReference.getTableId());
if (tableContainer == null) {
tableContainer = new TableContainer(table);
@@ -98,7 +131,16 @@ class FakeDatasetService implements DatasetService, Serializable {
@Override
public Dataset getDataset(
String projectId, String datasetId) throws IOException, InterruptedException {
- throw new UnsupportedOperationException("Unsupported");
+ synchronized (BigQueryIOTest.tables) {
+ Map<String, TableContainer> dataset = BigQueryIOTest.tables.get(projectId, datasetId);
+ if (dataset == null) {
+ throwNotFound("Tried to get a dataset %s:%s, but no such table was set",
+ projectId, datasetId);
+ }
+ return new Dataset().setDatasetReference(new DatasetReference()
+ .setDatasetId(datasetId)
+ .setProjectId(projectId));
+ }
}
@Override
@@ -117,7 +159,9 @@ class FakeDatasetService implements DatasetService, Serializable {
@Override
public void deleteDataset(String projectId, String datasetId)
throws IOException, InterruptedException {
- throw new UnsupportedOperationException("Unsupported");
+ synchronized (BigQueryIOTest.tables) {
+ BigQueryIOTest.tables.remove(projectId, datasetId);
+ }
}
@Override
@@ -138,8 +182,7 @@ class FakeDatasetService implements DatasetService, Serializable {
TableContainer tableContainer = getTableContainer(
ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
for (int i = 0; i < rowList.size(); ++i) {
- tableContainer.addRow(rowList.get(i), insertIdList.get(i));
- dataSize += rowList.get(i).toString().length();
+ dataSize += tableContainer.addRow(rowList.get(i), insertIdList.get(i));
}
return dataSize;
}
@@ -150,23 +193,16 @@ class FakeDatasetService implements DatasetService, Serializable {
@Nullable String tableDescription)
throws IOException, InterruptedException {
synchronized (BigQueryIOTest.tables) {
- Map<String, TableContainer> dataset =
- checkNotNull(
- BigQueryIOTest.tables.get(tableReference.getProjectId(),
- tableReference.getDatasetId()),
- "Tried to get a dataset %s:%s from %s, but no such dataset was set",
- tableReference.getProjectId(),
- tableReference.getDatasetId(),
- tableReference.getTableId(),
- FakeDatasetService.class.getSimpleName());
- TableContainer tableContainer = checkNotNull(dataset.get(tableReference.getTableId()),
- "Tried to patch a table %s:%s.%s from %s, but no such table was set",
- tableReference.getProjectId(),
- tableReference.getDatasetId(),
- tableReference.getTableId(),
- FakeDatasetService.class.getSimpleName());
+ TableContainer tableContainer = getTableContainer(tableReference.getProjectId(),
+ tableReference.getDatasetId(), tableReference.getTableId());
tableContainer.getTable().setDescription(tableDescription);
return tableContainer.getTable();
}
}
+
+ void throwNotFound(String format, Object... args) throws IOException {
+ throw new IOException(
+ new GoogleJsonResponseException.Builder(404,
+ String.format(format, args), new HttpHeaders()).build());
+ }
}
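Because throwNotFound above surfaces a missing dataset or table as an IOException wrapping a GoogleJsonResponseException with status 404, test code can distinguish "not found" from other failures. A minimal sketch of that check, under the assumption that callers only care about the 404 case; the class and method names here are illustrative.
package org.apache.beam.sdk.io.gcp.bigquery;
import com.google.api.client.googleapis.json.GoogleJsonResponseException;
import java.io.IOException;
/** Illustrative helper for recognizing the simulated 404 from FakeDatasetService. */
class NotFoundSketch {
  /** Returns true if the exception wraps an HTTP 404 response, false otherwise. */
  static boolean isNotFound(IOException e) {
    Throwable cause = e.getCause();
    if (cause instanceof GoogleJsonResponseException) {
      // throwNotFound builds the wrapped GoogleJsonResponseException with code 404.
      return ((GoogleJsonResponseException) cause).getStatusCode() == 404;
    }
    return false;
  }
}
A test could, for example, call datasetService.getDataset("project", "missing-dataset") in a try block and treat isNotFound(e) as the expected outcome.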
[43/50] [abbrv] beam git commit: This closes #2544
Posted by dh...@apache.org.
This closes #2544
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/714fdd29
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/714fdd29
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/714fdd29
Branch: refs/heads/DSL_SQL
Commit: 714fdd2923ed379eba1de9aaae5d76cb02d69b20
Parents: 8319369 97c6678
Author: chamikara@google.com <ch...@google.com>
Authored: Wed Apr 19 09:56:39 2017 -0700
Committer: chamikara@google.com <ch...@google.com>
Committed: Wed Apr 19 09:56:39 2017 -0700
----------------------------------------------------------------------
sdks/python/apache_beam/io/fileio.py | 90 -------------------------------
1 file changed, 90 deletions(-)
----------------------------------------------------------------------
[24/50] [abbrv] beam git commit: [BEAM-1994] Remove Flink examples
package
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/UnboundedSourceWrapperTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/UnboundedSourceWrapperTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/UnboundedSourceWrapperTest.java
new file mode 100644
index 0000000..90f95d6
--- /dev/null
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/UnboundedSourceWrapperTest.java
@@ -0,0 +1,464 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.streaming;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.io.UnboundedSourceWrapper;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.io.UnboundedSource;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.values.KV;
+import org.apache.flink.api.common.ExecutionConfig;
+import org.apache.flink.api.common.accumulators.Accumulator;
+import org.apache.flink.api.common.state.ListState;
+import org.apache.flink.api.common.state.ListStateDescriptor;
+import org.apache.flink.api.common.state.OperatorStateStore;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.runtime.execution.Environment;
+import org.apache.flink.runtime.operators.testutils.DummyEnvironment;
+import org.apache.flink.runtime.state.StateInitializationContext;
+import org.apache.flink.runtime.state.StateSnapshotContextSynchronousImpl;
+import org.apache.flink.streaming.api.TimeCharacteristic;
+import org.apache.flink.streaming.api.graph.StreamConfig;
+import org.apache.flink.streaming.api.operators.Output;
+import org.apache.flink.streaming.api.operators.StreamSource;
+import org.apache.flink.streaming.api.watermark.Watermark;
+import org.apache.flink.streaming.runtime.streamrecord.LatencyMarker;
+import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
+import org.apache.flink.streaming.runtime.tasks.StreamTask;
+import org.apache.flink.streaming.runtime.tasks.TestProcessingTimeService;
+import org.apache.flink.util.InstantiationUtil;
+import org.junit.Test;
+import org.junit.experimental.runners.Enclosed;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.mockito.Matchers;
+
+/**
+ * Tests for {@link UnboundedSourceWrapper}.
+ */
+@RunWith(Enclosed.class)
+public class UnboundedSourceWrapperTest {
+
+ /**
+ * Parameterized tests.
+ */
+ @RunWith(Parameterized.class)
+ public static class UnboundedSourceWrapperTestWithParams {
+ private final int numTasks;
+ private final int numSplits;
+
+ public UnboundedSourceWrapperTestWithParams(int numTasks, int numSplits) {
+ this.numTasks = numTasks;
+ this.numSplits = numSplits;
+ }
+
+ @Parameterized.Parameters
+ public static Collection<Object[]> data() {
+ /*
+ * Parameters for initializing the tests:
+ * {numTasks, numSplits}
+ * The test currently assumes powers of two for some assertions.
+ */
+ return Arrays.asList(new Object[][]{
+ {1, 1}, {1, 2}, {1, 4},
+ {2, 1}, {2, 2}, {2, 4},
+ {4, 1}, {4, 2}, {4, 4}
+ });
+ }
+
+ /**
+ * Creates a {@link UnboundedSourceWrapper} that has one or multiple readers per source.
+ * If numSplits > numTasks, one source will manage multiple readers.
+ */
+ @Test
+ public void testReaders() throws Exception {
+ final int numElements = 20;
+ final Object checkpointLock = new Object();
+ PipelineOptions options = PipelineOptionsFactory.create();
+
+ // this source will emit exactly NUM_ELEMENTS across all parallel readers,
+ // afterwards it will stall. We check whether we also receive NUM_ELEMENTS
+ // elements later.
+ TestCountingSource source = new TestCountingSource(numElements);
+ UnboundedSourceWrapper<KV<Integer, Integer>, TestCountingSource.CounterMark> flinkWrapper =
+ new UnboundedSourceWrapper<>(options, source, numSplits);
+
+ assertEquals(numSplits, flinkWrapper.getSplitSources().size());
+
+ StreamSource<WindowedValue<
+ KV<Integer, Integer>>,
+ UnboundedSourceWrapper<
+ KV<Integer, Integer>,
+ TestCountingSource.CounterMark>> sourceOperator = new StreamSource<>(flinkWrapper);
+
+ setupSourceOperator(sourceOperator, numTasks);
+
+ try {
+ sourceOperator.open();
+ sourceOperator.run(checkpointLock,
+ new Output<StreamRecord<WindowedValue<KV<Integer, Integer>>>>() {
+ private int count = 0;
+
+ @Override
+ public void emitWatermark(Watermark watermark) {
+ }
+
+ @Override
+ public void emitLatencyMarker(LatencyMarker latencyMarker) {
+ }
+
+ @Override
+ public void collect(
+ StreamRecord<WindowedValue<KV<Integer, Integer>>> windowedValueStreamRecord) {
+
+ count++;
+ if (count >= numElements) {
+ throw new SuccessException();
+ }
+ }
+
+ @Override
+ public void close() {
+
+ }
+ });
+ } catch (SuccessException e) {
+
+ assertEquals(Math.max(1, numSplits / numTasks), flinkWrapper.getLocalSplitSources().size());
+
+ // success
+ return;
+ }
+ fail("Read terminated without producing expected number of outputs");
+ }
+
+ /**
+ * Verify that snapshot/restore work as expected. We bring up a source and cancel
+ * after seeing a certain number of elements. Then we snapshot that source,
+ * bring up a completely new source that we restore from the snapshot and verify
+ * that we see all expected elements in the end.
+ */
+ @Test
+ public void testRestore() throws Exception {
+ final int numElements = 20;
+ final Object checkpointLock = new Object();
+ PipelineOptions options = PipelineOptionsFactory.create();
+
+ // this source will emit exactly NUM_ELEMENTS across all parallel readers,
+ // afterwards it will stall. We check whether we also receive NUM_ELEMENTS
+ // elements later.
+ TestCountingSource source = new TestCountingSource(numElements);
+ UnboundedSourceWrapper<KV<Integer, Integer>, TestCountingSource.CounterMark> flinkWrapper =
+ new UnboundedSourceWrapper<>(options, source, numSplits);
+
+ assertEquals(numSplits, flinkWrapper.getSplitSources().size());
+
+ StreamSource<
+ WindowedValue<KV<Integer, Integer>>,
+ UnboundedSourceWrapper<
+ KV<Integer, Integer>,
+ TestCountingSource.CounterMark>> sourceOperator = new StreamSource<>(flinkWrapper);
+
+
+ OperatorStateStore backend = mock(OperatorStateStore.class);
+
+ TestingListState<KV<UnboundedSource, TestCountingSource.CounterMark>>
+ listState = new TestingListState<>();
+
+ when(backend.getOperatorState(Matchers.any(ListStateDescriptor.class)))
+ .thenReturn(listState);
+
+ StateInitializationContext initializationContext = mock(StateInitializationContext.class);
+
+ when(initializationContext.getOperatorStateStore()).thenReturn(backend);
+ when(initializationContext.isRestored()).thenReturn(false, true);
+
+ flinkWrapper.initializeState(initializationContext);
+
+ setupSourceOperator(sourceOperator, numTasks);
+
+ final Set<KV<Integer, Integer>> emittedElements = new HashSet<>();
+
+ boolean readFirstBatchOfElements = false;
+
+ try {
+ sourceOperator.open();
+ sourceOperator.run(checkpointLock,
+ new Output<StreamRecord<WindowedValue<KV<Integer, Integer>>>>() {
+ private int count = 0;
+
+ @Override
+ public void emitWatermark(Watermark watermark) {
+ }
+
+ @Override
+ public void emitLatencyMarker(LatencyMarker latencyMarker) {
+ }
+
+ @Override
+ public void collect(
+ StreamRecord<WindowedValue<KV<Integer, Integer>>> windowedValueStreamRecord) {
+
+ emittedElements.add(windowedValueStreamRecord.getValue().getValue());
+ count++;
+ if (count >= numElements / 2) {
+ throw new SuccessException();
+ }
+ }
+
+ @Override
+ public void close() {
+
+ }
+ });
+ } catch (SuccessException e) {
+ // success
+ readFirstBatchOfElements = true;
+ }
+
+ assertTrue("Did not successfully read first batch of elements.", readFirstBatchOfElements);
+
+ // draw a snapshot
+ flinkWrapper.snapshotState(new StateSnapshotContextSynchronousImpl(0, 0));
+
+ // test snapshot offsets
+ assertEquals(flinkWrapper.getLocalSplitSources().size(),
+ listState.getList().size());
+ int totalEmit = 0;
+ for (KV<UnboundedSource, TestCountingSource.CounterMark> kv : listState.get()) {
+ totalEmit += kv.getValue().current + 1;
+ }
+ assertEquals(numElements / 2, totalEmit);
+
+ // test that finalizeCheckpoint on CheckpointMark is called
+ final ArrayList<Integer> finalizeList = new ArrayList<>();
+ TestCountingSource.setFinalizeTracker(finalizeList);
+ flinkWrapper.notifyCheckpointComplete(0);
+ assertEquals(flinkWrapper.getLocalSplitSources().size(), finalizeList.size());
+
+ // create a completely new source but restore from the snapshot
+ TestCountingSource restoredSource = new TestCountingSource(numElements);
+ UnboundedSourceWrapper<
+ KV<Integer, Integer>, TestCountingSource.CounterMark> restoredFlinkWrapper =
+ new UnboundedSourceWrapper<>(options, restoredSource, numSplits);
+
+ assertEquals(numSplits, restoredFlinkWrapper.getSplitSources().size());
+
+ StreamSource<
+ WindowedValue<KV<Integer, Integer>>,
+ UnboundedSourceWrapper<
+ KV<Integer, Integer>,
+ TestCountingSource.CounterMark>> restoredSourceOperator =
+ new StreamSource<>(restoredFlinkWrapper);
+
+ setupSourceOperator(restoredSourceOperator, numTasks);
+
+ // restore snapshot
+ restoredFlinkWrapper.initializeState(initializationContext);
+
+ boolean readSecondBatchOfElements = false;
+
+ // run again and verify that we see the other elements
+ try {
+ restoredSourceOperator.open();
+ restoredSourceOperator.run(checkpointLock,
+ new Output<StreamRecord<WindowedValue<KV<Integer, Integer>>>>() {
+ private int count = 0;
+
+ @Override
+ public void emitWatermark(Watermark watermark) {
+ }
+
+ @Override
+ public void emitLatencyMarker(LatencyMarker latencyMarker) {
+ }
+
+ @Override
+ public void collect(
+ StreamRecord<WindowedValue<KV<Integer, Integer>>> windowedValueStreamRecord) {
+ emittedElements.add(windowedValueStreamRecord.getValue().getValue());
+ count++;
+ if (count >= numElements / 2) {
+ throw new SuccessException();
+ }
+ }
+
+ @Override
+ public void close() {
+
+ }
+ });
+ } catch (SuccessException e) {
+ // success
+ readSecondBatchOfElements = true;
+ }
+
+ assertEquals(Math.max(1, numSplits / numTasks), flinkWrapper.getLocalSplitSources().size());
+
+ assertTrue("Did not successfully read second batch of elements.", readSecondBatchOfElements);
+
+ // verify that we saw all NUM_ELEMENTS elements
+ assertTrue(emittedElements.size() == numElements);
+ }
+
+ @Test
+ public void testNullCheckpoint() throws Exception {
+ final int numElements = 20;
+ PipelineOptions options = PipelineOptionsFactory.create();
+
+ TestCountingSource source = new TestCountingSource(numElements) {
+ @Override
+ public Coder<CounterMark> getCheckpointMarkCoder() {
+ return null;
+ }
+ };
+ UnboundedSourceWrapper<KV<Integer, Integer>, TestCountingSource.CounterMark> flinkWrapper =
+ new UnboundedSourceWrapper<>(options, source, numSplits);
+
+ OperatorStateStore backend = mock(OperatorStateStore.class);
+
+ TestingListState<KV<UnboundedSource, TestCountingSource.CounterMark>>
+ listState = new TestingListState<>();
+
+ when(backend.getOperatorState(Matchers.any(ListStateDescriptor.class)))
+ .thenReturn(listState);
+
+ StateInitializationContext initializationContext = mock(StateInitializationContext.class);
+
+ when(initializationContext.getOperatorStateStore()).thenReturn(backend);
+ when(initializationContext.isRestored()).thenReturn(false, true);
+
+ flinkWrapper.initializeState(initializationContext);
+
+ StreamSource sourceOperator = new StreamSource<>(flinkWrapper);
+ setupSourceOperator(sourceOperator, numTasks);
+ sourceOperator.open();
+
+ flinkWrapper.snapshotState(new StateSnapshotContextSynchronousImpl(0, 0));
+
+ assertEquals(0, listState.getList().size());
+
+ UnboundedSourceWrapper<
+ KV<Integer, Integer>, TestCountingSource.CounterMark> restoredFlinkWrapper =
+ new UnboundedSourceWrapper<>(options, new TestCountingSource(numElements),
+ numSplits);
+
+ StreamSource restoredSourceOperator = new StreamSource<>(flinkWrapper);
+ setupSourceOperator(restoredSourceOperator, numTasks);
+ sourceOperator.open();
+
+ restoredFlinkWrapper.initializeState(initializationContext);
+
+ assertEquals(Math.max(1, numSplits / numTasks), flinkWrapper.getLocalSplitSources().size());
+
+ }
+
+ @SuppressWarnings("unchecked")
+ private static <T> void setupSourceOperator(StreamSource<T, ?> operator, int numSubTasks) {
+ ExecutionConfig executionConfig = new ExecutionConfig();
+ StreamConfig cfg = new StreamConfig(new Configuration());
+
+ cfg.setTimeCharacteristic(TimeCharacteristic.EventTime);
+
+ Environment env = new DummyEnvironment("MockTwoInputTask", numSubTasks, 0);
+
+ StreamTask<?, ?> mockTask = mock(StreamTask.class);
+ when(mockTask.getName()).thenReturn("Mock Task");
+ when(mockTask.getCheckpointLock()).thenReturn(new Object());
+ when(mockTask.getConfiguration()).thenReturn(cfg);
+ when(mockTask.getEnvironment()).thenReturn(env);
+ when(mockTask.getExecutionConfig()).thenReturn(executionConfig);
+ when(mockTask.getAccumulatorMap())
+ .thenReturn(Collections.<String, Accumulator<?, ?>>emptyMap());
+ TestProcessingTimeService testProcessingTimeService = new TestProcessingTimeService();
+ when(mockTask.getProcessingTimeService()).thenReturn(testProcessingTimeService);
+
+ operator.setup(mockTask, cfg, (Output<StreamRecord<T>>) mock(Output.class));
+ }
+
+ /**
+ * A special {@link RuntimeException} that we throw to signal that the test was successful.
+ */
+ private static class SuccessException extends RuntimeException {
+ }
+ }
+
+ /**
+ * Non-parameterized tests.
+ */
+ public static class BasicTest {
+
+ /**
+ * Check serialization of a {@link UnboundedSourceWrapper}.
+ */
+ @Test
+ public void testSerialization() throws Exception {
+ final int parallelism = 1;
+ final int numElements = 20;
+ PipelineOptions options = PipelineOptionsFactory.create();
+
+ TestCountingSource source = new TestCountingSource(numElements);
+ UnboundedSourceWrapper<KV<Integer, Integer>, TestCountingSource.CounterMark> flinkWrapper =
+ new UnboundedSourceWrapper<>(options, source, parallelism);
+
+ InstantiationUtil.serializeObject(flinkWrapper);
+ }
+
+ }
+
+ private static final class TestingListState<T> implements ListState<T> {
+
+ private final List<T> list = new ArrayList<>();
+
+ @Override
+ public void clear() {
+ list.clear();
+ }
+
+ @Override
+ public Iterable<T> get() throws Exception {
+ return list;
+ }
+
+ @Override
+ public void add(T value) throws Exception {
+ list.add(value);
+ }
+
+ public List<T> getList() {
+ return list;
+ }
+
+ }
+
+}
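To make the split-distribution assertions in the tests above concrete: for the power-of-two parameter grid, the expected number of locally managed splits for the subtask under test is Math.max(1, numSplits / numTasks), which is what testReaders and testRestore check against getLocalSplitSources(). A purely illustrative sketch of that arithmetic:
public class LocalSplitCountSketch {
  public static void main(String[] args) {
    // Mirrors the {numTasks, numSplits} grid used by the parameterized tests.
    int[][] params = {{1, 1}, {1, 2}, {1, 4}, {2, 1}, {2, 2}, {2, 4}, {4, 1}, {4, 2}, {4, 4}};
    for (int[] p : params) {
      int numTasks = p[0];
      int numSplits = p[1];
      // Expected value of flinkWrapper.getLocalSplitSources().size() for the subtask under test.
      int expectedLocalSplits = Math.max(1, numSplits / numTasks);
      System.out.printf("numTasks=%d numSplits=%d -> expected local splits=%d%n",
          numTasks, numSplits, expectedLocalSplits);
    }
  }
}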
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/package-info.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/package-info.java
new file mode 100644
index 0000000..08a1e03
--- /dev/null
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/streaming/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Internal implementation of the Beam runner for Apache Flink.
+ */
+package org.apache.beam.runners.flink.streaming;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/resources/log4j-test.properties
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/resources/log4j-test.properties b/runners/flink/src/test/resources/log4j-test.properties
new file mode 100644
index 0000000..4c74d85
--- /dev/null
+++ b/runners/flink/src/test/resources/log4j-test.properties
@@ -0,0 +1,27 @@
+################################################################################
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+# Set root logger level to OFF to not flood build logs
+# set manually to INFO for debugging purposes
+log4j.rootLogger=OFF, testlogger
+
+# A1 is set to be a ConsoleAppender.
+log4j.appender.testlogger=org.apache.log4j.ConsoleAppender
+log4j.appender.testlogger.target = System.err
+log4j.appender.testlogger.layout=org.apache.log4j.PatternLayout
+log4j.appender.testlogger.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
[40/50] [abbrv] beam git commit: [BEAM-1994] Remove Flink examples
package
Posted by dh...@apache.org.
[BEAM-1994] Remove Flink examples package
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/cdd2544b
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/cdd2544b
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/cdd2544b
Branch: refs/heads/DSL_SQL
Commit: cdd2544ba6dd6ac4aa80c65ecd8e01ab3cf664aa
Parents: 8a00f22
Author: Ismaël Mejía <ie...@apache.org>
Authored: Tue Apr 18 17:31:07 2017 +0200
Committer: Ismaël Mejía <ie...@apache.org>
Committed: Wed Apr 19 13:37:06 2017 +0200
----------------------------------------------------------------------
...PostCommit_Java_ValidatesRunner_Flink.groovy | 2 +-
runners/flink/examples/pom.xml | 130 ---
.../beam/runners/flink/examples/TFIDF.java | 455 --------
.../beam/runners/flink/examples/WordCount.java | 129 ---
.../runners/flink/examples/package-info.java | 22 -
.../flink/examples/streaming/AutoComplete.java | 400 -------
.../flink/examples/streaming/JoinExamples.java | 154 ---
.../examples/streaming/WindowedWordCount.java | 141 ---
.../flink/examples/streaming/package-info.java | 22 -
runners/flink/pom.xml | 275 ++++-
runners/flink/runner/pom.xml | 330 ------
.../flink/DefaultParallelismFactory.java | 39 -
.../flink/FlinkBatchPipelineTranslator.java | 139 ---
.../flink/FlinkBatchTransformTranslators.java | 723 ------------
.../flink/FlinkBatchTranslationContext.java | 153 ---
.../flink/FlinkDetachedRunnerResult.java | 75 --
.../FlinkPipelineExecutionEnvironment.java | 241 ----
.../runners/flink/FlinkPipelineOptions.java | 101 --
.../runners/flink/FlinkPipelineTranslator.java | 53 -
.../apache/beam/runners/flink/FlinkRunner.java | 232 ----
.../runners/flink/FlinkRunnerRegistrar.java | 62 --
.../beam/runners/flink/FlinkRunnerResult.java | 98 --
.../flink/FlinkStreamingPipelineTranslator.java | 276 -----
.../FlinkStreamingTransformTranslators.java | 1044 -----------------
.../flink/FlinkStreamingTranslationContext.java | 130 ---
.../flink/FlinkStreamingViewOverrides.java | 372 -------
.../flink/PipelineTranslationOptimizer.java | 72 --
.../beam/runners/flink/TestFlinkRunner.java | 84 --
.../beam/runners/flink/TranslationMode.java | 31 -
.../apache/beam/runners/flink/package-info.java | 22 -
.../functions/FlinkAggregatorFactory.java | 53 -
.../functions/FlinkAssignContext.java | 63 --
.../functions/FlinkAssignWindows.java | 49 -
.../functions/FlinkDoFnFunction.java | 161 ---
.../FlinkMergingNonShuffleReduceFunction.java | 228 ----
.../FlinkMergingPartialReduceFunction.java | 201 ----
.../functions/FlinkMergingReduceFunction.java | 199 ----
.../FlinkMultiOutputPruningFunction.java | 50 -
.../functions/FlinkNoOpStepContext.java | 73 --
.../functions/FlinkPartialReduceFunction.java | 172 ---
.../functions/FlinkReduceFunction.java | 173 ---
.../functions/FlinkSideInputReader.java | 80 --
.../functions/FlinkStatefulDoFnFunction.java | 198 ----
.../functions/SideInputInitializer.java | 73 --
.../translation/functions/package-info.java | 22 -
.../runners/flink/translation/package-info.java | 22 -
.../translation/types/CoderTypeInformation.java | 120 --
.../translation/types/CoderTypeSerializer.java | 132 ---
.../types/EncodedValueComparator.java | 195 ----
.../types/EncodedValueSerializer.java | 113 --
.../types/EncodedValueTypeInformation.java | 98 --
.../types/InspectableByteArrayOutputStream.java | 34 -
.../flink/translation/types/KvKeySelector.java | 50 -
.../flink/translation/types/package-info.java | 22 -
.../utils/SerializedPipelineOptions.java | 67 --
.../flink/translation/utils/package-info.java | 22 -
.../wrappers/DataInputViewWrapper.java | 58 -
.../wrappers/DataOutputViewWrapper.java | 51 -
.../SerializableFnAggregatorWrapper.java | 98 --
.../translation/wrappers/SourceInputFormat.java | 150 ---
.../translation/wrappers/SourceInputSplit.java | 52 -
.../translation/wrappers/package-info.java | 22 -
.../wrappers/streaming/DoFnOperator.java | 774 -------------
.../streaming/KvToByteBufferKeySelector.java | 56 -
.../streaming/SingletonKeyedWorkItem.java | 56 -
.../streaming/SingletonKeyedWorkItemCoder.java | 126 ---
.../streaming/SplittableDoFnOperator.java | 150 ---
.../wrappers/streaming/WindowDoFnOperator.java | 117 --
.../wrappers/streaming/WorkItemKeySelector.java | 56 -
.../streaming/io/BoundedSourceWrapper.java | 218 ----
.../streaming/io/UnboundedSocketSource.java | 249 -----
.../streaming/io/UnboundedSourceWrapper.java | 476 --------
.../wrappers/streaming/io/package-info.java | 22 -
.../wrappers/streaming/package-info.java | 22 -
.../state/FlinkBroadcastStateInternals.java | 865 --------------
.../state/FlinkKeyGroupStateInternals.java | 487 --------
.../state/FlinkSplitStateInternals.java | 260 -----
.../streaming/state/FlinkStateInternals.java | 1053 ------------------
.../state/KeyGroupCheckpointedOperator.java | 35 -
.../state/KeyGroupRestoringOperator.java | 32 -
.../wrappers/streaming/state/package-info.java | 22 -
.../runner/src/main/resources/log4j.properties | 23 -
.../flink/EncodedValueComparatorTest.java | 70 --
.../runners/flink/FlinkRunnerRegistrarTest.java | 48 -
.../beam/runners/flink/FlinkTestPipeline.java | 72 --
.../beam/runners/flink/PipelineOptionsTest.java | 184 ---
.../beam/runners/flink/ReadSourceITCase.java | 85 --
.../flink/ReadSourceStreamingITCase.java | 74 --
.../beam/runners/flink/WriteSinkITCase.java | 192 ----
.../flink/streaming/DoFnOperatorTest.java | 600 ----------
.../FlinkBroadcastStateInternalsTest.java | 245 ----
.../FlinkKeyGroupStateInternalsTest.java | 262 -----
.../streaming/FlinkSplitStateInternalsTest.java | 101 --
.../streaming/FlinkStateInternalsTest.java | 395 -------
.../flink/streaming/GroupByNullKeyTest.java | 124 ---
.../flink/streaming/TestCountingSource.java | 254 -----
.../streaming/TopWikipediaSessionsITCase.java | 133 ---
.../streaming/UnboundedSourceWrapperTest.java | 464 --------
.../runners/flink/streaming/package-info.java | 22 -
.../src/test/resources/log4j-test.properties | 27 -
.../flink/DefaultParallelismFactory.java | 39 +
.../flink/FlinkBatchPipelineTranslator.java | 139 +++
.../flink/FlinkBatchTransformTranslators.java | 723 ++++++++++++
.../flink/FlinkBatchTranslationContext.java | 153 +++
.../flink/FlinkDetachedRunnerResult.java | 75 ++
.../FlinkPipelineExecutionEnvironment.java | 241 ++++
.../runners/flink/FlinkPipelineOptions.java | 101 ++
.../runners/flink/FlinkPipelineTranslator.java | 53 +
.../apache/beam/runners/flink/FlinkRunner.java | 232 ++++
.../runners/flink/FlinkRunnerRegistrar.java | 62 ++
.../beam/runners/flink/FlinkRunnerResult.java | 98 ++
.../flink/FlinkStreamingPipelineTranslator.java | 276 +++++
.../FlinkStreamingTransformTranslators.java | 1044 +++++++++++++++++
.../flink/FlinkStreamingTranslationContext.java | 130 +++
.../flink/FlinkStreamingViewOverrides.java | 372 +++++++
.../flink/PipelineTranslationOptimizer.java | 72 ++
.../beam/runners/flink/TestFlinkRunner.java | 84 ++
.../beam/runners/flink/TranslationMode.java | 31 +
.../apache/beam/runners/flink/package-info.java | 22 +
.../functions/FlinkAggregatorFactory.java | 53 +
.../functions/FlinkAssignContext.java | 63 ++
.../functions/FlinkAssignWindows.java | 49 +
.../functions/FlinkDoFnFunction.java | 161 +++
.../FlinkMergingNonShuffleReduceFunction.java | 228 ++++
.../FlinkMergingPartialReduceFunction.java | 201 ++++
.../functions/FlinkMergingReduceFunction.java | 199 ++++
.../FlinkMultiOutputPruningFunction.java | 50 +
.../functions/FlinkNoOpStepContext.java | 73 ++
.../functions/FlinkPartialReduceFunction.java | 172 +++
.../functions/FlinkReduceFunction.java | 173 +++
.../functions/FlinkSideInputReader.java | 80 ++
.../functions/FlinkStatefulDoFnFunction.java | 198 ++++
.../functions/SideInputInitializer.java | 73 ++
.../translation/functions/package-info.java | 22 +
.../runners/flink/translation/package-info.java | 22 +
.../translation/types/CoderTypeInformation.java | 120 ++
.../translation/types/CoderTypeSerializer.java | 132 +++
.../types/EncodedValueComparator.java | 195 ++++
.../types/EncodedValueSerializer.java | 113 ++
.../types/EncodedValueTypeInformation.java | 98 ++
.../types/InspectableByteArrayOutputStream.java | 34 +
.../flink/translation/types/KvKeySelector.java | 50 +
.../flink/translation/types/package-info.java | 22 +
.../utils/SerializedPipelineOptions.java | 67 ++
.../flink/translation/utils/package-info.java | 22 +
.../wrappers/DataInputViewWrapper.java | 58 +
.../wrappers/DataOutputViewWrapper.java | 51 +
.../SerializableFnAggregatorWrapper.java | 98 ++
.../translation/wrappers/SourceInputFormat.java | 150 +++
.../translation/wrappers/SourceInputSplit.java | 52 +
.../translation/wrappers/package-info.java | 22 +
.../wrappers/streaming/DoFnOperator.java | 774 +++++++++++++
.../streaming/KvToByteBufferKeySelector.java | 56 +
.../streaming/SingletonKeyedWorkItem.java | 56 +
.../streaming/SingletonKeyedWorkItemCoder.java | 126 +++
.../streaming/SplittableDoFnOperator.java | 150 +++
.../wrappers/streaming/WindowDoFnOperator.java | 117 ++
.../wrappers/streaming/WorkItemKeySelector.java | 56 +
.../streaming/io/BoundedSourceWrapper.java | 218 ++++
.../streaming/io/UnboundedSocketSource.java | 249 +++++
.../streaming/io/UnboundedSourceWrapper.java | 476 ++++++++
.../wrappers/streaming/io/package-info.java | 22 +
.../wrappers/streaming/package-info.java | 22 +
.../state/FlinkBroadcastStateInternals.java | 865 ++++++++++++++
.../state/FlinkKeyGroupStateInternals.java | 487 ++++++++
.../state/FlinkSplitStateInternals.java | 260 +++++
.../streaming/state/FlinkStateInternals.java | 1053 ++++++++++++++++++
.../state/KeyGroupCheckpointedOperator.java | 35 +
.../state/KeyGroupRestoringOperator.java | 32 +
.../wrappers/streaming/state/package-info.java | 22 +
.../flink/src/main/resources/log4j.properties | 23 +
.../flink/EncodedValueComparatorTest.java | 70 ++
.../runners/flink/FlinkRunnerRegistrarTest.java | 48 +
.../beam/runners/flink/FlinkTestPipeline.java | 72 ++
.../beam/runners/flink/PipelineOptionsTest.java | 184 +++
.../beam/runners/flink/ReadSourceITCase.java | 85 ++
.../flink/ReadSourceStreamingITCase.java | 74 ++
.../beam/runners/flink/WriteSinkITCase.java | 192 ++++
.../flink/streaming/DoFnOperatorTest.java | 600 ++++++++++
.../FlinkBroadcastStateInternalsTest.java | 245 ++++
.../FlinkKeyGroupStateInternalsTest.java | 262 +++++
.../streaming/FlinkSplitStateInternalsTest.java | 101 ++
.../streaming/FlinkStateInternalsTest.java | 395 +++++++
.../flink/streaming/GroupByNullKeyTest.java | 124 +++
.../flink/streaming/TestCountingSource.java | 254 +++++
.../streaming/TopWikipediaSessionsITCase.java | 133 +++
.../streaming/UnboundedSourceWrapperTest.java | 464 ++++++++
.../runners/flink/streaming/package-info.java | 22 +
.../src/test/resources/log4j-test.properties | 27 +
189 files changed, 15765 insertions(+), 17293 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/.test-infra/jenkins/job_beam_PostCommit_Java_ValidatesRunner_Flink.groovy
----------------------------------------------------------------------
diff --git a/.test-infra/jenkins/job_beam_PostCommit_Java_ValidatesRunner_Flink.groovy b/.test-infra/jenkins/job_beam_PostCommit_Java_ValidatesRunner_Flink.groovy
index 411106d..5b228bc 100644
--- a/.test-infra/jenkins/job_beam_PostCommit_Java_ValidatesRunner_Flink.groovy
+++ b/.test-infra/jenkins/job_beam_PostCommit_Java_ValidatesRunner_Flink.groovy
@@ -39,5 +39,5 @@ mavenJob('beam_PostCommit_Java_ValidatesRunner_Flink') {
'Run Flink ValidatesRunner')
// Maven goals for this job.
- goals('-B -e clean verify -am -pl runners/flink/runner -Plocal-validates-runner-tests -Pvalidates-runner-tests')
+ goals('-B -e clean verify -am -pl runners/flink -Plocal-validates-runner-tests -Pvalidates-runner-tests')
}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/examples/pom.xml
----------------------------------------------------------------------
diff --git a/runners/flink/examples/pom.xml b/runners/flink/examples/pom.xml
deleted file mode 100644
index aaf76d9..0000000
--- a/runners/flink/examples/pom.xml
+++ /dev/null
@@ -1,130 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.beam</groupId>
- <artifactId>beam-runners-flink-parent</artifactId>
- <version>0.7.0-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <artifactId>beam-runners-flink_2.10-examples</artifactId>
-
- <name>Apache Beam :: Runners :: Flink :: Examples</name>
-
- <packaging>jar</packaging>
-
- <properties>
- <!-- Default parameters for mvn exec:java -->
- <flink.examples.input>kinglear.txt</flink.examples.input>
- <flink.examples.output>wordcounts.txt</flink.examples.output>
- <flink.examples.parallelism>-1</flink.examples.parallelism>
- </properties>
-
- <profiles>
- <profile>
- <id>disable-validates-runner-tests</id>
- <activation>
- <activeByDefault>true</activeByDefault>
- </activation>
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-surefire-plugin</artifactId>
- <executions>
- <execution>
- <id>validates-runner-tests</id>
- <configuration>
- <skip>true</skip>
- </configuration>
- </execution>
- </executions>
- </plugin>
- </plugins>
- </build>
- </profile>
- </profiles>
-
- <dependencies>
- <dependency>
- <groupId>org.apache.beam</groupId>
- <artifactId>beam-sdks-java-extensions-gcp-core</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.apache.beam</groupId>
- <artifactId>beam-runners-flink_2.10</artifactId>
- <version>${project.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.flink</groupId>
- <artifactId>flink-connector-kafka-0.8_2.10</artifactId>
- <version>${flink.version}</version>
- </dependency>
-
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-compiler-plugin</artifactId>
- </plugin>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- <executions>
- <execution>
- <goals><goal>analyze-only</goal></goals>
- <configuration>
- <!-- disable for now until dependencies are cleaned up -->
- <failOnWarning>false</failOnWarning>
- </configuration>
- </execution>
- </executions>
- </plugin>
-
- <plugin>
- <groupId>org.codehaus.mojo</groupId>
- <artifactId>exec-maven-plugin</artifactId>
- <configuration>
- <executable>java</executable>
- <arguments>
- <argument>--runner=org.apache.beam.runners.flink.FlinkRunner</argument>
- <argument>--parallelism=${flink.examples.parallelism}</argument>
- <argument>--input=${flink.examples.input}</argument>
- <argument>--output=${flink.examples.output}</argument>
- </arguments>
- </configuration>
- </plugin>
-
- </plugins>
-
- </build>
-
-</project>
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/TFIDF.java
----------------------------------------------------------------------
diff --git a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/TFIDF.java b/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/TFIDF.java
deleted file mode 100644
index 8e1df08..0000000
--- a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/TFIDF.java
+++ /dev/null
@@ -1,455 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.examples;
-
-import java.io.File;
-import java.io.IOException;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.util.HashSet;
-import java.util.Set;
-import org.apache.beam.runners.flink.FlinkPipelineOptions;
-import org.apache.beam.runners.flink.FlinkRunner;
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.coders.KvCoder;
-import org.apache.beam.sdk.coders.StringDelegateCoder;
-import org.apache.beam.sdk.coders.StringUtf8Coder;
-import org.apache.beam.sdk.io.TextIO;
-import org.apache.beam.sdk.options.Default;
-import org.apache.beam.sdk.options.Description;
-import org.apache.beam.sdk.options.GcsOptions;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.options.PipelineOptionsFactory;
-import org.apache.beam.sdk.options.Validation;
-import org.apache.beam.sdk.transforms.Count;
-import org.apache.beam.sdk.transforms.Distinct;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.Flatten;
-import org.apache.beam.sdk.transforms.Keys;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.transforms.Values;
-import org.apache.beam.sdk.transforms.View;
-import org.apache.beam.sdk.transforms.WithKeys;
-import org.apache.beam.sdk.transforms.join.CoGbkResult;
-import org.apache.beam.sdk.transforms.join.CoGroupByKey;
-import org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple;
-import org.apache.beam.sdk.util.GcsUtil;
-import org.apache.beam.sdk.util.gcsfs.GcsPath;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PBegin;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.beam.sdk.values.PCollectionList;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.beam.sdk.values.PDone;
-import org.apache.beam.sdk.values.TupleTag;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * An example that computes a basic TF-IDF search table for a directory or GCS prefix.
- *
- * <p>Concepts: joining data; side inputs; logging
- *
- * <p>To execute this pipeline locally, specify general pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * }</pre>
- * and a local output file or output prefix on GCS:
- * <pre>{@code
- * --output=[YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PREFIX]
- * }</pre>
- *
- * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowRunner
- * and an output prefix on GCS:
- * --output=gs://YOUR_OUTPUT_PREFIX
- * }</pre>
- *
- * <p>The default input is {@code gs://dataflow-samples/shakespeare/} and can be overridden with
- * {@code --input}.
- */
-public class TFIDF {
- /**
- * Options supported by {@link TFIDF}.
- *
- * <p>Inherits standard configuration options.
- */
- private interface Options extends PipelineOptions, FlinkPipelineOptions {
- @Description("Path to the directory or GCS prefix containing files to read from")
- @Default.String("gs://dataflow-samples/shakespeare/")
- String getInput();
- void setInput(String value);
-
- @Description("Prefix of output URI to write to")
- @Validation.Required
- String getOutput();
- void setOutput(String value);
- }
-
- /**
- * Lists documents contained beneath the {@code options.input} prefix/directory.
- */
- public static Set<URI> listInputDocuments(Options options)
- throws URISyntaxException, IOException {
- URI baseUri = new URI(options.getInput());
-
- // List all documents in the directory or GCS prefix.
- URI absoluteUri;
- if (baseUri.getScheme() != null) {
- absoluteUri = baseUri;
- } else {
- absoluteUri = new URI(
- "file",
- baseUri.getAuthority(),
- baseUri.getPath(),
- baseUri.getQuery(),
- baseUri.getFragment());
- }
-
- Set<URI> uris = new HashSet<>();
- if (absoluteUri.getScheme().equals("file")) {
- File directory = new File(absoluteUri);
- String[] directoryListing = directory.list();
- if (directoryListing == null) {
- throw new IOException(
- "Directory " + absoluteUri + " is not a valid path or IO Error occurred.");
- }
- for (String entry : directoryListing) {
- File path = new File(directory, entry);
- uris.add(path.toURI());
- }
- } else if (absoluteUri.getScheme().equals("gs")) {
- GcsUtil gcsUtil = options.as(GcsOptions.class).getGcsUtil();
- URI gcsUriGlob = new URI(
- absoluteUri.getScheme(),
- absoluteUri.getAuthority(),
- absoluteUri.getPath() + "*",
- absoluteUri.getQuery(),
- absoluteUri.getFragment());
- for (GcsPath entry : gcsUtil.expand(GcsPath.fromUri(gcsUriGlob))) {
- uris.add(entry.toUri());
- }
- }
-
- return uris;
- }
-
- /**
- * Reads the documents at the provided URIs and returns all lines
- * from the documents, each tagged with the document it came from.
- */
- public static class ReadDocuments
- extends PTransform<PBegin, PCollection<KV<URI, String>>> {
- private static final long serialVersionUID = 0;
-
- // transient because PTransform is not really meant to be serialized.
- // see note on PTransform
- private final transient Iterable<URI> uris;
-
- public ReadDocuments(Iterable<URI> uris) {
- this.uris = uris;
- }
-
- @Override
- public Coder<?> getDefaultOutputCoder() {
- return KvCoder.of(StringDelegateCoder.of(URI.class), StringUtf8Coder.of());
- }
-
- @Override
- public PCollection<KV<URI, String>> expand(PBegin input) {
- Pipeline pipeline = input.getPipeline();
-
- // Create one TextIO.Read transform for each document
- // and add its output to a PCollectionList
- PCollectionList<KV<URI, String>> urisToLines =
- PCollectionList.empty(pipeline);
-
- // TextIO.Read supports:
- // - file: URIs and paths locally
- // - gs: URIs on the service
- for (final URI uri : uris) {
- String uriString;
- if (uri.getScheme().equals("file")) {
- uriString = new File(uri).getPath();
- } else {
- uriString = uri.toString();
- }
-
- PCollection<KV<URI, String>> oneUriToLines = pipeline
- .apply("TextIO.Read(" + uriString + ")", TextIO.Read.from(uriString))
- .apply("WithKeys(" + uriString + ")", WithKeys.<URI, String>of(uri));
-
- urisToLines = urisToLines.and(oneUriToLines);
- }
-
- return urisToLines.apply(Flatten.<KV<URI, String>>pCollections());
- }
- }
-
- /**
- * A transform containing a basic TF-IDF pipeline. The input consists of KV objects
- * where the key is the document's URI and the value is a piece
- * of the document's content. The output is a mapping from terms to
- * scores for each document URI.
- */
- public static class ComputeTfIdf
- extends PTransform<PCollection<KV<URI, String>>, PCollection<KV<String, KV<URI, Double>>>> {
- private static final long serialVersionUID = 0;
-
- public ComputeTfIdf() { }
-
- @Override
- public PCollection<KV<String, KV<URI, Double>>> expand(
- PCollection<KV<URI, String>> uriToContent) {
-
- // Compute the total number of documents, and
- // prepare this singleton PCollectionView for
- // use as a side input.
- final PCollectionView<Long> totalDocuments =
- uriToContent
- .apply("GetURIs", Keys.<URI>create())
- .apply("DistinctDocs", Distinct.<URI>create())
- .apply(Count.<URI>globally())
- .apply(View.<Long>asSingleton());
-
- // Create a collection of pairs mapping a URI to each
- // of the words in the document associated with that URI.
- PCollection<KV<URI, String>> uriToWords = uriToContent
- .apply("SplitWords", ParDo.of(new DoFn<KV<URI, String>, KV<URI, String>>() {
- private static final long serialVersionUID = 0;
-
- @ProcessElement
- public void processElement(ProcessContext c) {
- URI uri = c.element().getKey();
- String line = c.element().getValue();
- for (String word : line.split("\\W+")) {
- // Log INFO messages when the word “love” is found.
- if (word.toLowerCase().equals("love")) {
- LOG.info("Found {}", word.toLowerCase());
- }
-
- if (!word.isEmpty()) {
- c.output(KV.of(uri, word.toLowerCase()));
- }
- }
- }
- }));
-
- // Compute a mapping from each word to the total
- // number of documents in which it appears.
- PCollection<KV<String, Long>> wordToDocCount = uriToWords
- .apply("DistinctWords", Distinct.<KV<URI, String>>create())
- .apply(Values.<String>create())
- .apply("CountDocs", Count.<String>perElement());
-
- // Compute a mapping from each URI to the total
- // number of words in the document associated with that URI.
- PCollection<KV<URI, Long>> uriToWordTotal = uriToWords
- .apply("GetURIs2", Keys.<URI>create())
- .apply("CountWords", Count.<URI>perElement());
-
- // Count, for each (URI, word) pair, the number of
- // occurrences of that word in the document associated
- // with the URI.
- PCollection<KV<KV<URI, String>, Long>> uriAndWordToCount = uriToWords
- .apply("CountWordDocPairs", Count.<KV<URI, String>>perElement());
-
- // Adjust the above collection from a mapping of
- // (URI, word) pairs to counts into an isomorphic mapping
- // from URI to (word, count) pairs, to prepare for a join
- // by the URI key.
- PCollection<KV<URI, KV<String, Long>>> uriToWordAndCount = uriAndWordToCount
- .apply("ShiftKeys", ParDo.of(
- new DoFn<KV<KV<URI, String>, Long>, KV<URI, KV<String, Long>>>() {
- private static final long serialVersionUID = 0;
-
- @ProcessElement
- public void processElement(ProcessContext c) {
- URI uri = c.element().getKey().getKey();
- String word = c.element().getKey().getValue();
- Long occurrences = c.element().getValue();
- c.output(KV.of(uri, KV.of(word, occurrences)));
- }
- }));
-
- // Prepare to join the mapping of URI to (word, count) pairs with
- // the mapping of URI to total word counts, by associating
- // each of the input PCollection<KV<URI, ...>> with
- // a tuple tag. Each input must have the same key type, URI
- // in this case. The type parameter of the tuple tag matches
- // the types of the values for each collection.
- final TupleTag<Long> wordTotalsTag = new TupleTag<>();
- final TupleTag<KV<String, Long>> wordCountsTag = new TupleTag<>();
- KeyedPCollectionTuple<URI> coGbkInput = KeyedPCollectionTuple
- .of(wordTotalsTag, uriToWordTotal)
- .and(wordCountsTag, uriToWordAndCount);
-
- // Perform a CoGroupByKey (a sort of pre-join) on the prepared
- // inputs. This yields a mapping from URI to a CoGbkResult
- // (CoGroupByKey Result). The CoGbkResult is a mapping
- // from the above tuple tags to the values in each input
- // associated with a particular URI. In this case, each
- // KV<URI, CoGbkResult> groups a URI with the total number of
- // words in that document as well as all the (word, count)
- // pairs for particular words.
- PCollection<KV<URI, CoGbkResult>> uriToWordAndCountAndTotal = coGbkInput
- .apply("CoGroupByUri", CoGroupByKey.<URI>create());
-
- // Compute a mapping from each word to a (URI, term frequency)
- // pair for each URI. A word's term frequency for a document
- // is simply the number of times that word occurs in the document
- // divided by the total number of words in the document.
- PCollection<KV<String, KV<URI, Double>>> wordToUriAndTf = uriToWordAndCountAndTotal
- .apply("ComputeTermFrequencies", ParDo.of(
- new DoFn<KV<URI, CoGbkResult>, KV<String, KV<URI, Double>>>() {
- private static final long serialVersionUID = 0;
-
- @ProcessElement
- public void processElement(ProcessContext c) {
- URI uri = c.element().getKey();
- Long wordTotal = c.element().getValue().getOnly(wordTotalsTag);
-
- for (KV<String, Long> wordAndCount
- : c.element().getValue().getAll(wordCountsTag)) {
- String word = wordAndCount.getKey();
- Long wordCount = wordAndCount.getValue();
- Double termFrequency = wordCount.doubleValue() / wordTotal.doubleValue();
- c.output(KV.of(word, KV.of(uri, termFrequency)));
- }
- }
- }));
-
- // Compute a mapping from each word to its document frequency.
- // A word's document frequency in a corpus is the number of
- // documents in which the word appears divided by the total
- // number of documents in the corpus. Note how the total number of
- // documents is passed as a side input; the same value is
- // presented to each invocation of the DoFn.
- PCollection<KV<String, Double>> wordToDf = wordToDocCount
- .apply("ComputeDocFrequencies", ParDo
- .of(new DoFn<KV<String, Long>, KV<String, Double>>() {
- private static final long serialVersionUID = 0;
-
- @ProcessElement
- public void processElement(ProcessContext c) {
- String word = c.element().getKey();
- Long documentCount = c.element().getValue();
- Long documentTotal = c.sideInput(totalDocuments);
- Double documentFrequency = documentCount.doubleValue()
- / documentTotal.doubleValue();
-
- c.output(KV.of(word, documentFrequency));
- }
- }).withSideInputs(totalDocuments));
-
- // Join the term frequency and document frequency
- // collections, each keyed on the word.
- final TupleTag<KV<URI, Double>> tfTag = new TupleTag<>();
- final TupleTag<Double> dfTag = new TupleTag<>();
- PCollection<KV<String, CoGbkResult>> wordToUriAndTfAndDf = KeyedPCollectionTuple
- .of(tfTag, wordToUriAndTf)
- .and(dfTag, wordToDf)
- .apply(CoGroupByKey.<String>create());
-
- // Compute a mapping from each word to a (URI, TF-IDF) score
- // for each URI. There are a variety of definitions of TF-IDF
- // ("term frequency - inverse document frequency") score;
- // here we use a basic version that is the term frequency
- // multiplied by the log of the inverse document frequency.
-
- return wordToUriAndTfAndDf
- .apply("ComputeTfIdf", ParDo.of(
- new DoFn<KV<String, CoGbkResult>, KV<String, KV<URI, Double>>>() {
- private static final long serialVersionUID = 0;
-
- @ProcessElement
- public void processElement(ProcessContext c) {
- String word = c.element().getKey();
- Double df = c.element().getValue().getOnly(dfTag);
-
- for (KV<URI, Double> uriAndTf : c.element().getValue().getAll(tfTag)) {
- URI uri = uriAndTf.getKey();
- Double tf = uriAndTf.getValue();
- Double tfIdf = tf * Math.log(1 / df);
- c.output(KV.of(word, KV.of(uri, tfIdf)));
- }
- }
- }));
- }
-
- // Instantiate Logger.
- // It is suggested that the user specify the containing class
- // (in this case ComputeTfIdf) when obtaining the Logger.
- private static final Logger LOG = LoggerFactory.getLogger(ComputeTfIdf.class);
- }
-
- /**
- * A {@link PTransform} to write, in CSV format, a mapping from term and URI
- * to score.
- */
- public static class WriteTfIdf
- extends PTransform<PCollection<KV<String, KV<URI, Double>>>, PDone> {
- private static final long serialVersionUID = 0;
-
- private String output;
-
- public WriteTfIdf(String output) {
- this.output = output;
- }
-
- @Override
- public PDone expand(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
- return wordToUriAndTfIdf
- .apply("Format", ParDo.of(new DoFn<KV<String, KV<URI, Double>>, String>() {
- private static final long serialVersionUID = 0;
-
- @ProcessElement
- public void processElement(ProcessContext c) {
- c.output(String.format("%s,\t%s,\t%f",
- c.element().getKey(),
- c.element().getValue().getKey(),
- c.element().getValue().getValue()));
- }
- }))
- .apply(TextIO.Write
- .to(output)
- .withSuffix(".csv"));
- }
- }
-
- public static void main(String[] args) throws Exception {
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
-
- options.setRunner(FlinkRunner.class);
-
- Pipeline pipeline = Pipeline.create(options);
- pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));
-
- pipeline
- .apply(new ReadDocuments(listInputDocuments(options)))
- .apply(new ComputeTfIdf())
- .apply(new WriteTfIdf(options.getOutput()));
-
- pipeline.run();
- }
-}
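The ComputeTfIdf transform above combines the two statistics as tfIdf = tf * Math.log(1 / df). As a rough standalone sketch of that arithmetic only (the class name and the sample counts below are made up for illustration and are not part of the pipeline code):

public class TfIdfArithmeticSketch {
  public static void main(String[] args) {
    // Term frequency: a word that occurs 3 times in a 100-word document.
    double tf = 3.0 / 100.0;
    // Document frequency: the word appears in 2 of 20 documents.
    double df = 2.0 / 20.0;
    // Same formula as ComputeTfIdf above: tf * ln(1 / df).
    double tfIdf = tf * Math.log(1 / df);
    // Prints approximately tf=0.030 df=0.100 tfIdf=0.0691
    System.out.printf("tf=%.3f df=%.3f tfIdf=%.4f%n", tf, df, tfIdf);
  }
}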
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/WordCount.java
----------------------------------------------------------------------
diff --git a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/WordCount.java b/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/WordCount.java
deleted file mode 100644
index 6ae4cf8..0000000
--- a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/WordCount.java
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.examples;
-
-import org.apache.beam.runners.flink.FlinkPipelineOptions;
-import org.apache.beam.runners.flink.FlinkRunner;
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.io.TextIO;
-import org.apache.beam.sdk.options.Description;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.options.PipelineOptionsFactory;
-import org.apache.beam.sdk.options.Validation;
-import org.apache.beam.sdk.transforms.Aggregator;
-import org.apache.beam.sdk.transforms.Count;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.MapElements;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.transforms.SimpleFunction;
-import org.apache.beam.sdk.transforms.Sum;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollection;
-
-/**
- * Wordcount pipeline.
- */
-public class WordCount {
-
- /**
- * Function to extract words.
- */
- public static class ExtractWordsFn extends DoFn<String, String> {
- private final Aggregator<Long, Long> emptyLines =
- createAggregator("emptyLines", Sum.ofLongs());
-
- @ProcessElement
- public void processElement(ProcessContext c) {
- if (c.element().trim().isEmpty()) {
- emptyLines.addValue(1L);
- }
-
- // Split the line into words.
- String[] words = c.element().split("[^a-zA-Z']+");
-
- // Output each word encountered into the output PCollection.
- for (String word : words) {
- if (!word.isEmpty()) {
- c.output(word);
- }
- }
- }
- }
-
- /**
- * PTransform counting words.
- */
- public static class CountWords extends PTransform<PCollection<String>,
- PCollection<KV<String, Long>>> {
- @Override
- public PCollection<KV<String, Long>> expand(PCollection<String> lines) {
-
- // Convert lines of text into individual words.
- PCollection<String> words = lines.apply(
- ParDo.of(new ExtractWordsFn()));
-
- // Count the number of times each word occurs.
- PCollection<KV<String, Long>> wordCounts =
- words.apply(Count.<String>perElement());
-
- return wordCounts;
- }
- }
-
- /** A SimpleFunction that converts a Word and Count into a printable string. */
- public static class FormatAsTextFn extends SimpleFunction<KV<String, Long>, String> {
- @Override
- public String apply(KV<String, Long> input) {
- return input.getKey() + ": " + input.getValue();
- }
- }
-
- /**
- * Options supported by {@link WordCount}.
- *
- * <p>Inherits standard configuration options.
- */
- public interface Options extends PipelineOptions, FlinkPipelineOptions {
- @Description("Path of the file to read from")
- String getInput();
- void setInput(String value);
-
- @Description("Path of the file to write to")
- @Validation.Required
- String getOutput();
- void setOutput(String value);
- }
-
- public static void main(String[] args) {
-
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation()
- .as(Options.class);
- options.setRunner(FlinkRunner.class);
-
- Pipeline p = Pipeline.create(options);
-
- p.apply("ReadLines", TextIO.Read.from(options.getInput()))
- .apply(new CountWords())
- .apply(MapElements.via(new FormatAsTextFn()))
- .apply("WriteCounts", TextIO.Write.to(options.getOutput()));
-
- p.run();
- }
-
-}
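ExtractWordsFn above splits each line on runs of characters that are not letters or apostrophes and drops empty tokens before emitting. A minimal standalone sketch of just that splitting step (plain Java; the class name and sample line are made up):

import java.util.Arrays;

public class ExtractWordsSketch {
  public static void main(String[] args) {
    String line = "O Romeo, Romeo! wherefore art thou Romeo?";
    // Same regex as ExtractWordsFn: split on anything that is not a letter or an apostrophe.
    String[] words = line.split("[^a-zA-Z']+");
    // A token can still be empty (e.g. when a line starts with punctuation),
    // which is why the DoFn checks word.isEmpty() before emitting.
    System.out.println(Arrays.asList(words));
    // Prints [O, Romeo, Romeo, wherefore, art, thou, Romeo]
  }
}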
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/package-info.java b/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/package-info.java
deleted file mode 100644
index b0ecb56..0000000
--- a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Flink Beam runner examples.
- */
-package org.apache.beam.runners.flink.examples;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/AutoComplete.java
----------------------------------------------------------------------
diff --git a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/AutoComplete.java b/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/AutoComplete.java
deleted file mode 100644
index d07df29..0000000
--- a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/AutoComplete.java
+++ /dev/null
@@ -1,400 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.examples.streaming;
-
-import java.io.IOException;
-import java.util.List;
-import org.apache.beam.runners.flink.FlinkRunner;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.io.UnboundedSocketSource;
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.coders.AvroCoder;
-import org.apache.beam.sdk.coders.DefaultCoder;
-import org.apache.beam.sdk.io.Read;
-import org.apache.beam.sdk.io.TextIO;
-import org.apache.beam.sdk.options.Default;
-import org.apache.beam.sdk.options.Description;
-import org.apache.beam.sdk.options.PipelineOptionsFactory;
-import org.apache.beam.sdk.transforms.Aggregator;
-import org.apache.beam.sdk.transforms.Count;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.Filter;
-import org.apache.beam.sdk.transforms.Flatten;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.transforms.Partition;
-import org.apache.beam.sdk.transforms.Partition.PartitionFn;
-import org.apache.beam.sdk.transforms.SerializableFunction;
-import org.apache.beam.sdk.transforms.Sum;
-import org.apache.beam.sdk.transforms.Top;
-import org.apache.beam.sdk.transforms.windowing.AfterWatermark;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.FixedWindows;
-import org.apache.beam.sdk.transforms.windowing.Window;
-import org.apache.beam.sdk.transforms.windowing.WindowFn;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.beam.sdk.values.PCollectionList;
-import org.joda.time.Duration;
-
-/**
- * To run the example, first open a socket on a terminal by executing the command:
- * <ul>
- * <li><code>nc -lk 9999</code>
- * </ul>
- * and then launch the example. Now whatever you type in the terminal is going to be
- * the input to the program.
- * */
-public class AutoComplete {
-
- /**
- * A PTransform that takes as input a list of tokens and returns
- * the most common tokens per prefix.
- */
- public static class ComputeTopCompletions
- extends PTransform<PCollection<String>, PCollection<KV<String, List<CompletionCandidate>>>> {
- private static final long serialVersionUID = 0;
-
- private final int candidatesPerPrefix;
- private final boolean recursive;
-
- protected ComputeTopCompletions(int candidatesPerPrefix, boolean recursive) {
- this.candidatesPerPrefix = candidatesPerPrefix;
- this.recursive = recursive;
- }
-
- public static ComputeTopCompletions top(int candidatesPerPrefix, boolean recursive) {
- return new ComputeTopCompletions(candidatesPerPrefix, recursive);
- }
-
- @Override
- public PCollection<KV<String, List<CompletionCandidate>>> expand(PCollection<String> input) {
- PCollection<CompletionCandidate> candidates = input
- // First count how often each token appears.
- .apply(Count.<String>perElement())
-
- // Map the KV outputs of Count into our own CompletionCandidate class.
- .apply("CreateCompletionCandidates", ParDo.of(
- new DoFn<KV<String, Long>, CompletionCandidate>() {
- private static final long serialVersionUID = 0;
-
- @ProcessElement
- public void processElement(ProcessContext c) {
- CompletionCandidate cand = new CompletionCandidate(c.element().getKey(),
- c.element().getValue());
- c.output(cand);
- }
- }));
-
- // Compute the top via either a flat or recursive algorithm.
- if (recursive) {
- return candidates
- .apply(new ComputeTopRecursive(candidatesPerPrefix, 1))
- .apply(Flatten.<KV<String, List<CompletionCandidate>>>pCollections());
- } else {
- return candidates
- .apply(new ComputeTopFlat(candidatesPerPrefix, 1));
- }
- }
- }
-
- /**
- * Lower latency, but more expensive.
- */
- private static class ComputeTopFlat
- extends PTransform<PCollection<CompletionCandidate>,
- PCollection<KV<String, List<CompletionCandidate>>>> {
- private static final long serialVersionUID = 0;
-
- private final int candidatesPerPrefix;
- private final int minPrefix;
-
- public ComputeTopFlat(int candidatesPerPrefix, int minPrefix) {
- this.candidatesPerPrefix = candidatesPerPrefix;
- this.minPrefix = minPrefix;
- }
-
- @Override
- public PCollection<KV<String, List<CompletionCandidate>>> expand(
- PCollection<CompletionCandidate> input) {
- return input
- // For each completion candidate, map it to all prefixes.
- .apply(ParDo.of(new AllPrefixes(minPrefix)))
-
- // Find and return the top candidates for each prefix.
- .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix)
- .withHotKeyFanout(new HotKeyFanout()));
- }
-
- private static class HotKeyFanout implements SerializableFunction<String, Integer> {
- private static final long serialVersionUID = 0;
-
- @Override
- public Integer apply(String input) {
- return (int) Math.pow(4, 5 - input.length());
- }
- }
- }
-
- /**
- * Cheaper but higher latency.
- *
- * <p>Returns two PCollections: the first contains the top prefixes of size
- * greater than minPrefix, and the second contains the top prefixes of size
- * exactly minPrefix.
- */
- private static class ComputeTopRecursive
- extends PTransform<PCollection<CompletionCandidate>,
- PCollectionList<KV<String, List<CompletionCandidate>>>> {
- private static final long serialVersionUID = 0;
-
- private final int candidatesPerPrefix;
- private final int minPrefix;
-
- public ComputeTopRecursive(int candidatesPerPrefix, int minPrefix) {
- this.candidatesPerPrefix = candidatesPerPrefix;
- this.minPrefix = minPrefix;
- }
-
- private class KeySizePartitionFn implements PartitionFn<KV<String, List<CompletionCandidate>>> {
- private static final long serialVersionUID = 0;
-
- @Override
- public int partitionFor(KV<String, List<CompletionCandidate>> elem, int numPartitions) {
- return elem.getKey().length() > minPrefix ? 0 : 1;
- }
- }
-
- private static class FlattenTops
- extends DoFn<KV<String, List<CompletionCandidate>>, CompletionCandidate> {
- private static final long serialVersionUID = 0;
-
- @ProcessElement
- public void processElement(ProcessContext c) {
- for (CompletionCandidate cc : c.element().getValue()) {
- c.output(cc);
- }
- }
- }
-
- @Override
- public PCollectionList<KV<String, List<CompletionCandidate>>> expand(
- PCollection<CompletionCandidate> input) {
- if (minPrefix > 10) {
- // Base case, partitioning to return the output in the expected format.
- return input
- .apply(new ComputeTopFlat(candidatesPerPrefix, minPrefix))
- .apply(Partition.of(2, new KeySizePartitionFn()));
- } else {
- // If a candidate is in the top N for prefix a...b, it must also be in the top N
- // for the prefix a...bX that it starts with, which is typically a much smaller set to consider.
- // First, compute the top candidate for prefixes of size at least minPrefix + 1.
- PCollectionList<KV<String, List<CompletionCandidate>>> larger = input
- .apply(new ComputeTopRecursive(candidatesPerPrefix, minPrefix + 1));
- // Consider the top candidates for each prefix of length minPrefix + 1...
- PCollection<KV<String, List<CompletionCandidate>>> small =
- PCollectionList
- .of(larger.get(1).apply(ParDo.of(new FlattenTops())))
- // ...together with those (previously excluded) candidates of length
- // exactly minPrefix...
- .and(input.apply(Filter.by(new SerializableFunction<CompletionCandidate, Boolean>() {
- private static final long serialVersionUID = 0;
-
- @Override
- public Boolean apply(CompletionCandidate c) {
- return c.getValue().length() == minPrefix;
- }
- })))
- .apply("FlattenSmall", Flatten.<CompletionCandidate>pCollections())
- // ...set the key to be the minPrefix-length prefix...
- .apply(ParDo.of(new AllPrefixes(minPrefix, minPrefix)))
- // ...and (re)apply the Top operator to all of them together.
- .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix));
-
- PCollection<KV<String, List<CompletionCandidate>>> flattenLarger = larger
- .apply("FlattenLarge", Flatten.<KV<String, List<CompletionCandidate>>>pCollections());
-
- return PCollectionList.of(flattenLarger).and(small);
- }
- }
- }
-
- /**
- * A DoFn that keys each candidate by all its prefixes.
- */
- private static class AllPrefixes
- extends DoFn<CompletionCandidate, KV<String, CompletionCandidate>> {
- private static final long serialVersionUID = 0;
-
- private final int minPrefix;
- private final int maxPrefix;
- public AllPrefixes(int minPrefix) {
- this(minPrefix, Integer.MAX_VALUE);
- }
- public AllPrefixes(int minPrefix, int maxPrefix) {
- this.minPrefix = minPrefix;
- this.maxPrefix = maxPrefix;
- }
- @ProcessElement
- public void processElement(ProcessContext c) {
- String word = c.element().value;
- for (int i = minPrefix; i <= Math.min(word.length(), maxPrefix); i++) {
- KV<String, CompletionCandidate> kv = KV.of(word.substring(0, i), c.element());
- c.output(kv);
- }
- }
- }
-
- /**
- * Class used to store token-count pairs.
- */
- @DefaultCoder(AvroCoder.class)
- static class CompletionCandidate implements Comparable<CompletionCandidate> {
- private long count;
- private String value;
-
- public CompletionCandidate(String value, long count) {
- this.value = value;
- this.count = count;
- }
-
- public String getValue() {
- return value;
- }
-
- // Empty constructor required for Avro decoding.
- @SuppressWarnings("unused")
- public CompletionCandidate() {}
-
- @Override
- public int compareTo(CompletionCandidate o) {
- if (this.count < o.count) {
- return -1;
- } else if (this.count == o.count) {
- return this.value.compareTo(o.value);
- } else {
- return 1;
- }
- }
-
- @Override
- public boolean equals(Object other) {
- if (other instanceof CompletionCandidate) {
- CompletionCandidate that = (CompletionCandidate) other;
- return this.count == that.count && this.value.equals(that.value);
- } else {
- return false;
- }
- }
-
- @Override
- public int hashCode() {
- return Long.valueOf(count).hashCode() ^ value.hashCode();
- }
-
- @Override
- public String toString() {
- return "CompletionCandidate[" + value + ", " + count + "]";
- }
- }
-
- static class ExtractWordsFn extends DoFn<String, String> {
- private final Aggregator<Long, Long> emptyLines =
- createAggregator("emptyLines", Sum.ofLongs());
-
- @ProcessElement
- public void processElement(ProcessContext c) {
- if (c.element().trim().isEmpty()) {
- emptyLines.addValue(1L);
- }
-
- // Split the line into words.
- String[] words = c.element().split("[^a-zA-Z']+");
-
- // Output each word encountered into the output PCollection.
- for (String word : words) {
- if (!word.isEmpty()) {
- c.output(word);
- }
- }
- }
- }
-
- /**
- * Takes as input the top candidates per prefix, and formats them as strings suitable for
- * writing to a per-task local file.
- */
- static class FormatForPerTaskLocalFile
- extends DoFn<KV<String, List<CompletionCandidate>>, String> {
-
- private static final long serialVersionUID = 0;
-
- @ProcessElement
- public void processElement(ProcessContext c, BoundedWindow window) {
- StringBuilder str = new StringBuilder();
- KV<String, List<CompletionCandidate>> elem = c.element();
-
- str.append(elem.getKey() + " @ " + window + " -> ");
- for (CompletionCandidate cand: elem.getValue()) {
- str.append(cand.toString() + " ");
- }
- System.out.println(str.toString());
- c.output(str.toString());
- }
- }
-
- /**
- * Options supported by this class.
- *
- * <p>Inherits standard configuration options.
- */
- private interface Options extends WindowedWordCount.StreamingWordCountOptions {
- @Description("Whether to use the recursive algorithm")
- @Default.Boolean(true)
- Boolean getRecursive();
- void setRecursive(Boolean value);
- }
-
- public static void main(String[] args) throws IOException {
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
- options.setStreaming(true);
- options.setCheckpointingInterval(1000L);
- options.setNumberOfExecutionRetries(5);
- options.setExecutionRetryDelay(3000L);
- options.setRunner(FlinkRunner.class);
-
-
- WindowFn<Object, ?> windowFn =
- FixedWindows.of(Duration.standardSeconds(options.getWindowSize()));
-
- // Create the pipeline.
- Pipeline p = Pipeline.create(options);
- PCollection<KV<String, List<CompletionCandidate>>> toWrite = p
- .apply("WordStream", Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)))
- .apply(ParDo.of(new ExtractWordsFn()))
- .apply(Window.<String>into(windowFn)
- .triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
- .discardingFiredPanes())
- .apply(ComputeTopCompletions.top(10, options.getRecursive()));
-
- toWrite
- .apply("FormatForPerTaskFile", ParDo.of(new FormatForPerTaskLocalFile()))
- .apply(TextIO.Write.to("./outputAutoComplete.txt"));
-
- p.run();
- }
-}
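The AllPrefixes DoFn above keys each candidate by every prefix whose length lies between minPrefix and maxPrefix. A minimal standalone sketch of that expansion (plain Java, no Beam types; the class name, word, and bounds are made-up sample values):

import java.util.ArrayList;
import java.util.List;

public class AllPrefixesSketch {
  // Mirrors the loop in AllPrefixes.processElement: emit every prefix whose
  // length is between minPrefix and min(word.length(), maxPrefix), inclusive.
  static List<String> prefixes(String word, int minPrefix, int maxPrefix) {
    List<String> out = new ArrayList<>();
    for (int i = minPrefix; i <= Math.min(word.length(), maxPrefix); i++) {
      out.add(word.substring(0, i));
    }
    return out;
  }

  public static void main(String[] args) {
    // Prints [f, fl, fli, flin, flink]
    System.out.println(prefixes("flink", 1, Integer.MAX_VALUE));
  }
}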
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/JoinExamples.java
----------------------------------------------------------------------
diff --git a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/JoinExamples.java b/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/JoinExamples.java
deleted file mode 100644
index 8fefc9f..0000000
--- a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/JoinExamples.java
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.examples.streaming;
-
-import org.apache.beam.runners.flink.FlinkRunner;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.io.UnboundedSocketSource;
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.io.Read;
-import org.apache.beam.sdk.io.TextIO;
-import org.apache.beam.sdk.options.PipelineOptionsFactory;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.transforms.join.CoGbkResult;
-import org.apache.beam.sdk.transforms.join.CoGroupByKey;
-import org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple;
-import org.apache.beam.sdk.transforms.windowing.AfterWatermark;
-import org.apache.beam.sdk.transforms.windowing.FixedWindows;
-import org.apache.beam.sdk.transforms.windowing.Window;
-import org.apache.beam.sdk.transforms.windowing.WindowFn;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.beam.sdk.values.TupleTag;
-import org.joda.time.Duration;
-
-/**
- * To run the example, first open two sockets on two terminals by executing the commands:
- * <ul>
- * <li><code>nc -lk 9999</code>, and
- * <li><code>nc -lk 9998</code>
- * </ul>
- * and then launch the example. Now whatever you type in the terminal is going to be
- * the input to the program.
- * */
-public class JoinExamples {
-
- static PCollection<String> joinEvents(PCollection<String> streamA,
- PCollection<String> streamB) throws Exception {
-
- final TupleTag<String> firstInfoTag = new TupleTag<>();
- final TupleTag<String> secondInfoTag = new TupleTag<>();
-
- // transform both input collections to tuple collections, where the keys are country
- // codes in both cases.
- PCollection<KV<String, String>> firstInfo = streamA.apply(
- ParDo.of(new ExtractEventDataFn()));
- PCollection<KV<String, String>> secondInfo = streamB.apply(
- ParDo.of(new ExtractEventDataFn()));
-
- // country code 'key' -> CGBKR (<event info>, <country name>)
- PCollection<KV<String, CoGbkResult>> kvpCollection = KeyedPCollectionTuple
- .of(firstInfoTag, firstInfo)
- .and(secondInfoTag, secondInfo)
- .apply(CoGroupByKey.<String>create());
-
- // Process the CoGbkResult elements generated by the CoGroupByKey transform.
- // country code 'key' -> string of <event info>, <country name>
- PCollection<KV<String, String>> finalResultCollection =
- kvpCollection.apply("Process", ParDo.of(
- new DoFn<KV<String, CoGbkResult>, KV<String, String>>() {
- private static final long serialVersionUID = 0;
-
- @ProcessElement
- public void processElement(ProcessContext c) {
- KV<String, CoGbkResult> e = c.element();
- String key = e.getKey();
-
- String defaultA = "NO_VALUE";
-
- // the following getOnly is a bit tricky because it expects to have
- // EXACTLY ONE value in the corresponding stream and for the corresponding key.
-
- String lineA = e.getValue().getOnly(firstInfoTag, defaultA);
- for (String lineB : c.element().getValue().getAll(secondInfoTag)) {
- // Generate a string that combines information from both collection values
- c.output(KV.of(key, "Value A: " + lineA + " - Value B: " + lineB));
- }
- }
- }));
-
- return finalResultCollection
- .apply("Format", ParDo.of(new DoFn<KV<String, String>, String>() {
- private static final long serialVersionUID = 0;
-
- @ProcessElement
- public void processElement(ProcessContext c) {
- String result = c.element().getKey() + " -> " + c.element().getValue();
- System.out.println(result);
- c.output(result);
- }
- }));
- }
-
- static class ExtractEventDataFn extends DoFn<String, KV<String, String>> {
- private static final long serialVersionUID = 0;
-
- @ProcessElement
- public void processElement(ProcessContext c) {
- String line = c.element().toLowerCase();
- String key = line.split("\\s")[0];
- c.output(KV.of(key, line));
- }
- }
-
- private interface Options extends WindowedWordCount.StreamingWordCountOptions {
-
- }
-
- public static void main(String[] args) throws Exception {
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
- options.setStreaming(true);
- options.setCheckpointingInterval(1000L);
- options.setNumberOfExecutionRetries(5);
- options.setExecutionRetryDelay(3000L);
- options.setRunner(FlinkRunner.class);
-
- WindowFn<Object, ?> windowFn = FixedWindows.of(
- Duration.standardSeconds(options.getWindowSize()));
-
- Pipeline p = Pipeline.create(options);
-
- // the following two 'applys' create multiple inputs to our pipeline, one for each
- // of our two input sources.
- PCollection<String> streamA = p
- .apply("FirstStream", Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)))
- .apply(Window.<String>into(windowFn)
- .triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
- .discardingFiredPanes());
- PCollection<String> streamB = p
- .apply("SecondStream", Read.from(new UnboundedSocketSource<>("localhost", 9998, '\n', 3)))
- .apply(Window.<String>into(windowFn)
- .triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
- .discardingFiredPanes());
-
- PCollection<String> formattedResults = joinEvents(streamA, streamB);
- formattedResults.apply(TextIO.Write.to("./outputJoin.txt"));
- p.run();
- }
-
-}
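As a rough illustration of the keying and join performed above, here is a standalone sketch that uses in-memory maps instead of PCollections and CoGroupByKey; it keeps a single line per key per stream for simplicity, and the sample lines, class name, and helper are made up:

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class JoinSketch {
  public static void main(String[] args) {
    // ExtractEventDataFn keys each lower-cased line by its first whitespace-separated token.
    Map<String, String> firstInfo = keyByFirstToken(Arrays.asList("BE protest in brussels"));
    Map<String, String> secondInfo = keyByFirstToken(Arrays.asList("BE belgium", "FR france"));

    // For each key seen in the second stream, look up the first stream's line,
    // falling back to "NO_VALUE" just like getOnly(firstInfoTag, defaultA) above.
    for (Map.Entry<String, String> e : secondInfo.entrySet()) {
      String lineA = firstInfo.getOrDefault(e.getKey(), "NO_VALUE");
      System.out.println(e.getKey() + " -> Value A: " + lineA + " - Value B: " + e.getValue());
    }
    // Prints:
    //   be -> Value A: be protest in brussels - Value B: be belgium
    //   fr -> Value A: NO_VALUE - Value B: fr france
  }

  private static Map<String, String> keyByFirstToken(List<String> lines) {
    Map<String, String> out = new LinkedHashMap<>();
    for (String raw : lines) {
      String line = raw.toLowerCase();
      out.put(line.split("\\s")[0], line);
    }
    return out;
  }
}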
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/WindowedWordCount.java
----------------------------------------------------------------------
diff --git a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/WindowedWordCount.java b/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/WindowedWordCount.java
deleted file mode 100644
index 792c214..0000000
--- a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/WindowedWordCount.java
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.examples.streaming;
-
-import java.io.IOException;
-import org.apache.beam.runners.flink.FlinkRunner;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.io.UnboundedSocketSource;
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.io.Read;
-import org.apache.beam.sdk.io.TextIO;
-import org.apache.beam.sdk.options.Default;
-import org.apache.beam.sdk.options.Description;
-import org.apache.beam.sdk.options.PipelineOptionsFactory;
-import org.apache.beam.sdk.transforms.Aggregator;
-import org.apache.beam.sdk.transforms.Count;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.transforms.Sum;
-import org.apache.beam.sdk.transforms.windowing.AfterWatermark;
-import org.apache.beam.sdk.transforms.windowing.SlidingWindows;
-import org.apache.beam.sdk.transforms.windowing.Window;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollection;
-import org.joda.time.Duration;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * To run the example, first open a socket on a terminal by executing the command:
- * <ul>
- * <li><code>nc -lk 9999</code>
- * </ul>
- * and then launch the example. Now whatever you type in the terminal is going to be
- * the input to the program.
- * */
-public class WindowedWordCount {
-
- private static final Logger LOG = LoggerFactory.getLogger(WindowedWordCount.class);
-
- static final long WINDOW_SIZE = 10; // Default window duration in seconds
- static final long SLIDE_SIZE = 5; // Default window slide in seconds
-
- static class FormatAsStringFn extends DoFn<KV<String, Long>, String> {
- @ProcessElement
- public void processElement(ProcessContext c) {
- String row = c.element().getKey() + " - " + c.element().getValue() + " @ "
- + c.timestamp().toString();
- c.output(row);
- }
- }
-
- static class ExtractWordsFn extends DoFn<String, String> {
- private final Aggregator<Long, Long> emptyLines =
- createAggregator("emptyLines", Sum.ofLongs());
-
- @ProcessElement
- public void processElement(ProcessContext c) {
- if (c.element().trim().isEmpty()) {
- emptyLines.addValue(1L);
- }
-
- // Split the line into words.
- String[] words = c.element().split("[^a-zA-Z']+");
-
- // Output each word encountered into the output PCollection.
- for (String word : words) {
- if (!word.isEmpty()) {
- c.output(word);
- }
- }
- }
- }
-
- /**
- * Pipeline options.
- */
- public interface StreamingWordCountOptions
- extends org.apache.beam.runners.flink.examples.WordCount.Options {
- @Description("Sliding window duration, in seconds")
- @Default.Long(WINDOW_SIZE)
- Long getWindowSize();
-
- void setWindowSize(Long value);
-
- @Description("Window slide, in seconds")
- @Default.Long(SLIDE_SIZE)
- Long getSlide();
-
- void setSlide(Long value);
- }
-
- public static void main(String[] args) throws IOException {
- StreamingWordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
- .as(StreamingWordCountOptions.class);
- options.setStreaming(true);
- options.setWindowSize(10L);
- options.setSlide(5L);
- options.setCheckpointingInterval(1000L);
- options.setNumberOfExecutionRetries(5);
- options.setExecutionRetryDelay(3000L);
- options.setRunner(FlinkRunner.class);
-
- LOG.info("Windpwed WordCount with Sliding Windows of " + options.getWindowSize()
- + " sec. and a slide of " + options.getSlide());
-
- Pipeline pipeline = Pipeline.create(options);
-
- PCollection<String> words = pipeline
- .apply("StreamingWordCount",
- Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)))
- .apply(ParDo.of(new ExtractWordsFn()))
- .apply(Window.<String>into(SlidingWindows.of(
- Duration.standardSeconds(options.getWindowSize()))
- .every(Duration.standardSeconds(options.getSlide())))
- .triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
- .discardingFiredPanes());
-
- PCollection<KV<String, Long>> wordCounts =
- words.apply(Count.<String>perElement());
-
- wordCounts.apply(ParDo.of(new FormatAsStringFn()))
- .apply(TextIO.Write.to("./outputWordCount.txt"));
-
- pipeline.run();
- }
-}
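The pipeline above uses SlidingWindows of getWindowSize() seconds sliding every getSlide() seconds, so with the values set in main (10s windows, 5s slide) each element falls into two overlapping windows. A rough standalone sketch of that assignment arithmetic (plain arithmetic on seconds, not Beam's WindowFn API; it assumes the slide evenly divides the window size and ignores window offsets, and the class name and timestamp are made up):

public class SlidingWindowSketch {
  // Returns the start (in seconds) of every window of the given size and
  // slide that contains the given timestamp, latest window first.
  static long[] windowStarts(long timestampSec, long sizeSec, long slideSec) {
    long lastStart = timestampSec - (timestampSec % slideSec);
    int windowsPerElement = (int) (sizeSec / slideSec);
    long[] starts = new long[windowsPerElement];
    for (int i = 0; i < windowsPerElement; i++) {
      starts[i] = lastStart - i * slideSec;
    }
    return starts;
  }

  public static void main(String[] args) {
    // An element at t = 12s with 10s windows sliding every 5s falls into
    // [10, 20) and [5, 15).
    for (long start : windowStarts(12, 10, 5)) {
      System.out.println("[" + start + ", " + (start + 10) + ")");
    }
  }
}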
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/package-info.java b/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/package-info.java
deleted file mode 100644
index 58f41b6..0000000
--- a/runners/flink/examples/src/main/java/org/apache/beam/runners/flink/examples/streaming/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Flink Beam runner streaming examples.
- */
-package org.apache.beam.runners.flink.examples.streaming;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/pom.xml
----------------------------------------------------------------------
diff --git a/runners/flink/pom.xml b/runners/flink/pom.xml
index a5c5ea0..351035e 100644
--- a/runners/flink/pom.xml
+++ b/runners/flink/pom.xml
@@ -26,22 +26,97 @@
<relativePath>../pom.xml</relativePath>
</parent>
- <artifactId>beam-runners-flink-parent</artifactId>
+ <artifactId>beam-runners-flink</artifactId>
<name>Apache Beam :: Runners :: Flink</name>
-
- <packaging>pom</packaging>
-
- <modules>
- <module>runner</module>
- <module>examples</module>
- </modules>
+ <packaging>jar</packaging>
<properties>
- <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
- <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<flink.version>1.2.0</flink.version>
</properties>
+ <profiles>
+ <profile>
+ <id>local-validates-runner-tests</id>
+ <activation><activeByDefault>false</activeByDefault></activation>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <executions>
+
+ <!-- This configures the inherited validates-runner-tests
+ execution to execute with a local Flink instance. -->
+ <execution>
+ <id>validates-runner-tests</id>
+ <phase>integration-test</phase>
+ <goals>
+ <goal>test</goal>
+ </goals>
+ <configuration>
+ <groups>org.apache.beam.sdk.testing.ValidatesRunner</groups>
+ <excludedGroups>
+ org.apache.beam.sdk.testing.FlattenWithHeterogeneousCoders,
+ org.apache.beam.sdk.testing.UsesSplittableParDo,
+ org.apache.beam.sdk.testing.UsesAttemptedMetrics,
+ org.apache.beam.sdk.testing.UsesCommittedMetrics,
+ org.apache.beam.sdk.testing.UsesTestStream
+ </excludedGroups>
+ <parallel>none</parallel>
+ <failIfNoTests>true</failIfNoTests>
+ <dependenciesToScan>
+ <dependency>org.apache.beam:beam-sdks-java-core</dependency>
+ </dependenciesToScan>
+ <systemPropertyVariables>
+ <beamTestPipelineOptions>
+ [
+ "--runner=TestFlinkRunner",
+ "--streaming=false"
+ ]
+ </beamTestPipelineOptions>
+ </systemPropertyVariables>
+ </configuration>
+ </execution>
+
+ <!-- This second execution runs the tests in streaming mode -->
+ <execution>
+ <id>streaming-validates-runner-tests</id>
+ <phase>integration-test</phase>
+ <goals>
+ <goal>test</goal>
+ </goals>
+ <configuration>
+ <groups>org.apache.beam.sdk.testing.ValidatesRunner</groups>
+ <excludedGroups>
+ org.apache.beam.sdk.testing.FlattenWithHeterogeneousCoders,
+ org.apache.beam.sdk.testing.UsesSetState,
+ org.apache.beam.sdk.testing.UsesMapState,
+ org.apache.beam.sdk.testing.UsesAttemptedMetrics,
+ org.apache.beam.sdk.testing.UsesCommittedMetrics,
+ org.apache.beam.sdk.testing.UsesTestStream
+ </excludedGroups>
+ <parallel>none</parallel>
+ <failIfNoTests>true</failIfNoTests>
+ <dependenciesToScan>
+ <dependency>org.apache.beam:beam-sdks-java-core</dependency>
+ </dependenciesToScan>
+ <systemPropertyVariables>
+ <beamTestPipelineOptions>
+ [
+ "--runner=TestFlinkRunner",
+ "--streaming=true"
+ ]
+ </beamTestPipelineOptions>
+ </systemPropertyVariables>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+ </profile>
+ </profiles>
+
<build>
<pluginManagement>
<plugins>
@@ -89,19 +164,103 @@
<!-- Flink dependencies -->
<dependency>
<groupId>org.apache.flink</groupId>
+ <artifactId>flink-clients_2.10</artifactId>
+ <version>${flink.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.flink</groupId>
+ <artifactId>flink-core</artifactId>
+ <version>${flink.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.flink</groupId>
+ <artifactId>flink-java</artifactId>
+ <version>${flink.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.flink</groupId>
+ <artifactId>flink-runtime_2.10</artifactId>
+ <version>${flink.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.10</artifactId>
<version>${flink.version}</version>
</dependency>
+ <!-- For testing -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-core</artifactId>
<version>${flink.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.flink</groupId>
+ <artifactId>flink-runtime_2.10</artifactId>
+ <version>${flink.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
</dependency>
+ <!-- Beam -->
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-sdks-java-core</artifactId>
+ <exclusions>
+ <exclusion>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-jdk14</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.beam</groupId>
+ <artifactId>beam-runners-core-java</artifactId>
+ <exclusions>
+ <exclusion>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-jdk14</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.beam</groupId>
+ <artifactId>beam-runners-core-construction-java</artifactId>
+ <exclusions>
+ <exclusion>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-jdk14</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-annotations</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-databind</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.code.findbugs</groupId>
+ <artifactId>jsr305</artifactId>
</dependency>
<dependency>
@@ -113,5 +272,101 @@
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
</dependency>
+
+ <!--
+ Force an upgrade on the version of Apache Commons from Flink to support DEFLATE compression.
+ -->
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-compress</artifactId>
+ <scope>runtime</scope>
+ </dependency>
+
+ <!-- Test scoped -->
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-lang3</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.hamcrest</groupId>
+ <artifactId>hamcrest-all</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.mockito</groupId>
+ <artifactId>mockito-all</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <!-- Depend on test jar to scan for ValidatesRunner tests -->
+ <dependency>
+ <groupId>org.apache.beam</groupId>
+ <artifactId>beam-sdks-java-core</artifactId>
+ <classifier>tests</classifier>
+ <scope>test</scope>
+ <exclusions>
+ <exclusion>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-jdk14</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.apis</groupId>
+ <artifactId>google-api-services-bigquery</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.flink</groupId>
+ <artifactId>flink-streaming-java_2.10</artifactId>
+ <version>${flink.version}</version>
+ <scope>test</scope>
+ <type>test-jar</type>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.flink</groupId>
+ <artifactId>flink-test-utils_2.10</artifactId>
+ <version>${flink.version}</version>
+ <scope>test</scope>
+ <exclusions>
+ <exclusion>
+ <artifactId>apacheds-jdbm1</artifactId>
+ <groupId>org.apache.directory.jdbm</groupId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <!-- Optional Pipeline Registration -->
+ <dependency>
+ <groupId>com.google.auto.service</groupId>
+ <artifactId>auto-service</artifactId>
+ <optional>true</optional>
+ </dependency>
+
+ <!-- transitive test dependencies from beam-sdk-java-core -->
+ <dependency>
+ <groupId>com.fasterxml.jackson.dataformat</groupId>
+ <artifactId>jackson-dataformat-yaml</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.beam</groupId>
+ <artifactId>beam-sdks-common-fn-api</artifactId>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
</dependencies>
</project>
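For context on how the surefire configuration above is consumed: a ValidatesRunner-style test obtains its options through TestPipeline, which reads the beamTestPipelineOptions system property set in the profile (here --runner=TestFlinkRunner --streaming=true). A minimal hedged sketch; the test class and its content are illustrative, not part of this change:

import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Create;
import org.junit.Rule;
import org.junit.Test;

public class ExampleValidatesRunnerTest {
  // TestPipeline builds its options from -DbeamTestPipelineOptions,
  // which the surefire configuration above points at TestFlinkRunner.
  @Rule public final TestPipeline p = TestPipeline.create();

  @Test
  public void runsOnTestFlinkRunner() {
    p.apply(Create.of(1, 2, 3));
    p.run().waitUntilFinish();
  }
}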
[31/50] [abbrv] beam git commit: [BEAM-1994] Remove Flink examples package
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironment.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironment.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironment.java
new file mode 100644
index 0000000..ba00036
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironment.java
@@ -0,0 +1,241 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import java.util.List;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.flink.api.common.JobExecutionResult;
+import org.apache.flink.api.java.CollectionEnvironment;
+import org.apache.flink.api.java.ExecutionEnvironment;
+import org.apache.flink.runtime.state.AbstractStateBackend;
+import org.apache.flink.streaming.api.TimeCharacteristic;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * The class that instantiates and manages the execution of a given job.
+ * Depending on whether the job is a Streaming or Batch processing one, it creates
+ * the adequate execution environment ({@link ExecutionEnvironment}
+ * or {@link StreamExecutionEnvironment}), the necessary {@link FlinkPipelineTranslator}
+ * ({@link FlinkBatchPipelineTranslator} or {@link FlinkStreamingPipelineTranslator}) to
+ * transform the Beam job into a Flink one, and executes the (translated) job.
+ */
+class FlinkPipelineExecutionEnvironment {
+
+ private static final Logger LOG =
+ LoggerFactory.getLogger(FlinkPipelineExecutionEnvironment.class);
+
+ private final FlinkPipelineOptions options;
+
+ /**
+ * The Flink Batch execution environment. This is instantiated to either a
+ * {@link org.apache.flink.api.java.CollectionEnvironment},
+ * a {@link org.apache.flink.api.java.LocalEnvironment} or
+ * a {@link org.apache.flink.api.java.RemoteEnvironment}, depending on the configuration
+ * options.
+ */
+ private ExecutionEnvironment flinkBatchEnv;
+
+ /**
+ * The Flink Streaming execution environment. This is instantiated to either a
+ * {@link org.apache.flink.streaming.api.environment.LocalStreamEnvironment} or
+ * a {@link org.apache.flink.streaming.api.environment.RemoteStreamEnvironment}, depending
+ * on the configuration options, and more specifically, the url of the master.
+ */
+ private StreamExecutionEnvironment flinkStreamEnv;
+
+ /**
+ * Creates a {@link FlinkPipelineExecutionEnvironment} with the user-specified parameters in the
+ * provided {@link FlinkPipelineOptions}.
+ *
+ * @param options the user-defined pipeline options.
+ * */
+ FlinkPipelineExecutionEnvironment(FlinkPipelineOptions options) {
+ this.options = checkNotNull(options);
+ }
+
+ /**
+ * Depending on whether the job is a Streaming or a Batch one, this method creates
+ * the necessary execution environment and pipeline translator, and translates
+ * the {@link org.apache.beam.sdk.values.PCollection} program into
+ * a {@link org.apache.flink.api.java.DataSet}
+ * or {@link org.apache.flink.streaming.api.datastream.DataStream} one.
+ * */
+ public void translate(FlinkRunner flinkRunner, Pipeline pipeline) {
+ this.flinkBatchEnv = null;
+ this.flinkStreamEnv = null;
+
+ PipelineTranslationOptimizer optimizer =
+ new PipelineTranslationOptimizer(TranslationMode.BATCH, options);
+
+ optimizer.translate(pipeline);
+ TranslationMode translationMode = optimizer.getTranslationMode();
+
+ FlinkPipelineTranslator translator;
+ if (translationMode == TranslationMode.STREAMING) {
+ this.flinkStreamEnv = createStreamExecutionEnvironment();
+ translator = new FlinkStreamingPipelineTranslator(flinkRunner, flinkStreamEnv, options);
+ } else {
+ this.flinkBatchEnv = createBatchExecutionEnvironment();
+ translator = new FlinkBatchPipelineTranslator(flinkBatchEnv, options);
+ }
+
+ translator.translate(pipeline);
+ }
+
+ /**
+ * Launches the program execution.
+ * */
+ public JobExecutionResult executePipeline() throws Exception {
+ final String jobName = options.getJobName();
+
+ if (flinkBatchEnv != null) {
+ return flinkBatchEnv.execute(jobName);
+ } else if (flinkStreamEnv != null) {
+ return flinkStreamEnv.execute(jobName);
+ } else {
+ throw new IllegalStateException("The Pipeline has not yet been translated.");
+ }
+ }
+
+ /**
+ * If the submitted job is a batch processing job, this method creates the adequate
+ * Flink {@link org.apache.flink.api.java.ExecutionEnvironment} depending
+ * on the user-specified options.
+ */
+ private ExecutionEnvironment createBatchExecutionEnvironment() {
+
+ LOG.info("Creating the required Batch Execution Environment.");
+
+ String masterUrl = options.getFlinkMaster();
+ ExecutionEnvironment flinkBatchEnv;
+
+ // depending on the master, create the right environment.
+ if (masterUrl.equals("[local]")) {
+ flinkBatchEnv = ExecutionEnvironment.createLocalEnvironment();
+ } else if (masterUrl.equals("[collection]")) {
+ flinkBatchEnv = new CollectionEnvironment();
+ } else if (masterUrl.equals("[auto]")) {
+ flinkBatchEnv = ExecutionEnvironment.getExecutionEnvironment();
+ } else if (masterUrl.matches(".*:\\d*")) {
+ String[] parts = masterUrl.split(":");
+ List<String> stagingFiles = options.getFilesToStage();
+ flinkBatchEnv = ExecutionEnvironment.createRemoteEnvironment(parts[0],
+ Integer.parseInt(parts[1]),
+ stagingFiles.toArray(new String[stagingFiles.size()]));
+ } else {
+ LOG.warn("Unrecognized Flink Master URL {}. Defaulting to [auto].", masterUrl);
+ flinkBatchEnv = ExecutionEnvironment.getExecutionEnvironment();
+ }
+
+ // set the correct parallelism.
+ if (options.getParallelism() != -1 && !(flinkBatchEnv instanceof CollectionEnvironment)) {
+ flinkBatchEnv.setParallelism(options.getParallelism());
+ }
+
+ // set parallelism in the options (required by some execution code)
+ options.setParallelism(flinkBatchEnv.getParallelism());
+
+ if (options.getObjectReuse()) {
+ flinkBatchEnv.getConfig().enableObjectReuse();
+ } else {
+ flinkBatchEnv.getConfig().disableObjectReuse();
+ }
+
+ return flinkBatchEnv;
+ }
+
+ /**
+ * If the submitted job is a stream processing job, this method creates the adequate
+ * Flink {@link org.apache.flink.streaming.api.environment.StreamExecutionEnvironment} depending
+ * on the user-specified options.
+ */
+ private StreamExecutionEnvironment createStreamExecutionEnvironment() {
+
+ LOG.info("Creating the required Streaming Environment.");
+
+ String masterUrl = options.getFlinkMaster();
+ StreamExecutionEnvironment flinkStreamEnv = null;
+
+ // depending on the master, create the right environment.
+ if (masterUrl.equals("[local]")) {
+ flinkStreamEnv = StreamExecutionEnvironment.createLocalEnvironment();
+ } else if (masterUrl.equals("[auto]")) {
+ flinkStreamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
+ } else if (masterUrl.matches(".*:\\d*")) {
+ String[] parts = masterUrl.split(":");
+ List<String> stagingFiles = options.getFilesToStage();
+ flinkStreamEnv = StreamExecutionEnvironment.createRemoteEnvironment(parts[0],
+ Integer.parseInt(parts[1]), stagingFiles.toArray(new String[stagingFiles.size()]));
+ } else {
+ LOG.warn("Unrecognized Flink Master URL {}. Defaulting to [auto].", masterUrl);
+ flinkStreamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
+ }
+
+ // set the correct parallelism.
+ if (options.getParallelism() != -1) {
+ flinkStreamEnv.setParallelism(options.getParallelism());
+ }
+
+ // set parallelism in the options (required by some execution code)
+ options.setParallelism(flinkStreamEnv.getParallelism());
+
+ if (options.getObjectReuse()) {
+ flinkStreamEnv.getConfig().enableObjectReuse();
+ } else {
+ flinkStreamEnv.getConfig().disableObjectReuse();
+ }
+
+ // default to event time
+ flinkStreamEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
+
+ // for the following 2 parameters, a value of -1 means that Flink will use
+ // the default values as specified in the configuration.
+ int numRetries = options.getNumberOfExecutionRetries();
+ if (numRetries != -1) {
+ flinkStreamEnv.setNumberOfExecutionRetries(numRetries);
+ }
+ long retryDelay = options.getExecutionRetryDelay();
+ if (retryDelay != -1) {
+ flinkStreamEnv.getConfig().setExecutionRetryDelay(retryDelay);
+ }
+
+ // A value of -1 corresponds to disabled checkpointing (see CheckpointConfig in Flink).
+ // If the value is not -1, then the validity checks are applied.
+ // By default, checkpointing is disabled.
+ long checkpointInterval = options.getCheckpointingInterval();
+ if (checkpointInterval != -1) {
+ if (checkpointInterval < 1) {
+ throw new IllegalArgumentException("The checkpoint interval must be positive");
+ }
+ flinkStreamEnv.enableCheckpointing(checkpointInterval);
+ }
+
+ // State backend
+ final AbstractStateBackend stateBackend = options.getStateBackend();
+ if (stateBackend != null) {
+ flinkStreamEnv.setStateBackend(stateBackend);
+ }
+
+ return flinkStreamEnv;
+ }
+
+}
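A minimal sketch of the intended calling sequence (translate first, then execute), as driven by FlinkRunner further below; the helper method is hypothetical and assumes code living in the same org.apache.beam.runners.flink package:

// Hypothetical helper; FlinkRunner.run() below is the real caller.
static JobExecutionResult translateAndRun(
    FlinkRunner runner, Pipeline pipeline, FlinkPipelineOptions options) throws Exception {
  FlinkPipelineExecutionEnvironment env = new FlinkPipelineExecutionEnvironment(options);
  // Picks a batch or streaming execution environment based on the optimizer's TranslationMode.
  env.translate(runner, pipeline);
  // Throws IllegalStateException if translate() has not been called first.
  return env.executePipeline();
}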
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java
new file mode 100644
index 0000000..ef9afea
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import java.util.List;
+import org.apache.beam.sdk.options.ApplicationNameOptions;
+import org.apache.beam.sdk.options.Default;
+import org.apache.beam.sdk.options.Description;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.options.StreamingOptions;
+import org.apache.flink.runtime.state.AbstractStateBackend;
+
+/**
+ * Options which can be used to configure a Flink PipelineRunner.
+ */
+public interface FlinkPipelineOptions
+ extends PipelineOptions, ApplicationNameOptions, StreamingOptions {
+
+ /**
+ * List of local files to make available to workers.
+ *
+ * <p>Jars are placed on the worker's classpath.
+ *
+ * <p>The default value is the list of jars from the main program's classpath.
+ */
+ @Description("Jar-Files to send to all workers and put on the classpath. "
+ + "The default value is all files from the classpath.")
+ @JsonIgnore
+ List<String> getFilesToStage();
+ void setFilesToStage(List<String> value);
+
+ /**
+ * The url of the Flink JobManager on which to execute pipelines. This can either be
+ * the address of a cluster JobManager, in the form "host:port" or one of the special
+ * Strings "[local]", "[collection]" or "[auto]". "[local]" will start a local Flink
+ * Cluster in the JVM, "[collection]" will execute the pipeline on Java Collections while
+ * "[auto]" will let the system decide where to execute the pipeline based on the environment.
+ */
+ @Description("Address of the Flink Master where the Pipeline should be executed. Can"
+ + " either be of the form \"host:port\" or one of the special values [local], "
+ + "[collection] or [auto].")
+ String getFlinkMaster();
+ void setFlinkMaster(String value);
+
+ @Description("The degree of parallelism to be used when distributing operations onto workers.")
+ @Default.InstanceFactory(DefaultParallelismFactory.class)
+ Integer getParallelism();
+ void setParallelism(Integer value);
+
+ @Description("The interval between consecutive checkpoints (i.e. snapshots of the current"
+ + "pipeline state used for fault tolerance).")
+ @Default.Long(-1L)
+ Long getCheckpointingInterval();
+ void setCheckpointingInterval(Long interval);
+
+ @Description("Sets the number of times that failed tasks are re-executed. "
+ + "A value of zero effectively disables fault tolerance. A value of -1 indicates "
+ + "that the system default value (as defined in the configuration) should be used.")
+ @Default.Integer(-1)
+ Integer getNumberOfExecutionRetries();
+ void setNumberOfExecutionRetries(Integer retries);
+
+ @Description("Sets the delay between executions. A value of {@code -1} "
+ + "indicates that the default value should be used.")
+ @Default.Long(-1L)
+ Long getExecutionRetryDelay();
+ void setExecutionRetryDelay(Long delay);
+
+ @Description("Sets the behavior of reusing objects.")
+ @Default.Boolean(false)
+ Boolean getObjectReuse();
+ void setObjectReuse(Boolean reuse);
+
+ /**
+ * State backend to store Beam's state during computation.
+ * Note: Only applicable when executing in streaming mode.
+ */
+ @Description("Sets the state backend to use in streaming mode. "
+ + "Otherwise the default is read from the Flink config.")
+ @JsonIgnore
+ AbstractStateBackend getStateBackend();
+ void setStateBackend(AbstractStateBackend stateBackend);
+
+}
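A short hedged sketch of building these options programmatically; the class name and the chosen values are illustrative, and any option left unset keeps the defaults declared above:

import org.apache.beam.sdk.options.PipelineOptionsFactory;

class FlinkOptionsExample {
  static FlinkPipelineOptions localOptions() {
    FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
    options.setFlinkMaster("[local]");         // or "host:port", "[collection]", "[auto]"
    options.setParallelism(4);
    options.setCheckpointingInterval(60000L);  // milliseconds; -1 leaves checkpointing disabled
    options.setObjectReuse(false);
    return options;
  }
}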
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineTranslator.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineTranslator.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineTranslator.java
new file mode 100644
index 0000000..65f416d
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineTranslator.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import org.apache.beam.sdk.Pipeline;
+
+/**
+ * The role of this class is to translate the Beam operators to
+ * their Flink counterparts. If we have a streaming job, this is instantiated as a
+ * {@link FlinkStreamingPipelineTranslator}. Otherwise, i.e. for a batch job,
+ * a {@link FlinkBatchPipelineTranslator} is created. Correspondingly, the
+ * {@link org.apache.beam.sdk.values.PCollection}-based user-provided job is translated into
+ * a {@link org.apache.flink.streaming.api.datastream.DataStream} (for streaming) or a
+ * {@link org.apache.flink.api.java.DataSet} (for batch) one.
+ */
+abstract class FlinkPipelineTranslator extends Pipeline.PipelineVisitor.Defaults {
+
+ /**
+ * Translates the pipeline by passing this class as a visitor.
+ * @param pipeline The pipeline to be translated
+ */
+ public void translate(Pipeline pipeline) {
+ pipeline.traverseTopologically(this);
+ }
+
+ /**
+ * Utility formatting method.
+ * @param n number of spaces to generate
+ * @return String with "|" followed by n spaces
+ */
+ protected static String genSpaces(int n) {
+ StringBuilder builder = new StringBuilder();
+ for (int i = 0; i < n; i++) {
+ builder.append("| ");
+ }
+ return builder.toString();
+ }
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkRunner.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkRunner.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkRunner.java
new file mode 100644
index 0000000..096f030
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkRunner.java
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import com.google.common.base.Joiner;
+import java.io.File;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.net.URLClassLoader;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.PipelineResult;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.options.PipelineOptionsValidator;
+import org.apache.beam.sdk.runners.PipelineRunner;
+import org.apache.beam.sdk.runners.TransformHierarchy;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.View;
+import org.apache.beam.sdk.values.PValue;
+import org.apache.flink.api.common.JobExecutionResult;
+import org.apache.flink.client.program.DetachedEnvironment;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A {@link PipelineRunner} that executes the operations in the
+ * pipeline by first translating them to a Flink Plan and then executing them either locally
+ * or on a Flink cluster, depending on the configuration.
+ */
+public class FlinkRunner extends PipelineRunner<PipelineResult> {
+
+ private static final Logger LOG = LoggerFactory.getLogger(FlinkRunner.class);
+
+ /**
+ * Provided options.
+ */
+ private final FlinkPipelineOptions options;
+
+ /**
+ * Construct a runner from the provided options.
+ *
+ * @param options Properties which configure the runner.
+ * @return The newly created runner.
+ */
+ public static FlinkRunner fromOptions(PipelineOptions options) {
+ FlinkPipelineOptions flinkOptions =
+ PipelineOptionsValidator.validate(FlinkPipelineOptions.class, options);
+ ArrayList<String> missing = new ArrayList<>();
+
+ if (flinkOptions.getAppName() == null) {
+ missing.add("appName");
+ }
+ if (missing.size() > 0) {
+ throw new IllegalArgumentException(
+ "Missing required values: " + Joiner.on(',').join(missing));
+ }
+
+ if (flinkOptions.getFilesToStage() == null) {
+ flinkOptions.setFilesToStage(detectClassPathResourcesToStage(
+ FlinkRunner.class.getClassLoader()));
+ LOG.info("PipelineOptions.filesToStage was not specified. "
+ + "Defaulting to files from the classpath: will stage {} files. "
+ + "Enable logging at DEBUG level to see which files will be staged.",
+ flinkOptions.getFilesToStage().size());
+ LOG.debug("Classpath elements: {}", flinkOptions.getFilesToStage());
+ }
+
+ // Set Flink Master to [auto] if no option was specified.
+ if (flinkOptions.getFlinkMaster() == null) {
+ flinkOptions.setFlinkMaster("[auto]");
+ }
+
+ return new FlinkRunner(flinkOptions);
+ }
+
+ private FlinkRunner(FlinkPipelineOptions options) {
+ this.options = options;
+ this.ptransformViewsWithNonDeterministicKeyCoders = new HashSet<>();
+ }
+
+ @Override
+ public PipelineResult run(Pipeline pipeline) {
+ logWarningIfPCollectionViewHasNonDeterministicKeyCoder(pipeline);
+
+ LOG.info("Executing pipeline using FlinkRunner.");
+
+ FlinkPipelineExecutionEnvironment env = new FlinkPipelineExecutionEnvironment(options);
+
+ LOG.info("Translating pipeline to Flink program.");
+ env.translate(this, pipeline);
+
+ JobExecutionResult result;
+ try {
+ LOG.info("Starting execution of Flink program.");
+ result = env.executePipeline();
+ } catch (Exception e) {
+ LOG.error("Pipeline execution failed", e);
+ throw new RuntimeException("Pipeline execution failed", e);
+ }
+
+ if (result instanceof DetachedEnvironment.DetachedJobExecutionResult) {
+ LOG.info("Pipeline submitted in Detached mode");
+ return new FlinkDetachedRunnerResult();
+ } else {
+ LOG.info("Execution finished in {} msecs", result.getNetRuntime());
+ Map<String, Object> accumulators = result.getAllAccumulatorResults();
+ if (accumulators != null && !accumulators.isEmpty()) {
+ LOG.info("Final aggregator values:");
+
+ for (Map.Entry<String, Object> entry : result.getAllAccumulatorResults().entrySet()) {
+ LOG.info("{} : {}", entry.getKey(), entry.getValue());
+ }
+ }
+
+ return new FlinkRunnerResult(accumulators, result.getNetRuntime());
+ }
+ }
+
+ /**
+ * For testing.
+ */
+ public FlinkPipelineOptions getPipelineOptions() {
+ return options;
+ }
+
+ @Override
+ public String toString() {
+ return "FlinkRunner#" + hashCode();
+ }
+
+ /**
+ * Attempts to detect all the resources the class loader has access to. This does not recurse
+ * to class loader parents, stopping it from pulling in resources from the system class loader.
+ *
+ * @param classLoader The URLClassLoader to use to detect resources to stage.
+ * @return A list of absolute paths to the resources the class loader uses.
+ * @throws IllegalArgumentException If either the class loader is not a URLClassLoader or one
+ * of the resources the class loader exposes is not a file resource.
+ */
+ protected static List<String> detectClassPathResourcesToStage(
+ ClassLoader classLoader) {
+ if (!(classLoader instanceof URLClassLoader)) {
+ String message = String.format("Unable to use ClassLoader to detect classpath elements. "
+ + "Current ClassLoader is %s, only URLClassLoaders are supported.", classLoader);
+ LOG.error(message);
+ throw new IllegalArgumentException(message);
+ }
+
+ List<String> files = new ArrayList<>();
+ for (URL url : ((URLClassLoader) classLoader).getURLs()) {
+ try {
+ files.add(new File(url.toURI()).getAbsolutePath());
+ } catch (IllegalArgumentException | URISyntaxException e) {
+ String message = String.format("Unable to convert url (%s) to file.", url);
+ LOG.error(message);
+ throw new IllegalArgumentException(message, e);
+ }
+ }
+ return files;
+ }
+
+ /** A set of {@link View}s with non-deterministic key coders. */
+ Set<PTransform<?, ?>> ptransformViewsWithNonDeterministicKeyCoders;
+
+ /**
+ * Records that the {@link PTransform} requires a deterministic key coder.
+ */
+ void recordViewUsesNonDeterministicKeyCoder(PTransform<?, ?> ptransform) {
+ ptransformViewsWithNonDeterministicKeyCoders.add(ptransform);
+ }
+
+ /** Outputs a warning about PCollection views without deterministic key coders. */
+ private void logWarningIfPCollectionViewHasNonDeterministicKeyCoder(Pipeline pipeline) {
+ // We need to wait until this point to determine the names of the transforms, since only
+ // at this time do we know the transform hierarchy; otherwise we could
+ // have just recorded the full names during apply time.
+ if (!ptransformViewsWithNonDeterministicKeyCoders.isEmpty()) {
+ final SortedSet<String> ptransformViewNamesWithNonDeterministicKeyCoders = new TreeSet<>();
+ pipeline.traverseTopologically(new Pipeline.PipelineVisitor() {
+ @Override
+ public void visitValue(PValue value, TransformHierarchy.Node producer) {
+ }
+
+ @Override
+ public void visitPrimitiveTransform(TransformHierarchy.Node node) {
+ if (ptransformViewsWithNonDeterministicKeyCoders.contains(node.getTransform())) {
+ ptransformViewNamesWithNonDeterministicKeyCoders.add(node.getFullName());
+ }
+ }
+
+ @Override
+ public CompositeBehavior enterCompositeTransform(TransformHierarchy.Node node) {
+ if (ptransformViewsWithNonDeterministicKeyCoders.contains(node.getTransform())) {
+ ptransformViewNamesWithNonDeterministicKeyCoders.add(node.getFullName());
+ }
+ return CompositeBehavior.ENTER_TRANSFORM;
+ }
+
+ @Override
+ public void leaveCompositeTransform(TransformHierarchy.Node node) {
+ }
+ });
+
+ LOG.warn("Unable to use indexed implementation for View.AsMap and View.AsMultimap for {} "
+ + "because the key coder is not deterministic. Falling back to singleton implementation "
+ + "which may cause memory and/or performance problems. Future major versions of "
+ + "the Flink runner will require deterministic key coders.",
+ ptransformViewNamesWithNonDeterministicKeyCoders);
+ }
+ }
+}
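A minimal end-to-end sketch of selecting this runner explicitly; the pipeline content and class name are illustrative:

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;

public class FlinkRunnerExample {
  public static void main(String[] args) {
    FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
    options.setRunner(FlinkRunner.class);  // setRunner is inherited from PipelineOptions
    options.setFlinkMaster("[local]");     // execute against an embedded local Flink cluster

    Pipeline p = Pipeline.create(options);
    p.apply(Create.of("a", "b", "c"));
    p.run();                               // translated and executed by FlinkRunner.run()
  }
}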
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkRunnerRegistrar.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkRunnerRegistrar.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkRunnerRegistrar.java
new file mode 100644
index 0000000..681459a
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkRunnerRegistrar.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.beam.runners.flink;
+
+import com.google.auto.service.AutoService;
+import com.google.common.collect.ImmutableList;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.options.PipelineOptionsRegistrar;
+import org.apache.beam.sdk.runners.PipelineRunner;
+import org.apache.beam.sdk.runners.PipelineRunnerRegistrar;
+
+
+/**
+ * AutoService registrar - will register FlinkRunner and FlinkPipelineOptions
+ * as possible pipeline runner services.
+ *
+ * <p>It ends up in META-INF/services and gets picked up by Beam.
+ *
+ */
+public class FlinkRunnerRegistrar {
+ private FlinkRunnerRegistrar() { }
+
+ /**
+ * Pipeline runner registrar.
+ */
+ @AutoService(PipelineRunnerRegistrar.class)
+ public static class Runner implements PipelineRunnerRegistrar {
+ @Override
+ public Iterable<Class<? extends PipelineRunner<?>>> getPipelineRunners() {
+ return ImmutableList.<Class<? extends PipelineRunner<?>>>of(
+ FlinkRunner.class,
+ TestFlinkRunner.class);
+ }
+ }
+
+ /**
+ * Pipeline options registrar.
+ */
+ @AutoService(PipelineOptionsRegistrar.class)
+ public static class Options implements PipelineOptionsRegistrar {
+ @Override
+ public Iterable<Class<? extends PipelineOptions>> getPipelineOptions() {
+ return ImmutableList.<Class<? extends PipelineOptions>>of(FlinkPipelineOptions.class);
+ }
+ }
+}
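Because of the AutoService registration above, the runner and its options can also be resolved by name from command-line arguments; a hedged sketch, with the argument values purely illustrative:

import org.apache.beam.sdk.options.PipelineOptionsFactory;

class RegistrarExample {
  // "--runner=FlinkRunner" resolves through the registrar without referencing the class directly.
  static FlinkPipelineOptions fromArgs(String[] args) {
    // e.g. args = {"--runner=FlinkRunner", "--flinkMaster=[auto]"}
    return PipelineOptionsFactory.fromArgs(args).withValidation().as(FlinkPipelineOptions.class);
  }
}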
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkRunnerResult.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkRunnerResult.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkRunnerResult.java
new file mode 100644
index 0000000..0682b56
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkRunnerResult.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Map;
+import org.apache.beam.sdk.AggregatorRetrievalException;
+import org.apache.beam.sdk.AggregatorValues;
+import org.apache.beam.sdk.PipelineResult;
+import org.apache.beam.sdk.metrics.MetricResults;
+import org.apache.beam.sdk.transforms.Aggregator;
+import org.joda.time.Duration;
+
+/**
+ * Result of executing a {@link org.apache.beam.sdk.Pipeline} with Flink. This
+ * has methods to query the job runtime and the final values of
+ * {@link org.apache.beam.sdk.transforms.Aggregator}s.
+ */
+public class FlinkRunnerResult implements PipelineResult {
+
+ private final Map<String, Object> aggregators;
+
+ private final long runtime;
+
+ FlinkRunnerResult(Map<String, Object> aggregators, long runtime) {
+ this.aggregators = (aggregators == null || aggregators.isEmpty())
+ ? Collections.<String, Object>emptyMap()
+ : Collections.unmodifiableMap(aggregators);
+ this.runtime = runtime;
+ }
+
+ @Override
+ public State getState() {
+ return State.DONE;
+ }
+
+ @Override
+ public <T> AggregatorValues<T> getAggregatorValues(final Aggregator<?, T> aggregator)
+ throws AggregatorRetrievalException {
+ // TODO provide a list of all accumulator step values
+ Object value = aggregators.get(aggregator.getName());
+ if (value != null) {
+ return new AggregatorValues<T>() {
+ @Override
+ public Map<String, T> getValuesAtSteps() {
+ return (Map<String, T>) aggregators;
+ }
+ };
+ } else {
+ throw new AggregatorRetrievalException("Accumulator results not found.",
+ new RuntimeException("Accumulator does not exist."));
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "FlinkRunnerResult{"
+ + "aggregators=" + aggregators
+ + ", runtime=" + runtime
+ + '}';
+ }
+
+ @Override
+ public State cancel() throws IOException {
+ throw new UnsupportedOperationException("FlinkRunnerResult does not support cancel.");
+ }
+
+ @Override
+ public State waitUntilFinish() {
+ return State.DONE;
+ }
+
+ @Override
+ public State waitUntilFinish(Duration duration) {
+ return State.DONE;
+ }
+
+ @Override
+ public MetricResults metrics() {
+ throw new UnsupportedOperationException("The FlinkRunner does not currently support metrics.");
+ }
+}
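A hedged sketch of reading an aggregator value back from the result; the 'recordCount' aggregator is illustrative and would be created inside a DoFn of the executed pipeline:

import java.util.Map;
import org.apache.beam.sdk.AggregatorRetrievalException;
import org.apache.beam.sdk.AggregatorValues;
import org.apache.beam.sdk.transforms.Aggregator;

class AggregatorReadExample {
  // Assumes 'result' is the FlinkRunnerResult returned from a completed run.
  static Map<String, Long> finalCounts(
      FlinkRunnerResult result, Aggregator<Long, Long> recordCount)
      throws AggregatorRetrievalException {
    AggregatorValues<Long> values = result.getAggregatorValues(recordCount);
    return values.getValuesAtSteps();  // currently a single flat map of final accumulator values
  }
}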
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingPipelineTranslator.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingPipelineTranslator.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingPipelineTranslator.java
new file mode 100644
index 0000000..0459ef7
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingPipelineTranslator.java
@@ -0,0 +1,276 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import com.google.common.collect.ImmutableList;
+import java.util.List;
+import java.util.Map;
+import org.apache.beam.runners.core.SplittableParDo;
+import org.apache.beam.runners.core.construction.PTransformMatchers;
+import org.apache.beam.runners.core.construction.PTransformReplacements;
+import org.apache.beam.runners.core.construction.ReplacementOutputs;
+import org.apache.beam.runners.core.construction.SingleInputOutputOverrideFactory;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.runners.PTransformOverride;
+import org.apache.beam.sdk.runners.PTransformOverrideFactory;
+import org.apache.beam.sdk.runners.TransformHierarchy;
+import org.apache.beam.sdk.transforms.AppliedPTransform;
+import org.apache.beam.sdk.transforms.Combine;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo.MultiOutput;
+import org.apache.beam.sdk.transforms.View;
+import org.apache.beam.sdk.util.InstanceBuilder;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PCollectionTuple;
+import org.apache.beam.sdk.values.PValue;
+import org.apache.beam.sdk.values.TupleTag;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This is a {@link FlinkPipelineTranslator} for streaming jobs. Its role is to translate
+ * the user-provided {@link org.apache.beam.sdk.values.PCollection}-based job into a
+ * {@link org.apache.flink.streaming.api.datastream.DataStream} one.
+ *
+ */
+class FlinkStreamingPipelineTranslator extends FlinkPipelineTranslator {
+
+ private static final Logger LOG = LoggerFactory.getLogger(FlinkStreamingPipelineTranslator.class);
+
+ /** The necessary context in the case of a streaming job. */
+ private final FlinkStreamingTranslationContext streamingContext;
+
+ private int depth = 0;
+
+ private FlinkRunner flinkRunner;
+
+ public FlinkStreamingPipelineTranslator(
+ FlinkRunner flinkRunner,
+ StreamExecutionEnvironment env,
+ PipelineOptions options) {
+ this.streamingContext = new FlinkStreamingTranslationContext(env, options);
+ this.flinkRunner = flinkRunner;
+ }
+
+ @Override
+ public void translate(Pipeline pipeline) {
+ List<PTransformOverride> transformOverrides =
+ ImmutableList.<PTransformOverride>builder()
+ .add(
+ PTransformOverride.of(
+ PTransformMatchers.splittableParDoMulti(),
+ new SplittableParDoOverrideFactory()))
+ .add(
+ PTransformOverride.of(
+ PTransformMatchers.classEqualTo(View.AsIterable.class),
+ new ReflectiveOneToOneOverrideFactory(
+ FlinkStreamingViewOverrides.StreamingViewAsIterable.class, flinkRunner)))
+ .add(
+ PTransformOverride.of(
+ PTransformMatchers.classEqualTo(View.AsList.class),
+ new ReflectiveOneToOneOverrideFactory(
+ FlinkStreamingViewOverrides.StreamingViewAsList.class, flinkRunner)))
+ .add(
+ PTransformOverride.of(
+ PTransformMatchers.classEqualTo(View.AsMap.class),
+ new ReflectiveOneToOneOverrideFactory(
+ FlinkStreamingViewOverrides.StreamingViewAsMap.class, flinkRunner)))
+ .add(
+ PTransformOverride.of(
+ PTransformMatchers.classEqualTo(View.AsMultimap.class),
+ new ReflectiveOneToOneOverrideFactory(
+ FlinkStreamingViewOverrides.StreamingViewAsMultimap.class, flinkRunner)))
+ .add(
+ PTransformOverride.of(
+ PTransformMatchers.classEqualTo(View.AsSingleton.class),
+ new ReflectiveOneToOneOverrideFactory(
+ FlinkStreamingViewOverrides.StreamingViewAsSingleton.class, flinkRunner)))
+ // this has to be last since the ViewAsSingleton override
+ // can expand to a Combine.GloballyAsSingletonView
+ .add(
+ PTransformOverride.of(
+ PTransformMatchers.classEqualTo(Combine.GloballyAsSingletonView.class),
+ new ReflectiveOneToOneOverrideFactory(
+ FlinkStreamingViewOverrides.StreamingCombineGloballyAsSingletonView.class,
+ flinkRunner)))
+ .build();
+
+ pipeline.replaceAll(transformOverrides);
+ super.translate(pipeline);
+ }
+
+ // --------------------------------------------------------------------------------------------
+ // Pipeline Visitor Methods
+ // --------------------------------------------------------------------------------------------
+
+ @Override
+ public CompositeBehavior enterCompositeTransform(TransformHierarchy.Node node) {
+ LOG.info("{} enterCompositeTransform- {}", genSpaces(this.depth), node.getFullName());
+ this.depth++;
+
+ PTransform<?, ?> transform = node.getTransform();
+ if (transform != null) {
+ StreamTransformTranslator<?> translator =
+ FlinkStreamingTransformTranslators.getTranslator(transform);
+
+ if (translator != null && applyCanTranslate(transform, node, translator)) {
+ applyStreamingTransform(transform, node, translator);
+ LOG.info("{} translated- {}", genSpaces(this.depth), node.getFullName());
+ return CompositeBehavior.DO_NOT_ENTER_TRANSFORM;
+ }
+ }
+ return CompositeBehavior.ENTER_TRANSFORM;
+ }
+
+ @Override
+ public void leaveCompositeTransform(TransformHierarchy.Node node) {
+ this.depth--;
+ LOG.info("{} leaveCompositeTransform- {}", genSpaces(this.depth), node.getFullName());
+ }
+
+ @Override
+ public void visitPrimitiveTransform(TransformHierarchy.Node node) {
+ LOG.info("{} visitPrimitiveTransform- {}", genSpaces(this.depth), node.getFullName());
+ // get the transformation corresponding to the node we are
+ // currently visiting and translate it into its Flink alternative.
+
+ PTransform<?, ?> transform = node.getTransform();
+ StreamTransformTranslator<?> translator =
+ FlinkStreamingTransformTranslators.getTranslator(transform);
+
+ if (translator == null || !applyCanTranslate(transform, node, translator)) {
+ LOG.info(node.getTransform().getClass().toString());
+ throw new UnsupportedOperationException(
+ "The transform " + transform + " is currently not supported.");
+ }
+ applyStreamingTransform(transform, node, translator);
+ }
+
+ @Override
+ public void visitValue(PValue value, TransformHierarchy.Node producer) {
+ // do nothing here
+ }
+
+ private <T extends PTransform<?, ?>> void applyStreamingTransform(
+ PTransform<?, ?> transform,
+ TransformHierarchy.Node node,
+ StreamTransformTranslator<?> translator) {
+
+ @SuppressWarnings("unchecked")
+ T typedTransform = (T) transform;
+
+ @SuppressWarnings("unchecked")
+ StreamTransformTranslator<T> typedTranslator = (StreamTransformTranslator<T>) translator;
+
+ // create the applied PTransform on the streamingContext
+ streamingContext.setCurrentTransform(node.toAppliedPTransform());
+ typedTranslator.translateNode(typedTransform, streamingContext);
+ }
+
+ private <T extends PTransform<?, ?>> boolean applyCanTranslate(
+ PTransform<?, ?> transform,
+ TransformHierarchy.Node node,
+ StreamTransformTranslator<?> translator) {
+
+ @SuppressWarnings("unchecked")
+ T typedTransform = (T) transform;
+
+ @SuppressWarnings("unchecked")
+ StreamTransformTranslator<T> typedTranslator = (StreamTransformTranslator<T>) translator;
+
+ streamingContext.setCurrentTransform(node.toAppliedPTransform());
+
+ return typedTranslator.canTranslate(typedTransform, streamingContext);
+ }
+
+ /**
+ * The interface that every Flink translator of a Beam operator should implement.
+ * This interface is for <b>streaming</b> jobs. For examples of such translators see
+ * {@link FlinkStreamingTransformTranslators}.
+ */
+ abstract static class StreamTransformTranslator<T extends PTransform> {
+
+ /**
+ * Translate the given transform.
+ */
+ abstract void translateNode(T transform, FlinkStreamingTranslationContext context);
+
+ /**
+ * Returns true iff this translator can translate the given transform.
+ */
+ boolean canTranslate(T transform, FlinkStreamingTranslationContext context) {
+ return true;
+ }
+ }
+
+ private static class ReflectiveOneToOneOverrideFactory<
+ InputT, OutputT, TransformT extends PTransform<PCollection<InputT>, PCollection<OutputT>>>
+ extends SingleInputOutputOverrideFactory<
+ PCollection<InputT>, PCollection<OutputT>, TransformT> {
+ private final Class<PTransform<PCollection<InputT>, PCollection<OutputT>>> replacement;
+ private final FlinkRunner runner;
+
+ private ReflectiveOneToOneOverrideFactory(
+ Class<PTransform<PCollection<InputT>, PCollection<OutputT>>> replacement,
+ FlinkRunner runner) {
+ this.replacement = replacement;
+ this.runner = runner;
+ }
+
+ @Override
+ public PTransformReplacement<PCollection<InputT>, PCollection<OutputT>> getReplacementTransform(
+ AppliedPTransform<PCollection<InputT>, PCollection<OutputT>, TransformT> transform) {
+ return PTransformReplacement.of(
+ PTransformReplacements.getSingletonMainInput(transform),
+ InstanceBuilder.ofType(replacement)
+ .withArg(FlinkRunner.class, runner)
+ .withArg(
+ (Class<PTransform<PCollection<InputT>, PCollection<OutputT>>>)
+ transform.getTransform().getClass(),
+ transform.getTransform())
+ .build());
+ }
+ }
+
+ /**
+ * A {@link PTransformOverrideFactory} that overrides a <a
+ * href="https://s.apache.org/splittable-do-fn">Splittable DoFn</a> with {@link SplittableParDo}.
+ */
+ static class SplittableParDoOverrideFactory<InputT, OutputT>
+ implements PTransformOverrideFactory<
+ PCollection<InputT>, PCollectionTuple, MultiOutput<InputT, OutputT>> {
+ @Override
+ public PTransformReplacement<PCollection<InputT>, PCollectionTuple>
+ getReplacementTransform(
+ AppliedPTransform<
+ PCollection<InputT>, PCollectionTuple, MultiOutput<InputT, OutputT>>
+ transform) {
+ return PTransformReplacement.of(
+ PTransformReplacements.getSingletonMainInput(transform),
+ new SplittableParDo<>(transform.getTransform()));
+ }
+
+ @Override
+ public Map<PValue, ReplacementOutput> mapOutputs(
+ Map<TupleTag<?>, PValue> outputs, PCollectionTuple newOutput) {
+ return ReplacementOutputs.tagged(outputs, newOutput);
+ }
+ }
+}
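For orientation, a translator dispatched by the visitor above would extend the nested StreamTransformTranslator base class; a minimal hypothetical sketch, assuming code in the same package (real implementations live in FlinkStreamingTransformTranslators):

// Hypothetical translator; the transform type and the body are illustrative.
class MyTransformTranslator
    extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<PTransform<?, ?>> {

  @Override
  void translateNode(PTransform<?, ?> transform, FlinkStreamingTranslationContext context) {
    // Look up the transform's inputs/outputs via the context and emit the
    // corresponding Flink DataStream operations for them.
  }

  @Override
  boolean canTranslate(PTransform<?, ?> transform, FlinkStreamingTranslationContext context) {
    return true;  // default behavior; override only when translation is conditional
  }
}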
[18/50] [abbrv] beam git commit: Refactor batch loads, and add support for windowed writes.
Posted by dh...@apache.org.
Refactor batch loads, and add support for windowed writes.
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/760a9458
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/760a9458
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/760a9458
Branch: refs/heads/DSL_SQL
Commit: 760a94580d7561bb63a3eea67d8e5443c233a541
Parents: 8581caf
Author: Reuven Lax <re...@google.com>
Authored: Fri Mar 31 11:19:25 2017 -0700
Committer: Eugene Kirpichov <ki...@google.com>
Committed: Tue Apr 18 21:12:50 2017 -0700
----------------------------------------------------------------------
.../apache/beam/sdk/util/IOChannelUtils.java | 9 +
.../beam/sdk/io/gcp/bigquery/BatchLoads.java | 49 +-
.../beam/sdk/io/gcp/bigquery/ShardedKey.java | 24 +-
.../sdk/io/gcp/bigquery/TableDestination.java | 10 +-
.../io/gcp/bigquery/WriteBundlesToFiles.java | 54 +-
.../sdk/io/gcp/bigquery/WritePartition.java | 28 +-
.../beam/sdk/io/gcp/bigquery/WriteRename.java | 13 +-
.../beam/sdk/io/gcp/bigquery/WriteTables.java | 14 +-
.../sdk/io/gcp/bigquery/BigQueryIOTest.java | 838 +++++--------------
.../io/gcp/bigquery/FakeBigQueryServices.java | 96 +++
.../sdk/io/gcp/bigquery/FakeDatasetService.java | 172 ++++
.../sdk/io/gcp/bigquery/FakeJobService.java | 273 ++++++
.../sdk/io/gcp/bigquery/TableContainer.java | 36 +
13 files changed, 948 insertions(+), 668 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/760a9458/sdks/java/core/src/main/java/org/apache/beam/sdk/util/IOChannelUtils.java
----------------------------------------------------------------------
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/IOChannelUtils.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/IOChannelUtils.java
index ea53527..9d3dd23 100644
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/IOChannelUtils.java
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/IOChannelUtils.java
@@ -28,6 +28,7 @@ import com.google.common.collect.Sets;
import com.google.common.collect.TreeMultimap;
import java.io.FileNotFoundException;
import java.io.IOException;
+import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.text.DecimalFormat;
import java.util.Arrays;
@@ -181,6 +182,14 @@ public class IOChannelUtils {
}
/**
+ * Creates a read channel for the given filename.
+ */
+ public static ReadableByteChannel open(String filename)
+ throws IOException {
+ return getFactory(filename).open(filename);
+ }
+
+ /**
* Creates a write channel for the given file components.
*
* <p>If numShards is specified, then a ShardingWritableByteChannel is
http://git-wip-us.apache.org/repos/asf/beam/blob/760a9458/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java
index 8594211..5e80fae 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BatchLoads.java
@@ -26,6 +26,10 @@ import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;
import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.KvCoder;
+import org.apache.beam.sdk.coders.ListCoder;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.options.BigQueryOptions;
@@ -61,16 +65,17 @@ class BatchLoads<T> extends
private static class ConstantSchemaFunction implements
SerializableFunction<TableDestination, TableSchema> {
private final @Nullable
- String jsonSchema;
+ ValueProvider<String> jsonSchema;
- ConstantSchemaFunction(TableSchema schema) {
- this.jsonSchema = BigQueryHelpers.toJsonString(schema);
+ ConstantSchemaFunction(ValueProvider<String> jsonSchema) {
+ this.jsonSchema = jsonSchema;
}
@Override
@Nullable
public TableSchema apply(TableDestination table) {
- return BigQueryHelpers.fromJsonString(jsonSchema, TableSchema.class);
+ return BigQueryHelpers.fromJsonString(
+ jsonSchema == null ? null : jsonSchema.get(), TableSchema.class);
}
}
@@ -114,7 +119,7 @@ class BatchLoads<T> extends
.apply(View.<String>asSingleton());
PCollection<KV<TableDestination, TableRow>> inputInGlobalWindow =
- input.apply(
+ input.apply("rewindowIntoGlobal",
Window.<KV<TableDestination, TableRow>>into(new GlobalWindows())
.triggering(DefaultTrigger.of())
.discardingFiredPanes());
@@ -122,12 +127,13 @@ class BatchLoads<T> extends
// PCollection of filename, file byte size, and table destination.
PCollection<WriteBundlesToFiles.Result> results = inputInGlobalWindow
.apply("WriteBundlesToFiles",
- ParDo.of(new WriteBundlesToFiles(tempFilePrefix)));
+ ParDo.of(new WriteBundlesToFiles(tempFilePrefix)))
+ .setCoder(WriteBundlesToFiles.ResultCoder.of());
- TupleTag<KV<KV<TableDestination, Integer>, List<String>>> multiPartitionsTag =
- new TupleTag<KV<KV<TableDestination, Integer>, List<String>>>("multiPartitionsTag") {};
- TupleTag<KV<KV<TableDestination, Integer>, List<String>>> singlePartitionTag =
- new TupleTag<KV<KV<TableDestination, Integer>, List<String>>>("singlePartitionTag") {};
+ TupleTag<KV<ShardedKey<TableDestination>, List<String>>> multiPartitionsTag =
+ new TupleTag<KV<ShardedKey<TableDestination>, List<String>>>("multiPartitionsTag") {};
+ TupleTag<KV<ShardedKey<TableDestination>, List<String>>> singlePartitionTag =
+ new TupleTag<KV<ShardedKey<TableDestination>, List<String>>>("singlePartitionTag") {};
// Turn the list of files and record counts in a PCollectionView that can be used as a
// side input.
@@ -136,9 +142,9 @@ class BatchLoads<T> extends
// This transform will look at the set of files written for each table, and if any table has
// too many files or bytes, will partition that table's files into multiple partitions for
// loading.
- PCollectionTuple partitions = singleton.apply(ParDo
- .of(new WritePartition(
- write.getTable(),
+ PCollectionTuple partitions = singleton.apply("WritePartition",
+ ParDo.of(new WritePartition(
+ write.getJsonTableRef(),
write.getTableDescription(),
resultsView,
multiPartitionsTag,
@@ -148,17 +154,22 @@ class BatchLoads<T> extends
// Since BigQueryIO.java does not yet have support for per-table schemas, inject a constant
// schema function here. If no schema is specified, this function will return null.
+ // TODO: Turn this into a side-input instead.
SerializableFunction<TableDestination, TableSchema> schemaFunction =
- new ConstantSchemaFunction(write.getSchema());
+ new ConstantSchemaFunction(write.getJsonSchema());
+ Coder<KV<ShardedKey<TableDestination>, List<String>>> partitionsCoder =
+ KvCoder.of(ShardedKeyCoder.of(TableDestinationCoder.of()),
+ ListCoder.of(StringUtf8Coder.of()));
// If WriteBundlesToFiles produced more than MAX_NUM_FILES files or MAX_SIZE_BYTES bytes, then
// the import needs to be split into multiple partitions, and those partitions will be
// specified in multiPartitionsTag.
PCollection<KV<TableDestination, String>> tempTables = partitions.get(multiPartitionsTag)
+ .setCoder(partitionsCoder)
// What's this GroupByKey for? Is this so we have a deterministic temp tables? If so, maybe
// Reshuffle is better here.
.apply("MultiPartitionsGroupByKey",
- GroupByKey.<KV<TableDestination, Integer>, List<String>>create())
+ GroupByKey.<ShardedKey<TableDestination>, List<String>>create())
.apply("MultiPartitionsWriteTables", ParDo.of(new WriteTables(
false,
write.getBigQueryServices(),
@@ -174,20 +185,20 @@ class BatchLoads<T> extends
PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView = tempTables
.apply("TempTablesView", View.<TableDestination, String>asMultimap());
- singleton.apply(ParDo
+ singleton.apply("WriteRename", ParDo
.of(new WriteRename(
write.getBigQueryServices(),
jobIdTokenView,
write.getWriteDisposition(),
write.getCreateDisposition(),
- tempTablesView,
- write.getTableDescription()))
+ tempTablesView))
.withSideInputs(tempTablesView, jobIdTokenView));
// Write single partition to final table
partitions.get(singlePartitionTag)
+ .setCoder(partitionsCoder)
.apply("SinglePartitionGroupByKey",
- GroupByKey.<KV<TableDestination, Integer>, List<String>>create())
+ GroupByKey.<ShardedKey<TableDestination>, List<String>>create())
.apply("SinglePartitionWriteTables", ParDo.of(new WriteTables(
true,
write.getBigQueryServices(),
http://git-wip-us.apache.org/repos/asf/beam/blob/760a9458/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/ShardedKey.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/ShardedKey.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/ShardedKey.java
index 8c968df..ab57446 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/ShardedKey.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/ShardedKey.java
@@ -18,10 +18,13 @@
package org.apache.beam.sdk.io.gcp.bigquery;
+import java.io.Serializable;
+import java.util.Objects;
+
/**
* A key and a shard number.
*/
-class ShardedKey<K> {
+class ShardedKey<K> implements Serializable {
private final K key;
private final int shardNumber;
@@ -41,4 +44,23 @@ class ShardedKey<K> {
public int getShardNumber() {
return shardNumber;
}
+
+ @Override
+ public String toString() {
+ return "key: " + key + " shard: " + shardNumber;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (!(o instanceof ShardedKey)) {
+ return false;
+ }
+ ShardedKey<?> other = (ShardedKey<?>) o;
+ return Objects.equals(key, other.key) && (shardNumber == other.shardNumber);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(key, shardNumber);
+ }
}
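
With equals and hashCode defined, two ShardedKey instances built from the same key and shard number now compare equal by value, which is what allows them to serve as grouping keys. A standalone plain-Java illustration of that value semantics (ShardedKeySketch and its main method are only for demonstration and are not part of the commit):

import java.io.Serializable;
import java.util.Objects;

final class ShardedKeySketch<K> implements Serializable {
  private final K key;
  private final int shardNumber;

  ShardedKeySketch(K key, int shardNumber) {
    this.key = key;
    this.shardNumber = shardNumber;
  }

  @Override
  public boolean equals(Object o) {
    if (!(o instanceof ShardedKeySketch)) {
      return false;
    }
    ShardedKeySketch<?> other = (ShardedKeySketch<?>) o;
    // Objects.equals delegates to the key's own equals() and tolerates null keys.
    return Objects.equals(key, other.key) && shardNumber == other.shardNumber;
  }

  @Override
  public int hashCode() {
    return Objects.hash(key, shardNumber);
  }

  public static void main(String[] args) {
    ShardedKeySketch<String> a = new ShardedKeySketch<>("dataset.table", 1);
    ShardedKeySketch<String> b = new ShardedKeySketch<>("dataset.table", 1);
    System.out.println(a.equals(b));  // true: value equality, not reference equality
  }
}
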
http://git-wip-us.apache.org/repos/asf/beam/blob/760a9458/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
index 1c2b256..e8538e0 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableDestination.java
@@ -20,12 +20,13 @@ package org.apache.beam.sdk.io.gcp.bigquery;
import com.google.api.services.bigquery.model.TableReference;
+import java.io.Serializable;
import java.util.Objects;
/**
* Encapsulates a BigQuery table destination.
*/
-public class TableDestination {
+public class TableDestination implements Serializable {
private final String tableSpec;
private final String tableDescription;
@@ -53,12 +54,17 @@ public class TableDestination {
}
@Override
+ public String toString() {
+ return "tableSpec: " + tableSpec + " tableDescription: " + tableDescription;
+ }
+
+ @Override
public boolean equals(Object o) {
if (!(o instanceof TableDestination)) {
return false;
}
TableDestination other = (TableDestination) o;
- return tableSpec == other.tableSpec && tableDescription == other.tableDescription;
+ return Objects.equals(tableSpec, other.tableSpec)
+ && Objects.equals(tableDescription, other.tableDescription);
}
@Override
http://git-wip-us.apache.org/repos/asf/beam/blob/760a9458/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java
index 4e6167b..b8069f6 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteBundlesToFiles.java
@@ -20,10 +20,19 @@ package org.apache.beam.sdk.io.gcp.bigquery;
import com.google.api.services.bigquery.model.TableRow;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.Serializable;
import java.util.Map;
import java.util.UUID;
import com.google.common.collect.Maps;
+import org.apache.beam.sdk.coders.AtomicCoder;
+import org.apache.beam.sdk.coders.CoderException;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.coders.TableRowJsonCoder;
+import org.apache.beam.sdk.coders.VarLongCoder;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.values.KV;
@@ -41,7 +50,7 @@ class WriteBundlesToFiles extends DoFn<KV<TableDestination, TableRow>, WriteBund
private transient Map<TableDestination, TableRowWriter> writers;
private final String tempFilePrefix;
- public static class Result {
+ public static class Result implements Serializable {
public String filename;
public Long fileByteSize;
public TableDestination tableDestination;
@@ -52,15 +61,54 @@ class WriteBundlesToFiles extends DoFn<KV<TableDestination, TableRow>, WriteBund
this.tableDestination = tableDestination;
}
}
+
+ public static class ResultCoder extends AtomicCoder<Result> {
+ private static final ResultCoder INSTANCE = new ResultCoder();
+
+ public static ResultCoder of() {
+ return INSTANCE;
+ }
+
+ @Override
+ public void encode(Result value, OutputStream outStream, Context context)
+ throws IOException {
+ if (value == null) {
+ throw new CoderException("cannot encode a null value");
+ }
+ stringCoder.encode(value.filename, outStream, context.nested());
+ longCoder.encode(value.fileByteSize, outStream, context.nested());
+ tableDestinationCoder.encode(value.tableDestination, outStream, context.nested());
+ }
+
+ @Override
+ public Result decode(InputStream inStream, Context context)
+ throws IOException {
+ return new Result(stringCoder.decode(inStream, context.nested()),
+ longCoder.decode(inStream, context.nested()),
+ tableDestinationCoder.decode(inStream, context.nested()));
+ }
+
+ @Override
+ public void verifyDeterministic() throws NonDeterministicException {
+ }
+
+ StringUtf8Coder stringCoder = StringUtf8Coder.of();
+ VarLongCoder longCoder = VarLongCoder.of();
+ TableDestinationCoder tableDestinationCoder = TableDestinationCoder.of();
+ }
+
WriteBundlesToFiles(String tempFilePrefix) {
this.tempFilePrefix = tempFilePrefix;
+ }
+
+ @StartBundle
+ public void startBundle(Context c) {
this.writers = Maps.newHashMap();
}
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
- // ??? can we assume Java8?
- TableRowWriter writer = writers.getOrDefault(c.element().getKey(), null);
+ TableRowWriter writer = writers.get(c.element().getKey());
if (writer == null) {
writer = new TableRowWriter(tempFilePrefix);
writer.open(UUID.randomUUID().toString());
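
The ResultCoder introduced above serializes a Result field by field by delegating each field to an existing coder. The same pattern, reduced to a hypothetical two-field value so its shape is easier to see (FileSummary and FileSummaryCoder are made-up names, not classes from this change):

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.beam.sdk.coders.AtomicCoder;
import org.apache.beam.sdk.coders.CoderException;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarLongCoder;

// Hypothetical value with two fields, used only to illustrate the coder pattern.
class FileSummary {
  final String filename;
  final long byteSize;

  FileSummary(String filename, long byteSize) {
    this.filename = filename;
    this.byteSize = byteSize;
  }
}

// Encodes a FileSummary by writing each field with the coder for its type, in a
// fixed order; decode reads the fields back in the same order.
class FileSummaryCoder extends AtomicCoder<FileSummary> {
  private static final StringUtf8Coder STRING_CODER = StringUtf8Coder.of();
  private static final VarLongCoder LONG_CODER = VarLongCoder.of();

  @Override
  public void encode(FileSummary value, OutputStream outStream, Context context)
      throws IOException {
    if (value == null) {
      throw new CoderException("cannot encode a null FileSummary");
    }
    // Nested contexts, because each field is followed by more data in the stream.
    STRING_CODER.encode(value.filename, outStream, context.nested());
    LONG_CODER.encode(value.byteSize, outStream, context.nested());
  }

  @Override
  public FileSummary decode(InputStream inStream, Context context) throws IOException {
    return new FileSummary(
        STRING_CODER.decode(inStream, context.nested()),
        LONG_CODER.decode(inStream, context.nested()));
  }

  @Override
  public void verifyDeterministic() throws NonDeterministicException {
    // Both field coders are deterministic, so there is nothing to flag here.
  }
}
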
http://git-wip-us.apache.org/repos/asf/beam/blob/760a9458/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java
index 8e1b16d..c48955b 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WritePartition.java
@@ -37,20 +37,20 @@ import org.apache.beam.sdk.values.TupleTag;
* Partitions temporary files based on number of files and file sizes. Output key is a pair of
* tablespec and the list of files corresponding to each partition of that table.
*/
-class WritePartition extends DoFn<String, KV<KV<TableDestination, Integer>, List<String>>> {
- private final ValueProvider<TableReference> singletonOutputTable;
+class WritePartition extends DoFn<String, KV<ShardedKey<TableDestination>, List<String>>> {
+ private final ValueProvider<String> singletonOutputJsonTableRef;
private final String singletonOutputTableDescription;
private final PCollectionView<Iterable<WriteBundlesToFiles.Result>> resultsView;
- private TupleTag<KV<KV<TableDestination, Integer>, List<String>>> multiPartitionsTag;
- private TupleTag<KV<KV<TableDestination, Integer>, List<String>>> singlePartitionTag;
+ private TupleTag<KV<ShardedKey<TableDestination>, List<String>>> multiPartitionsTag;
+ private TupleTag<KV<ShardedKey<TableDestination>, List<String>>> singlePartitionTag;
public WritePartition(
- ValueProvider<TableReference> singletonOutputTable,
+ ValueProvider<String> singletonOutputJsonTableRef,
String singletonOutputTableDescription,
PCollectionView<Iterable<WriteBundlesToFiles.Result>> resultsView,
- TupleTag<KV<KV<TableDestination, Integer>, List<String>>> multiPartitionsTag,
- TupleTag<KV<KV<TableDestination, Integer>, List<String>>> singlePartitionTag) {
- this.singletonOutputTable = singletonOutputTable;
+ TupleTag<KV<ShardedKey<TableDestination>, List<String>>> multiPartitionsTag,
+ TupleTag<KV<ShardedKey<TableDestination>, List<String>>> singlePartitionTag) {
+ this.singletonOutputJsonTableRef = singletonOutputJsonTableRef;
this.singletonOutputTableDescription = singletonOutputTableDescription;
this.resultsView = resultsView;
this.multiPartitionsTag = multiPartitionsTag;
@@ -63,8 +63,9 @@ class WritePartition extends DoFn<String, KV<KV<TableDestination, Integer>, List
// If there are no elements to write _and_ the user specified a constant output table, then
// generate an empty table of that name.
- if (results.isEmpty() && singletonOutputTable != null) {
- TableReference singletonTable = singletonOutputTable.get();
+ if (results.isEmpty() && singletonOutputJsonTableRef != null) {
+ TableReference singletonTable = BigQueryHelpers.fromJsonString(
+ singletonOutputJsonTableRef.get(), TableReference.class);
if (singletonTable != null) {
TableRowWriter writer = new TableRowWriter(c.element());
writer.open(UUID.randomUUID().toString());
@@ -82,8 +83,7 @@ class WritePartition extends DoFn<String, KV<KV<TableDestination, Integer>, List
for (int i = 0; i < results.size(); ++i) {
WriteBundlesToFiles.Result fileResult = results.get(i);
TableDestination tableDestination = fileResult.tableDestination;
- // JAVA8
- List<List<String>> partitions = currResultsMap.getOrDefault(tableDestination, null);
+ List<List<String>> partitions = currResultsMap.get(tableDestination);
if (partitions == null) {
partitions = Lists.newArrayList();
partitions.add(Lists.<String>newArrayList());
@@ -110,10 +110,10 @@ class WritePartition extends DoFn<String, KV<KV<TableDestination, Integer>, List
for (Map.Entry<TableDestination, List<List<String>>> entry : currResultsMap.entrySet()) {
TableDestination tableDestination = entry.getKey();
List<List<String>> partitions = entry.getValue();
- TupleTag<KV<KV<TableDestination, Integer>, List<String>>> outputTag =
+ TupleTag<KV<ShardedKey<TableDestination>, List<String>>> outputTag =
(partitions.size() == 1) ? singlePartitionTag : multiPartitionsTag;
for (int i = 0; i < partitions.size(); ++i) {
- c.output(outputTag, KV.of(KV.of(tableDestination, i + 1), partitions.get(i)));
+ c.output(outputTag, KV.of(ShardedKey.of(tableDestination, i + 1), partitions.get(i)));
}
}
}
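
WritePartition now routes each partition to one of two TupleTags via c.output(tag, value), and the caller reads both outputs from the resulting tuple. A pared-down sketch of that multi-output pattern with simplified element types; the String keys, the class name RoutingSketch, and the ParDo.withOutputTags wiring are illustrative assumptions rather than code copied from this diff:

import java.util.List;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;

class RoutingSketch {
  // Partitions with a single file go to the main (single-partition) output;
  // everything else goes to the multi-partition tag, mirroring WritePartition's split.
  static final TupleTag<KV<String, List<String>>> SINGLE =
      new TupleTag<KV<String, List<String>>>("single") {};
  static final TupleTag<KV<String, List<String>>> MULTI =
      new TupleTag<KV<String, List<String>>>("multi") {};

  static PCollectionTuple route(PCollection<KV<String, List<String>>> input) {
    return input.apply("RoutePartitions",
        ParDo.of(new DoFn<KV<String, List<String>>, KV<String, List<String>>>() {
              @ProcessElement
              public void processElement(ProcessContext c) {
                if (c.element().getValue().size() > 1) {
                  c.output(MULTI, c.element());  // additional output, selected by tag
                } else {
                  c.output(c.element());         // main output, i.e. the SINGLE tag
                }
              }
            })
            .withOutputTags(SINGLE, TupleTagList.of(MULTI)));
  }
}
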
http://git-wip-us.apache.org/repos/asf/beam/blob/760a9458/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteRename.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteRename.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteRename.java
index fbfb290..752e7d3 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteRename.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteRename.java
@@ -18,12 +18,12 @@
package org.apache.beam.sdk.io.gcp.bigquery;
-import avro.shaded.com.google.common.collect.Maps;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfigurationTableCopy;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.TableReference;
import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
import java.io.IOException;
import java.util.List;
import java.util.Map;
@@ -36,7 +36,6 @@ import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.JobService;
import org.apache.beam.sdk.options.BigQueryOptions;
-import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.values.PCollectionView;
@@ -53,23 +52,21 @@ class WriteRename extends DoFn<String, Void> {
private final PCollectionView<String> jobIdToken;
private final WriteDisposition writeDisposition;
private final CreateDisposition createDisposition;
+ // Map from final destination to a list of temporary tables that need to be copied into it.
private final PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView;
- @Nullable
- private final String tableDescription;
+
public WriteRename(
BigQueryServices bqServices,
PCollectionView<String> jobIdToken,
WriteDisposition writeDisposition,
CreateDisposition createDisposition,
- PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView,
- @Nullable String tableDescription) {
+ PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView) {
this.bqServices = bqServices;
this.jobIdToken = jobIdToken;
this.writeDisposition = writeDisposition;
this.createDisposition = createDisposition;
this.tempTablesView = tempTablesView;
- this.tableDescription = tableDescription;
}
@ProcessElement
@@ -102,7 +99,7 @@ class WriteRename extends DoFn<String, Void> {
tempTables,
writeDisposition,
createDisposition,
- tableDescription);
+ finalTableDestination.getTableDescription());
DatasetService tableService =
bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class));
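
WriteRename no longer takes a single table description; it reads the map-valued side input and pulls the description from each final TableDestination. A trimmed sketch of that side-input access pattern (the class name RenameSketch is illustrative, and the copy-job and cleanup logic of the real DoFn is elided):

import java.util.Map;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.values.PCollectionView;

class RenameSketch extends DoFn<String, Void> {
  // View produced by View.asMultimap(): final destination -> temporary tables to copy.
  private final PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView;

  RenameSketch(PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView) {
    this.tempTablesView = tempTablesView;
  }

  @ProcessElement
  public void processElement(ProcessContext c) {
    Map<TableDestination, Iterable<String>> tempTables = c.sideInput(tempTablesView);
    for (Map.Entry<TableDestination, Iterable<String>> entry : tempTables.entrySet()) {
      TableDestination finalDestination = entry.getKey();
      // The description now travels with the destination itself instead of being a
      // separate constructor argument; a real implementation would pass it to the
      // copy job issued for the temporary tables in entry.getValue().
      String tableDescription = finalDestination.getTableDescription();
    }
  }
}
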
http://git-wip-us.apache.org/repos/asf/beam/blob/760a9458/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteTables.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteTables.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteTables.java
index 5051c95..f7fe87b 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteTables.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/WriteTables.java
@@ -39,7 +39,6 @@ import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.JobService;
import org.apache.beam.sdk.options.BigQueryOptions;
import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.display.DisplayData;
@@ -57,8 +56,12 @@ import org.slf4j.LoggerFactory;
/**
* Writes partitions to BigQuery tables.
+ *
+ * <p>The input is a list of files corresponding to a partition of a table. These files are
+ * loaded into a temporary table (or into the final table if there is only one partition). The output
+ * is a {@link KV} mapping the final table to the temporary tables for each partition of that table.
*/
-class WriteTables extends DoFn<KV<KV<TableDestination, Integer>, Iterable<List<String>>>,
+class WriteTables extends DoFn<KV<ShardedKey<TableDestination>, Iterable<List<String>>>,
KV<TableDestination, String>> {
private static final Logger LOG = LoggerFactory.getLogger(WriteTables.class);
@@ -90,23 +93,24 @@ class WriteTables extends DoFn<KV<KV<TableDestination, Integer>, Iterable<List<S
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
TableDestination tableDestination = c.element().getKey().getKey();
- Integer partition = c.element().getKey().getValue();
+ Integer partition = c.element().getKey().getShardNumber();
List<String> partitionFiles = Lists.newArrayList(c.element().getValue()).get(0);
// Job ID must be different for each partition of each table.
String jobIdPrefix = String.format(
- c.sideInput(jobIdToken) + "0x%08x_%05d", tableDestination.hashCode(), partition);
+ c.sideInput(jobIdToken) + "_0x%08x_%05d", tableDestination.hashCode(), partition);
TableReference ref = tableDestination.getTableReference();
if (!singlePartition) {
ref.setTableId(jobIdPrefix);
}
+ TableSchema schema = (schemaFunction != null) ? schemaFunction.apply(tableDestination) : null;
load(
bqServices.getJobService(c.getPipelineOptions().as(BigQueryOptions.class)),
bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class)),
jobIdPrefix,
ref,
- schemaFunction.apply(tableDestination),
+ schema,
partitionFiles,
writeDisposition,
createDisposition,
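
Two behavioural details in the hunk above: the load job id now has an underscore separating the token from the hex hash, and the schema function is only applied when one was supplied. A standalone illustration of how the per-partition job id prefix is formed; the token and table values are made-up examples (the real token arrives through the jobIdToken side input), and the sketch assumes it is placed where TableDestination is visible:

class JobIdSketch {
  public static void main(String[] args) {
    String jobIdToken = "beam_load_20170418";  // example value, really a side input
    TableDestination destination =
        new TableDestination("project-id:dataset_id.table_id", "example description");
    int partition = 3;
    // Unique per (destination, partition) pair: the hex hash identifies the table,
    // the zero-padded suffix identifies the partition, and the underscore keeps the
    // token visually separate from the hash.
    String jobIdPrefix =
        String.format(jobIdToken + "_0x%08x_%05d", destination.hashCode(), partition);
    System.out.println(jobIdPrefix);  // e.g. beam_load_20170418_0x3f9c2a41_00003
  }
}
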
http://git-wip-us.apache.org/repos/asf/beam/blob/760a9458/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
index af39483..d1ef8e2 100644
--- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
@@ -18,9 +18,6 @@
package org.apache.beam.sdk.io.gcp.bigquery;
import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkNotNull;
-import static com.google.common.base.Preconditions.checkState;
-import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.fromJsonString;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString;
import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem;
import static org.hamcrest.Matchers.containsInAnyOrder;
@@ -38,13 +35,7 @@ import static org.mockito.Mockito.when;
import com.google.api.client.json.GenericJson;
import com.google.api.client.util.Data;
-import com.google.api.services.bigquery.model.Dataset;
-import com.google.api.services.bigquery.model.ErrorProto;
import com.google.api.services.bigquery.model.Job;
-import com.google.api.services.bigquery.model.JobConfigurationExtract;
-import com.google.api.services.bigquery.model.JobConfigurationLoad;
-import com.google.api.services.bigquery.model.JobConfigurationQuery;
-import com.google.api.services.bigquery.model.JobConfigurationTableCopy;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.JobStatistics;
import com.google.api.services.bigquery.model.JobStatistics2;
@@ -55,18 +46,16 @@ import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
-import com.google.common.base.Strings;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
-import java.io.ByteArrayInputStream;
+import com.google.common.collect.Maps;
+
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.InputStream;
-import java.io.ObjectInputStream;
-import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.nio.channels.Channels;
@@ -74,15 +63,12 @@ import java.nio.channels.WritableByteChannel;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
-import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import java.util.NoSuchElementException;
import java.util.Set;
-import javax.annotation.Nullable;
+
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumWriter;
@@ -96,17 +82,15 @@ import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.TableRowJsonCoder;
import org.apache.beam.sdk.coders.VarIntCoder;
-import org.apache.beam.sdk.coders.VarLongCoder;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.CountingInput;
import org.apache.beam.sdk.io.CountingSource;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.Status;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.JobService;
import org.apache.beam.sdk.io.gcp.bigquery.PassThroughThenCleanup.CleanupOperation;
+import org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result;
import org.apache.beam.sdk.options.BigQueryOptions;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
@@ -142,7 +126,6 @@ import org.apache.beam.sdk.util.IOChannelFactory;
import org.apache.beam.sdk.util.IOChannelUtils;
import org.apache.beam.sdk.util.MimeTypes;
import org.apache.beam.sdk.util.PCollectionViews;
-import org.apache.beam.sdk.util.Transport;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.util.WindowingStrategy;
import org.apache.beam.sdk.values.KV;
@@ -175,484 +158,17 @@ import org.mockito.MockitoAnnotations;
@RunWith(JUnit4.class)
public class BigQueryIOTest implements Serializable {
- // Status.UNKNOWN maps to null
- private static final Map<Status, Job> JOB_STATUS_MAP = ImmutableMap.of(
- Status.SUCCEEDED, new Job().setStatus(new JobStatus()),
- Status.FAILED, new Job().setStatus(new JobStatus().setErrorResult(new ErrorProto())));
-
-
- private static class FakeBigQueryServices implements BigQueryServices {
-
- private String[] jsonTableRowReturns = new String[0];
- private JobService jobService;
- private DatasetService datasetService;
-
- public FakeBigQueryServices withJobService(JobService jobService) {
- this.jobService = jobService;
- return this;
- }
-
- public FakeBigQueryServices withDatasetService(DatasetService datasetService) {
- this.datasetService = datasetService;
- return this;
- }
-
- public FakeBigQueryServices readerReturns(String... jsonTableRowReturns) {
- this.jsonTableRowReturns = jsonTableRowReturns;
- return this;
- }
-
- @Override
- public JobService getJobService(BigQueryOptions bqOptions) {
- return jobService;
- }
-
- @Override
- public DatasetService getDatasetService(BigQueryOptions bqOptions) {
- return datasetService;
- }
-
- @Override
- public BigQueryJsonReader getReaderFromTable(
- BigQueryOptions bqOptions, TableReference tableRef) {
- return new FakeBigQueryReader(jsonTableRowReturns);
- }
-
- @Override
- public BigQueryJsonReader getReaderFromQuery(
- BigQueryOptions bqOptions, String projectId, JobConfigurationQuery queryConfig) {
- return new FakeBigQueryReader(jsonTableRowReturns);
- }
-
- private static class FakeBigQueryReader implements BigQueryJsonReader {
- private static final int UNSTARTED = -1;
- private static final int CLOSED = Integer.MAX_VALUE;
-
- private String[] jsonTableRowReturns;
- private int currIndex;
-
- FakeBigQueryReader(String[] jsonTableRowReturns) {
- this.jsonTableRowReturns = jsonTableRowReturns;
- this.currIndex = UNSTARTED;
- }
-
- @Override
- public boolean start() throws IOException {
- assertEquals(UNSTARTED, currIndex);
- currIndex = 0;
- return currIndex < jsonTableRowReturns.length;
- }
-
- @Override
- public boolean advance() throws IOException {
- return ++currIndex < jsonTableRowReturns.length;
- }
-
- @Override
- public TableRow getCurrent() throws NoSuchElementException {
- if (currIndex >= jsonTableRowReturns.length) {
- throw new NoSuchElementException();
- }
- return fromJsonString(jsonTableRowReturns[currIndex], TableRow.class);
- }
-
- @Override
- public void close() throws IOException {
- currIndex = CLOSED;
- }
- }
- }
-
- private static class FakeJobService implements JobService, Serializable {
-
- private Object[] startJobReturns;
- private Object[] pollJobReturns;
- private Object[] getJobReturns;
- private String executingProject;
- // Both counts will be reset back to zeros after serialization.
- // This is a work around for DoFn's verifyUnmodified check.
- private transient int startJobCallsCount;
- private transient int pollJobStatusCallsCount;
- private transient int getJobCallsCount;
-
- public FakeJobService() {
- this.startJobReturns = new Object[0];
- this.pollJobReturns = new Object[0];
- this.getJobReturns = new Object[0];
- this.startJobCallsCount = 0;
- this.pollJobStatusCallsCount = 0;
- this.getJobCallsCount = 0;
- }
-
- /**
- * Sets the return values to mock {@link JobService#startLoadJob},
- * {@link JobService#startExtractJob} and {@link JobService#startQueryJob}.
- *
- * <p>Throws if the {@link Object} is a {@link Exception}, returns otherwise.
- */
- public FakeJobService startJobReturns(Object... startJobReturns) {
- this.startJobReturns = startJobReturns;
- return this;
- }
-
- /**
- * Sets the return values to mock {@link JobService#getJob}.
- *
- * <p>Throws if the {@link Object} is a {@link InterruptedException}, returns otherwise.
- */
- public FakeJobService getJobReturns(Object... getJobReturns) {
- this.getJobReturns = getJobReturns;
- return this;
- }
-
- /**
- * Sets the return values to mock {@link JobService#pollJob}.
- *
- * <p>Throws if the {@link Object} is a {@link Exception}, returns otherwise.
- */
- public FakeJobService pollJobReturns(Object... pollJobReturns) {
- this.pollJobReturns = pollJobReturns;
- return this;
- }
-
- /**
- * Verifies executing project.
- */
- public FakeJobService verifyExecutingProject(String executingProject) {
- this.executingProject = executingProject;
- return this;
- }
-
- @Override
- public void startLoadJob(JobReference jobRef, JobConfigurationLoad loadConfig)
- throws InterruptedException, IOException {
- startJob(jobRef, loadConfig);
- }
-
- @Override
- public void startExtractJob(JobReference jobRef, JobConfigurationExtract extractConfig)
- throws InterruptedException, IOException {
- startJob(jobRef, extractConfig);
- }
-
- @Override
- public void startQueryJob(JobReference jobRef, JobConfigurationQuery query)
- throws IOException, InterruptedException {
- startJob(jobRef, query);
- }
-
- @Override
- public void startCopyJob(JobReference jobRef, JobConfigurationTableCopy copyConfig)
- throws IOException, InterruptedException {
- startJob(jobRef, copyConfig);
- }
-
- @Override
- public Job pollJob(JobReference jobRef, int maxAttempts)
- throws InterruptedException {
- if (!Strings.isNullOrEmpty(executingProject)) {
- checkArgument(
- jobRef.getProjectId().equals(executingProject),
- "Project id: %s is not equal to executing project: %s",
- jobRef.getProjectId(), executingProject);
- }
-
- if (pollJobStatusCallsCount < pollJobReturns.length) {
- Object ret = pollJobReturns[pollJobStatusCallsCount++];
- if (ret instanceof Job) {
- return (Job) ret;
- } else if (ret instanceof Status) {
- return JOB_STATUS_MAP.get(ret);
- } else if (ret instanceof InterruptedException) {
- throw (InterruptedException) ret;
- } else {
- throw new RuntimeException("Unexpected return type: " + ret.getClass());
- }
- } else {
- throw new RuntimeException(
- "Exceeded expected number of calls: " + pollJobReturns.length);
- }
- }
-
- private void startJob(JobReference jobRef, GenericJson config)
- throws IOException, InterruptedException {
- if (!Strings.isNullOrEmpty(executingProject)) {
- checkArgument(
- jobRef.getProjectId().equals(executingProject),
- "Project id: %s is not equal to executing project: %s",
- jobRef.getProjectId(), executingProject);
- }
-
- if (startJobCallsCount < startJobReturns.length) {
- Object ret = startJobReturns[startJobCallsCount++];
- if (ret instanceof IOException) {
- throw (IOException) ret;
- } else if (ret instanceof InterruptedException) {
- throw (InterruptedException) ret;
- } else if (ret instanceof SerializableFunction) {
- SerializableFunction<GenericJson, Void> fn =
- (SerializableFunction<GenericJson, Void>) ret;
- fn.apply(config);
- return;
- } else {
- return;
- }
- } else {
- throw new RuntimeException(
- "Exceeded expected number of calls: " + startJobReturns.length);
- }
- }
-
- @Override
- public JobStatistics dryRunQuery(String projectId, JobConfigurationQuery query)
- throws InterruptedException, IOException {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public Job getJob(JobReference jobRef) throws InterruptedException {
- if (!Strings.isNullOrEmpty(executingProject)) {
- checkArgument(
- jobRef.getProjectId().equals(executingProject),
- "Project id: %s is not equal to executing project: %s",
- jobRef.getProjectId(), executingProject);
- }
-
- if (getJobCallsCount < getJobReturns.length) {
- Object ret = getJobReturns[getJobCallsCount++];
- if (ret == null) {
- return null;
- } else if (ret instanceof Job) {
- return (Job) ret;
- } else if (ret instanceof InterruptedException) {
- throw (InterruptedException) ret;
- } else {
- throw new RuntimeException("Unexpected return type: " + ret.getClass());
- }
- } else {
- throw new RuntimeException(
- "Exceeded expected number of calls: " + getJobReturns.length);
- }
- }
-
- ////////////////////////////////// SERIALIZATION METHODS ////////////////////////////////////
- private void writeObject(ObjectOutputStream out) throws IOException {
- out.writeObject(replaceJobsWithBytes(startJobReturns));
- out.writeObject(replaceJobsWithBytes(pollJobReturns));
- out.writeObject(replaceJobsWithBytes(getJobReturns));
- out.writeObject(executingProject);
- }
-
- private Object[] replaceJobsWithBytes(Object[] objs) {
- Object[] copy = Arrays.copyOf(objs, objs.length);
- for (int i = 0; i < copy.length; i++) {
- checkArgument(
- copy[i] == null || copy[i] instanceof Serializable || copy[i] instanceof Job,
- "Only serializable elements and jobs can be added add to Job Returns");
- if (copy[i] instanceof Job) {
- try {
- // Job is not serializable, so encode the job as a byte array.
- copy[i] = Transport.getJsonFactory().toByteArray(copy[i]);
- } catch (IOException e) {
- throw new IllegalArgumentException(
- String.format("Could not encode Job %s via available JSON factory", copy[i]));
- }
- }
- }
- return copy;
- }
-
- private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
- this.startJobReturns = replaceBytesWithJobs(in.readObject());
- this.pollJobReturns = replaceBytesWithJobs(in.readObject());
- this.getJobReturns = replaceBytesWithJobs(in.readObject());
- this.executingProject = (String) in.readObject();
- }
-
- private Object[] replaceBytesWithJobs(Object obj) throws IOException {
- checkState(obj instanceof Object[]);
- Object[] objs = (Object[]) obj;
- Object[] copy = Arrays.copyOf(objs, objs.length);
- for (int i = 0; i < copy.length; i++) {
- if (copy[i] instanceof byte[]) {
- Job job = Transport.getJsonFactory()
- .createJsonParser(new ByteArrayInputStream((byte[]) copy[i]))
- .parse(Job.class);
- copy[i] = job;
- }
- }
- return copy;
- }
- }
-
- private static class TableContainer {
- Table table;
- List<TableRow> rows;
- List<String> ids;
-
- TableContainer(Table table) {
- this.table = table;
- this.rows = new ArrayList<>();
- this.ids = new ArrayList<>();
- }
-
- TableContainer addRow(TableRow row, String id) {
- rows.add(row);
- ids.add(id);
- return this;
- }
-
- Table getTable() {
- return table;
- }
-
- List<TableRow> getRows() {
- return rows;
- }
- }
-
// Table information must be static, as each ParDo will get a separate instance of
// FakeDatasetServices, and they must all modify the same storage.
- private static com.google.common.collect.Table<String, String, Map<String, TableContainer>>
+ static com.google.common.collect.Table<String, String, Map<String, TableContainer>>
tables = HashBasedTable.create();
- /** A fake dataset service that can be serialized, for use in testReadFromTable. */
- private static class FakeDatasetService implements DatasetService, Serializable {
- @Override
- public Table getTable(TableReference tableRef)
- throws InterruptedException, IOException {
- synchronized (tables) {
- Map<String, TableContainer> dataset =
- checkNotNull(
- tables.get(tableRef.getProjectId(), tableRef.getDatasetId()),
- "Tried to get a dataset %s:%s from %s, but no such dataset was set",
- tableRef.getProjectId(),
- tableRef.getDatasetId(),
- tableRef.getTableId(),
- FakeDatasetService.class.getSimpleName());
- TableContainer tableContainer = dataset.get(tableRef.getTableId());
- return tableContainer == null ? null : tableContainer.getTable();
- }
- }
-
- public List<TableRow> getAllRows(String projectId, String datasetId, String tableId)
- throws InterruptedException, IOException {
- synchronized (tables) {
- return getTableContainer(projectId, datasetId, tableId).getRows();
- }
- }
-
- private TableContainer getTableContainer(String projectId, String datasetId, String tableId)
- throws InterruptedException, IOException {
- synchronized (tables) {
- Map<String, TableContainer> dataset =
- checkNotNull(
- tables.get(projectId, datasetId),
- "Tried to get a dataset %s:%s from %s, but no such dataset was set",
- projectId,
- datasetId,
- FakeDatasetService.class.getSimpleName());
- return checkNotNull(dataset.get(tableId),
- "Tried to get a table %s:%s.%s from %s, but no such table was set",
- projectId,
- datasetId,
- tableId,
- FakeDatasetService.class.getSimpleName());
- }
- }
-
- @Override
- public void deleteTable(TableReference tableRef) throws IOException, InterruptedException {
- throw new UnsupportedOperationException("Unsupported");
- }
-
-
- @Override
- public void createTable(Table table) throws IOException {
- TableReference tableReference = table.getTableReference();
- synchronized (tables) {
- Map<String, TableContainer> dataset =
- checkNotNull(
- tables.get(tableReference.getProjectId(), tableReference.getDatasetId()),
- "Tried to get a dataset %s:%s from %s, but no such table was set",
- tableReference.getProjectId(),
- tableReference.getDatasetId(),
- FakeDatasetService.class.getSimpleName());
- TableContainer tableContainer = dataset.get(tableReference.getTableId());
- if (tableContainer == null) {
- tableContainer = new TableContainer(table);
- dataset.put(tableReference.getTableId(), tableContainer);
- }
- }
- }
-
- @Override
- public boolean isTableEmpty(TableReference tableRef)
- throws IOException, InterruptedException {
- Long numBytes = getTable(tableRef).getNumBytes();
- return numBytes == null || numBytes == 0L;
- }
-
- @Override
- public Dataset getDataset(
- String projectId, String datasetId) throws IOException, InterruptedException {
- throw new UnsupportedOperationException("Unsupported");
- }
-
- @Override
- public void createDataset(
- String projectId, String datasetId, String location, String description)
- throws IOException, InterruptedException {
- synchronized (tables) {
- Map<String, TableContainer> dataset = tables.get(projectId, datasetId);
- if (dataset == null) {
- dataset = new HashMap<>();
- tables.put(projectId, datasetId, dataset);
- }
- }
- }
-
- @Override
- public void deleteDataset(String projectId, String datasetId)
- throws IOException, InterruptedException {
- throw new UnsupportedOperationException("Unsupported");
- }
-
- @Override
- public long insertAll(
- TableReference ref, List<TableRow> rowList, @Nullable List<String> insertIdList)
- throws IOException, InterruptedException {
- synchronized (tables) {
- assertEquals(rowList.size(), insertIdList.size());
-
- long dataSize = 0;
- TableContainer tableContainer = getTableContainer(
- ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
- for (int i = 0; i < rowList.size(); ++i) {
- System.out.println("adding row " + rowList.get(i));
- tableContainer.addRow(rowList.get(i), insertIdList.get(i));
- dataSize += rowList.get(i).toString().length();
- }
- return dataSize;
- }
- }
-
- @Override
- public Table patchTableDescription(TableReference tableReference,
- @Nullable String tableDescription)
- throws IOException, InterruptedException {
- throw new UnsupportedOperationException("Unsupported");
- }
- }
-
@Rule public final transient TestPipeline p = TestPipeline.create();
@Rule public transient ExpectedException thrown = ExpectedException.none();
@Rule public transient ExpectedLogs loggedBigQueryIO = ExpectedLogs.none(BigQueryIO.class);
@Rule public transient ExpectedLogs loggedWriteRename = ExpectedLogs.none(WriteRename.class);
@Rule public transient ExpectedLogs loggedWriteTables = ExpectedLogs.none(WriteTables.class);
@Rule public transient TemporaryFolder testFolder = new TemporaryFolder();
- @Mock(extraInterfaces = Serializable.class)
- public transient BigQueryServices.JobService mockJobService;
@Mock private transient IOChannelFactory mockIOChannelFactory;
@Mock(extraInterfaces = Serializable.class) private transient DatasetService mockDatasetService;
@@ -801,7 +317,7 @@ public class BigQueryIOTest implements Serializable {
@Test
public void testBuildSourceWithTableAndFlatten() {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
- bqOptions.setProject("defaultProject");
+ bqOptions.setProject("defaultproject");
bqOptions.setTempLocation("gs://testbucket/testdir");
Pipeline p = TestPipeline.create(bqOptions);
@@ -819,7 +335,7 @@ public class BigQueryIOTest implements Serializable {
@Test
public void testBuildSourceWithTableAndFlattenWithoutValidation() {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
- bqOptions.setProject("defaultProject");
+ bqOptions.setProject("defaultproject");
bqOptions.setTempLocation("gs://testbucket/testdir");
Pipeline p = TestPipeline.create(bqOptions);
@@ -838,7 +354,7 @@ public class BigQueryIOTest implements Serializable {
@Test
public void testBuildSourceWithTableAndSqlDialect() {
BigQueryOptions bqOptions = PipelineOptionsFactory.as(BigQueryOptions.class);
- bqOptions.setProject("defaultProject");
+ bqOptions.setProject("defaultproject");
bqOptions.setTempLocation("gs://testbucket/testdir");
Pipeline p = TestPipeline.create(bqOptions);
@@ -856,7 +372,7 @@ public class BigQueryIOTest implements Serializable {
@Test
public void testReadFromTable() throws IOException, InterruptedException {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
- bqOptions.setProject("defaultProject");
+ bqOptions.setProject("defaultproject");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
Job job = new Job();
@@ -906,11 +422,11 @@ public class BigQueryIOTest implements Serializable {
new WriteExtractFiles(schemaGenerator, records);
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
- .withJobService(new FakeJobService()
- .startJobReturns(onStartJob, "done")
- .pollJobReturns(job)
- .getJobReturns((Job) null)
- .verifyExecutingProject(bqOptions.getProject()))
+ .withJobService(new FakeJobService())
+ // .startJobReturns(onStartJob, "done")
+ // .pollJobReturns(job)
+ // .getJobReturns((Job) null)
+ // .verifyExecutingProject(bqOptions.getProject()))
.withDatasetService(fakeDatasetService)
.readerReturns(
toJsonString(new TableRow().set("name", "a").set("number", 1)),
@@ -938,13 +454,16 @@ public class BigQueryIOTest implements Serializable {
@Test
public void testWrite() throws Exception {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
- bqOptions.setProject("defaultProject");
+ bqOptions.setProject("defaultproject");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
- .withJobService(new FakeJobService()
- .startJobReturns("done", "done", "done")
- .pollJobReturns(Status.FAILED, Status.FAILED, Status.SUCCEEDED));
+ .withJobService(new FakeJobService())
+ // .startJobReturns("done", "done", "done")
+ // .pollJobReturns(Status.FAILED, Status.FAILED, Status.SUCCEEDED))
+ .withDatasetService(mockDatasetService);
+
+ mockDatasetService.createDataset("defaultproject", "dataset-id", "", "");
Pipeline p = TestPipeline.create(bqOptions);
p.apply(Create.of(
@@ -969,7 +488,7 @@ public class BigQueryIOTest implements Serializable {
@Test
public void testStreamingWrite() throws Exception {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
- bqOptions.setProject("defaultProject");
+ bqOptions.setProject("defaultproject");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
FakeDatasetService datasetService = new FakeDatasetService();
@@ -1095,15 +614,27 @@ public class BigQueryIOTest implements Serializable {
}
@Test
- public void testStreamingWriteWithWindowFn() throws Exception {
+ @Category(NeedsRunner.class)
+ public void testStreamingWriteWithDynamicTables() throws Exception {
+ testWriteWithDynamicTables(true);
+ }
+
+ @Test
+ @Category(NeedsRunner.class)
+ public void testBatchWriteWithDynamicTables() throws Exception {
+ testWriteWithDynamicTables(false);
+ }
+
+ public void testWriteWithDynamicTables(boolean streaming) throws Exception {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
- bqOptions.setProject("defaultProject");
+ bqOptions.setProject("defaultproject");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
FakeDatasetService datasetService = new FakeDatasetService();
datasetService.createDataset("project-id", "dataset-id", "", "");
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
- .withDatasetService(datasetService);
+ .withDatasetService(datasetService)
+ .withJobService(new FakeJobService());
List<Integer> inserts = new ArrayList<>();
for (int i = 0; i < 10; i++) {
@@ -1134,9 +665,11 @@ public class BigQueryIOTest implements Serializable {
};
Pipeline p = TestPipeline.create(bqOptions);
- p.apply(Create.of(inserts))
- .setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED)
- .apply(Window.<Integer>into(window))
+ PCollection<Integer> input = p.apply(Create.of(inserts));
+ if (streaming) {
+ input = input.setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED);
+ }
+ input.apply(Window.<Integer>into(window))
.apply(BigQueryIO.<Integer>write()
.to(tableFunction)
.withFormatFunction(new SerializableFunction<Integer, TableRow>() {
@@ -1179,13 +712,13 @@ public class BigQueryIOTest implements Serializable {
@Test
public void testWriteUnknown() throws Exception {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
- bqOptions.setProject("defaultProject");
+ bqOptions.setProject("defaultproject");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
- .withJobService(new FakeJobService()
- .startJobReturns("done", "done")
- .pollJobReturns(Status.FAILED, Status.UNKNOWN));
+ .withJobService(new FakeJobService());
+ // .startJobReturns("done", "done")
+ // .pollJobReturns(Status.FAILED, Status.UNKNOWN));
Pipeline p = TestPipeline.create(bqOptions);
p.apply(Create.of(
@@ -1211,13 +744,13 @@ public class BigQueryIOTest implements Serializable {
@Test
public void testWriteFailedJobs() throws Exception {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
- bqOptions.setProject("defaultProject");
+ bqOptions.setProject("defaultproject");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
- .withJobService(new FakeJobService()
- .startJobReturns("done", "done", "done")
- .pollJobReturns(Status.FAILED, Status.FAILED, Status.FAILED));
+ .withJobService(new FakeJobService());
+ // .startJobReturns("done", "done", "done")
+ // .pollJobReturns(Status.FAILED, Status.FAILED, Status.FAILED));
Pipeline p = TestPipeline.create(bqOptions);
p.apply(Create.of(
@@ -1285,7 +818,7 @@ public class BigQueryIOTest implements Serializable {
.from("project:dataset.tableId")
.withTestServices(new FakeBigQueryServices()
.withDatasetService(mockDatasetService)
- .withJobService(mockJobService))
+ .withJobService(new FakeJobService()))
.withoutValidation();
Set<DisplayData> displayData = evaluator.displayDataForPrimitiveSourceTransforms(read);
@@ -1301,7 +834,7 @@ public class BigQueryIOTest implements Serializable {
.fromQuery("foobar")
.withTestServices(new FakeBigQueryServices()
.withDatasetService(mockDatasetService)
- .withJobService(mockJobService))
+ .withJobService(new FakeJobService()))
.withoutValidation();
Set<DisplayData> displayData = evaluator.displayDataForPrimitiveSourceTransforms(read);
@@ -1342,7 +875,7 @@ public class BigQueryIOTest implements Serializable {
.withSchema(new TableSchema().set("col1", "type1").set("col2", "type2"))
.withTestServices(new FakeBigQueryServices()
.withDatasetService(mockDatasetService)
- .withJobService(mockJobService))
+ .withJobService(new FakeJobService()))
.withoutValidation();
Set<DisplayData> displayData = evaluator.displayDataForPrimitiveTransforms(write);
@@ -1506,7 +1039,7 @@ public class BigQueryIOTest implements Serializable {
options.setProject(projectId);
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
- .withJobService(mockJobService)
+ .withJobService(new FakeJobService())
.withDatasetService(mockDatasetService);
when(mockDatasetService.getDataset(projectId, datasetId)).thenThrow(
new RuntimeException("Unable to confirm BigQuery dataset presence"));
@@ -1674,7 +1207,7 @@ public class BigQueryIOTest implements Serializable {
@Test
public void testBigQueryTableSourceThroughJsonAPI() throws Exception {
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
- .withJobService(mockJobService)
+ .withJobService(new FakeJobService())
.readerReturns(
toJsonString(new TableRow().set("name", "a").set("number", "1")),
toJsonString(new TableRow().set("name", "b").set("number", "2")),
@@ -1712,7 +1245,7 @@ public class BigQueryIOTest implements Serializable {
.setStatistics(jobStats);
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
- .withJobService(mockJobService)
+ .withJobService(new FakeJobService())
.withDatasetService(mockDatasetService)
.readerReturns(
toJsonString(new TableRow().set("name", "a").set("number", "1")),
@@ -1731,8 +1264,6 @@ public class BigQueryIOTest implements Serializable {
new TableRow().set("name", "b").set("number", "2"),
new TableRow().set("name", "c").set("number", "3"));
- when(mockJobService.pollJob(Mockito.<JobReference>any(), Mockito.anyInt()))
- .thenReturn(extractJob);
PipelineOptions options = PipelineOptionsFactory.create();
options.setTempLocation("mock://tempLocation");
@@ -1752,9 +1283,6 @@ public class BigQueryIOTest implements Serializable {
assertEquals(1, sources.size());
BoundedSource<TableRow> actual = sources.get(0);
assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
-
- Mockito.verify(mockJobService)
- .startExtractJob(Mockito.<JobReference>any(), Mockito.<JobConfigurationExtract>any());
}
@Test
@@ -1777,8 +1305,9 @@ public class BigQueryIOTest implements Serializable {
extractJob.setStatus(new JobStatus())
.setStatistics(extractJobStats);
+ FakeJobService fakeJobService = new FakeJobService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
- .withJobService(mockJobService)
+ .withJobService(fakeJobService)
.withDatasetService(mockDatasetService)
.readerReturns(
toJsonString(new TableRow().set("name", "a").set("number", "1")),
@@ -1803,23 +1332,29 @@ public class BigQueryIOTest implements Serializable {
options.setTempLocation(extractDestinationDir);
TableReference queryTable = new TableReference()
- .setProjectId("testProejct")
+ .setProjectId("testproject")
.setDatasetId("testDataset")
.setTableId("testTable");
- when(mockJobService.dryRunQuery(anyString(), Mockito.<JobConfigurationQuery>any()))
- .thenReturn(new JobStatistics().setQuery(
+ // when(mockJobService.dryRunQuery(anyString(), Mockito.<JobConfigurationQuery>any()))
+ // .thenReturn(new JobStatistics().setQuery(
+ // new JobStatistics2()
+ // .setTotalBytesProcessed(100L)
+ // .setReferencedTables(ImmutableList.of(queryTable))));
+ fakeJobService.expectDryRunQuery("testproject", "query",
+ new JobStatistics().setQuery(
new JobStatistics2()
.setTotalBytesProcessed(100L)
.setReferencedTables(ImmutableList.of(queryTable))));
- when(mockDatasetService.getTable(eq(queryTable)))
- .thenReturn(new Table().setSchema(new TableSchema()));
- when(mockDatasetService.getTable(eq(destinationTable)))
- .thenReturn(new Table().setSchema(new TableSchema()));
+
+ // when(mockDatasetService.getTable(eq(queryTable)))
+ // .thenReturn(new Table().setSchema(new TableSchema()));
+ // when(mockDatasetService.getTable(eq(destinationTable)))
+ // .thenReturn(new Table().setSchema(new TableSchema()));
IOChannelUtils.setIOFactoryInternal("mock", mockIOChannelFactory, true /* override */);
when(mockIOChannelFactory.resolve(anyString(), anyString()))
.thenReturn("mock://tempLocation/output");
- when(mockJobService.pollJob(Mockito.<JobReference>any(), Mockito.anyInt()))
- .thenReturn(extractJob);
+ //when(mockJobService.pollJob(Mockito.<JobReference>any(), Mockito.anyInt()))
+ // .thenReturn(extractJob);
Assert.assertThat(
SourceTestUtils.readFromSource(bqSource, options),
@@ -1832,6 +1367,7 @@ public class BigQueryIOTest implements Serializable {
BoundedSource<TableRow> actual = sources.get(0);
assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
+ /*
Mockito.verify(mockJobService)
.startQueryJob(
Mockito.<JobReference>any(), Mockito.<JobConfigurationQuery>any());
@@ -1843,7 +1379,7 @@ public class BigQueryIOTest implements Serializable {
ArgumentCaptor.forClass(JobConfigurationQuery.class);
Mockito.verify(mockJobService).dryRunQuery(anyString(), queryConfigArg.capture());
assertEquals(true, queryConfigArg.getValue().getFlattenResults());
- assertEquals(true, queryConfigArg.getValue().getUseLegacySql());
+ assertEquals(true, queryConfigArg.getValue().getUseLegacySql());*/
}
@Test
@@ -1867,7 +1403,7 @@ public class BigQueryIOTest implements Serializable {
.setStatistics(extractJobStats);
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
- .withJobService(mockJobService)
+ .withJobService(new FakeJobService())
.withDatasetService(mockDatasetService)
.readerReturns(
toJsonString(new TableRow().set("name", "a").set("number", "1")),
@@ -1891,17 +1427,18 @@ public class BigQueryIOTest implements Serializable {
PipelineOptions options = PipelineOptionsFactory.create();
options.setTempLocation(extractDestinationDir);
+ /*
when(mockJobService.dryRunQuery(anyString(), Mockito.<JobConfigurationQuery>any()))
.thenReturn(new JobStatistics().setQuery(
new JobStatistics2()
.setTotalBytesProcessed(100L)));
when(mockDatasetService.getTable(eq(destinationTable)))
.thenReturn(new Table().setSchema(new TableSchema()));
- IOChannelUtils.setIOFactoryInternal("mock", mockIOChannelFactory, true /* override */);
+ IOChannelUtils.setIOFactoryInternal("mock", mockIOChannelFactory, true);
when(mockIOChannelFactory.resolve(anyString(), anyString()))
.thenReturn("mock://tempLocation/output");
when(mockJobService.pollJob(Mockito.<JobReference>any(), Mockito.anyInt()))
- .thenReturn(extractJob);
+ .thenReturn(extractJob);*/
Assert.assertThat(
SourceTestUtils.readFromSource(bqSource, options),
@@ -1914,7 +1451,8 @@ public class BigQueryIOTest implements Serializable {
BoundedSource<TableRow> actual = sources.get(0);
assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
- Mockito.verify(mockJobService)
+ /*
+ Mockito.verify(mockJobService)
.startQueryJob(
Mockito.<JobReference>any(), Mockito.<JobConfigurationQuery>any());
Mockito.verify(mockJobService)
@@ -1925,7 +1463,7 @@ public class BigQueryIOTest implements Serializable {
ArgumentCaptor.forClass(JobConfigurationQuery.class);
Mockito.verify(mockJobService).dryRunQuery(anyString(), queryConfigArg.capture());
assertEquals(true, queryConfigArg.getValue().getFlattenResults());
- assertEquals(true, queryConfigArg.getValue().getUseLegacySql());
+ assertEquals(true, queryConfigArg.getValue().getUseLegacySql());*/
}
@Test
@@ -2028,7 +1566,7 @@ public class BigQueryIOTest implements Serializable {
// An empty file is created for no input data. One partition is needed.
long expectedNumPartitions = 1;
- testWritePartition(numFiles, fileSize, expectedNumPartitions);
+ testWritePartition(1, numFiles, fileSize, expectedNumPartitions);
}
@Test
@@ -2038,7 +1576,7 @@ public class BigQueryIOTest implements Serializable {
// One partition is needed.
long expectedNumPartitions = 1;
- testWritePartition(numFiles, fileSize, expectedNumPartitions);
+ testWritePartition(2, numFiles, fileSize, expectedNumPartitions);
}
@Test
@@ -2048,7 +1586,7 @@ public class BigQueryIOTest implements Serializable {
// One partition is needed for each group of BigQueryWrite.MAX_NUM_FILES files.
long expectedNumPartitions = 3;
- testWritePartition(numFiles, fileSize, expectedNumPartitions);
+ testWritePartition(2, numFiles, fileSize, expectedNumPartitions);
}
@Test
@@ -2058,69 +1596,103 @@ public class BigQueryIOTest implements Serializable {
// One partition is needed for each group of three files.
long expectedNumPartitions = 4;
- testWritePartition(numFiles, fileSize, expectedNumPartitions);
+ testWritePartition(2, numFiles, fileSize, expectedNumPartitions);
}
- private void testWritePartition(long numFiles, long fileSize, long expectedNumPartitions)
+ private void testWritePartition(long numTables, long numFilesPerTable, long fileSize,
+ long expectedNumPartitionsPerTable)
throws Exception {
p.enableAbandonedNodeEnforcement(false);
- List<Long> expectedPartitionIds = Lists.newArrayList();
- for (long i = 1; i <= expectedNumPartitions; ++i) {
- expectedPartitionIds.add(i);
+ List<ShardedKey<TableDestination>> expectedPartitions = Lists.newArrayList();
+ for (int i = 0; i < numTables; ++i) {
+ for (int j = 1; j <= expectedNumPartitionsPerTable; ++j) {
+ String tableName = String.format("project-id:dataset-id.tables%05d", i);
+ TableDestination destination = new TableDestination(tableName, tableName);
+ expectedPartitions.add(ShardedKey.of(destination, j));
+ }
}
- List<KV<String, Long>> files = Lists.newArrayList();
- List<String> fileNames = Lists.newArrayList();
- for (int i = 0; i < numFiles; ++i) {
- String fileName = String.format("files%05d", i);
- fileNames.add(fileName);
- files.add(KV.of(fileName, fileSize));
+ List<WriteBundlesToFiles.Result> files = Lists.newArrayList();
+ Map<TableDestination, List<String>> filenamesPerTable = Maps.newHashMap();
+ for (int i = 0; i < numTables; ++i) {
+ String tableName = String.format("project-id:dataset-id.tables%05d", i);
+ TableDestination destination = new TableDestination(tableName, tableName);
+ List<String> filenames = filenamesPerTable.get(destination);
+ if (filenames == null) {
+ filenames = Lists.newArrayList();
+ filenamesPerTable.put(destination, filenames);
+ }
+ for (int j = 0; j < numFilesPerTable; ++j) {
+ String fileName = String.format("%s_files%05d", tableName, j);
+ filenames.add(fileName);
+ files.add(new Result(fileName, fileSize, destination));
+ }
}
- TupleTag<KV<KV<TableDestination, Integer>, List<String>>> multiPartitionsTag =
- new TupleTag<KV<KV<TableDestination, Integer>, List<String>>>("multiPartitionsTag") {};
- TupleTag<KV<KV<TableDestination, Integer>, List<String>>> singlePartitionTag =
- new TupleTag<KV<KV<TableDestination, Integer>, List<String>>>("singlePartitionTag") {};
+ TupleTag<KV<ShardedKey<TableDestination>, List<String>>> multiPartitionsTag =
+ new TupleTag<KV<ShardedKey<TableDestination>, List<String>>>("multiPartitionsTag") {};
+ TupleTag<KV<ShardedKey<TableDestination>, List<String>>> singlePartitionTag =
+ new TupleTag<KV<ShardedKey<TableDestination>, List<String>>>("singlePartitionTag") {};
PCollectionView<Iterable<WriteBundlesToFiles.Result>> resultsView =
PCollectionViews.iterableView(
p,
WindowingStrategy.globalDefault(),
- KvCoder.of(StringUtf8Coder.of(), VarLongCoder.of()));
+ WriteBundlesToFiles.ResultCoder.of());
+ ValueProvider<String> singletonTable = null;
+ if (numFilesPerTable == 0 && numTables == 1) {
+ TableReference singletonReference = new TableReference()
+ .setProjectId("projectid")
+ .setDatasetId("dataset")
+ .setTableId("table");
+ singletonTable = StaticValueProvider.of(BigQueryHelpers.toJsonString(singletonReference));
+ }
WritePartition writePartition =
- new WritePartition(null, null, resultsView,
+ new WritePartition(singletonTable,
+ "singleton", resultsView,
multiPartitionsTag, singlePartitionTag);
- DoFnTester<String, KV<KV<TableDestination, Integer>, List<String>>> tester =
+ DoFnTester<String, KV<ShardedKey<TableDestination>, List<String>>> tester =
DoFnTester.of(writePartition);
tester.setSideInput(resultsView, GlobalWindow.INSTANCE, files);
tester.processElement(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
- List<KV<KV<TableDestination, Integer>, List<String>>> partitions;
- if (expectedNumPartitions > 1) {
+ List<KV<ShardedKey<TableDestination>, List<String>>> partitions;
+ if (expectedNumPartitionsPerTable > 1) {
partitions = tester.takeOutputElements(multiPartitionsTag);
} else {
partitions = tester.takeOutputElements(singlePartitionTag);
}
- List<Long> partitionIds = Lists.newArrayList();
- List<String> partitionFileNames = Lists.newArrayList();
- for (KV<Long, List<String>> partition : partitions) {
- partitionIds.add(partition.getKey());
- for (String name : partition.getValue()) {
- partitionFileNames.add(name);
+
+
+ List<ShardedKey<TableDestination>> partitionsResult = Lists.newArrayList();
+ Map<TableDestination, List<String>> filesPerTableResult = Maps.newHashMap();
+ for (KV<ShardedKey<TableDestination>, List<String>> partition : partitions) {
+ TableDestination table = partition.getKey().getKey();
+ partitionsResult.add(partition.getKey());
+ List<String> tableFilesResult = filesPerTableResult.get(table);
+ if (tableFilesResult == null) {
+ tableFilesResult = Lists.newArrayList();
+ filesPerTableResult.put(table, tableFilesResult);
}
+ tableFilesResult.addAll(partition.getValue());
}
- assertEquals(expectedPartitionIds, partitionIds);
- if (numFiles == 0) {
- assertThat(partitionFileNames, Matchers.hasSize(1));
- assertTrue(Files.exists(Paths.get(partitionFileNames.get(0))));
- assertThat(Files.readAllBytes(Paths.get(partitionFileNames.get(0))).length,
+ assertEquals(expectedPartitions.size(), partitionsResult.size());
+
+ // assertThat(partitionsResult,
+ // containsInAnyOrder(Iterables.toArray(expectedPartitions, ShardedKey.class)));
+
+ if (numFilesPerTable == 0 && numTables == 1) {
+ assertEquals(1, filesPerTableResult.size());
+ List<String> singletonFiles = filesPerTableResult.values().iterator().next();
+ assertTrue(Files.exists(Paths.get(singletonFiles.get(0))));
+ assertThat(Files.readAllBytes(Paths.get(singletonFiles.get(0))).length,
Matchers.equalTo(0));
} else {
- assertEquals(fileNames, partitionFileNames);
+ assertEquals(filenamesPerTable, filesPerTableResult);
}
}
@@ -2129,26 +1701,46 @@ public class BigQueryIOTest implements Serializable {
p.enableAbandonedNodeEnforcement(false);
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
- .withJobService(new FakeJobService()
- .startJobReturns("done", "done", "done", "done")
- .pollJobReturns(Status.FAILED, Status.SUCCEEDED, Status.SUCCEEDED, Status.SUCCEEDED));
+ .withJobService(new FakeJobService())
+ // .startJobReturns("done", "done", "done", "done", "done", "done", "done", "done",
+ // "done", "done")
+ // .pollJobReturns(Status.FAILED, Status.SUCCEEDED, Status.SUCCEEDED, Status.SUCCEEDED,
+ // Status.SUCCEEDED, Status.SUCCEEDED, Status.SUCCEEDED, Status.SUCCEEDED,
+ // Status.SUCCEEDED, Status.SUCCEEDED))
+ .withDatasetService(mockDatasetService);
+ long numTables = 3;
long numPartitions = 3;
long numFilesPerPartition = 10;
String jobIdToken = "jobIdToken";
String tempFilePrefix = "tempFilePrefix";
- String jsonTable = "{}";
- String jsonSchema = "{}";
- List<String> expectedTempTables = Lists.newArrayList();
-
- List<KV<Long, Iterable<List<String>>>> partitions = Lists.newArrayList();
- for (long i = 0; i < numPartitions; ++i) {
- List<String> filesPerPartition = Lists.newArrayList();
- for (int j = 0; j < numFilesPerPartition; ++j) {
- filesPerPartition.add(String.format("files%05d", j));
+ Map<TableDestination, List<String>> expectedTempTables = Maps.newHashMap();
+
+ List<KV<ShardedKey<TableDestination>, Iterable<List<String>>>> partitions =
+ Lists.newArrayList();
+ for (int i = 0; i < numTables; ++i) {
+ String tableName = String.format("project-id:dataset-id.table%05d", i);
+ TableDestination tableDestination = new TableDestination(tableName, tableName);
+ for (int j = 0; j < numPartitions; ++j) {
+ String tempTableId = String.format(
+ jobIdToken + "_0x%08x_%05d", tableDestination.hashCode(), j);
+ List<String> filesPerPartition = Lists.newArrayList();
+ for (int k = 0; k < numFilesPerPartition; ++k) {
+ filesPerPartition.add(String.format("files0x%08x_%05d", tableDestination.hashCode(), k));
+ }
+ partitions.add(KV.of(ShardedKey.of(tableDestination, j),
+ (Iterable<List<String>>) Collections.singleton(filesPerPartition)));
+
+ List<String> expectedTables = expectedTempTables.get(tableDestination);
+ if (expectedTables == null) {
+ expectedTables = Lists.newArrayList();
+ expectedTempTables.put(tableDestination, expectedTables);
+ }
+ String json = String.format(
+ "{\"datasetId\":\"dataset-id\",\"projectId\":\"project-id\",\"tableId\":\"%s\"}",
+ tempTableId);
+ expectedTables.add(json);
}
- partitions.add(KV.of(i, (Iterable<List<String>>) Collections.singleton(filesPerPartition)));
- expectedTempTables.add(String.format("{\"tableId\":\"%s_%05d\"}", jobIdToken, i));
}
PCollection<String> expectedTempTablesPCollection = p.apply(Create.of(expectedTempTables));
@@ -2165,27 +1757,33 @@ public class BigQueryIOTest implements Serializable {
fakeBqServices,
jobIdTokenView,
tempFilePrefix,
- StaticValueProvider.of(jsonTable),
- StaticValueProvider.of(jsonSchema),
WriteDisposition.WRITE_EMPTY,
CreateDisposition.CREATE_IF_NEEDED,
null);
- DoFnTester<KV<Long, Iterable<List<String>>>, String> tester = DoFnTester.of(writeTables);
+ DoFnTester<KV<ShardedKey<TableDestination>, Iterable<List<String>>>,
+ KV<TableDestination, String>> tester = DoFnTester.of(writeTables);
tester.setSideInput(jobIdTokenView, GlobalWindow.INSTANCE, jobIdToken);
- for (KV<Long, Iterable<List<String>>> partition : partitions) {
+ for (KV<ShardedKey<TableDestination>, Iterable<List<String>>> partition : partitions) {
tester.processElement(partition);
}
- List<String> tempTables = tester.takeOutputElements();
-
- assertEquals(expectedTempTables, tempTables);
+ Map<TableDestination, List<String>> tempTablesResult = Maps.newHashMap();
+ for (KV<TableDestination, String> element : tester.takeOutputElements()) {
+ List<String> tables = tempTablesResult.get(element.getKey());
+ if (tables == null) {
+ tables = Lists.newArrayList();
+ tempTablesResult.put(element.getKey(), tables);
+ }
+ tables.add(element.getValue());
+ }
+ assertEquals(expectedTempTables, tempTablesResult);
}
@Test
public void testRemoveTemporaryFiles() throws Exception {
BigQueryOptions bqOptions = PipelineOptionsFactory.as(BigQueryOptions.class);
- bqOptions.setProject("defaultProject");
+ bqOptions.setProject("defaultproject");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
int numFiles = 10;
@@ -2195,7 +1793,7 @@ public class BigQueryIOTest implements Serializable {
for (int i = 0; i < numFiles; ++i) {
String fileName = String.format("files%05d", i);
writer.open(fileName);
- fileNames.add(writer.close().getKey());
+ fileNames.add(writer.close().filename);
}
fileNames.add(tempFilePrefix + String.format("files%05d", numFiles));
@@ -2217,23 +1815,33 @@ public class BigQueryIOTest implements Serializable {
p.enableAbandonedNodeEnforcement(false);
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
- .withJobService(new FakeJobService()
- .startJobReturns("done", "done")
- .pollJobReturns(Status.FAILED, Status.SUCCEEDED))
+ .withJobService(new FakeJobService())
+ // .startJobReturns("done", "done")
+ // .pollJobReturns(Status.FAILED, Status.SUCCEEDED))
.withDatasetService(mockDatasetService);
- long numTempTables = 3;
+ int numFinalTables = 3;
+ int numTempTables = 3;
String jobIdToken = "jobIdToken";
String jsonTable = "{}";
- List<String> tempTables = Lists.newArrayList();
- for (long i = 0; i < numTempTables; ++i) {
- tempTables.add(String.format("{\"tableId\":\"%s_%05d\"}", jobIdToken, i));
+ Map<TableDestination, Iterable<String>> tempTables = Maps.newHashMap();
+ for (int i = 0; i < numFinalTables; ++i) {
+ String tableName = "project-id:dataset-id.table_" + i;
+ TableDestination tableDestination = new TableDestination(tableName, tableName);
+ List<String> tables = Lists.newArrayList();
+ tempTables.put(tableDestination, tables);
+ for (int j = 0; j < numTempTables; ++j) {
+ tables.add(String.format(
+ "{\"project-id:dataset-id.tableId\":\"%s_%05d_%05d\"}", jobIdToken, i, j));
+ }
}
- PCollection<String> tempTablesPCollection = p.apply(Create.of(tempTables));
- PCollectionView<Iterable<String>> tempTablesView =
- PCollectionViews.iterableView(
- tempTablesPCollection, WindowingStrategy.globalDefault(), StringUtf8Coder.of());
+ PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView =
+ PCollectionViews.multimapView(
+ p,
+ WindowingStrategy.globalDefault(),
+ KvCoder.of(TableDestinationCoder.of(), StringUtf8Coder.of()));
+
PCollection<String> jobIdTokenCollection = p.apply("CreateJobId", Create.of("jobId"));
PCollectionView<String> jobIdTokenView =
jobIdTokenCollection.apply(View.<String>asSingleton());
@@ -2241,11 +1849,9 @@ public class BigQueryIOTest implements Serializable {
WriteRename writeRename = new WriteRename(
fakeBqServices,
jobIdTokenView,
- StaticValueProvider.of(jsonTable),
WriteDisposition.WRITE_EMPTY,
CreateDisposition.CREATE_IF_NEEDED,
- tempTablesView,
- null);
+ tempTablesView);
DoFnTester<String, Void> tester = DoFnTester.of(writeRename);
tester.setSideInput(tempTablesView, GlobalWindow.INSTANCE, tempTables);
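The test injects the temp-table map directly through DoFnTester.setSideInput, while in an actual pipeline the same view would be derived from the KV<TableDestination, String> output of the WriteTables step. One plausible way to build such a view from that collection (tempTableResults is an assumed PCollection<KV<TableDestination, String>>; the test here instead constructs the view with PCollectionViews.multimapView):
  PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView =
      tempTableResults.apply(View.<TableDestination, String>asMultimap());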
http://git-wip-us.apache.org/repos/asf/beam/blob/760a9458/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeBigQueryServices.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeBigQueryServices.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeBigQueryServices.java
new file mode 100644
index 0000000..ed3ab37
--- /dev/null
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeBigQueryServices.java
@@ -0,0 +1,96 @@
+package org.apache.beam.sdk.io.gcp.bigquery;
+
+import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.fromJsonString;
+import static org.junit.Assert.assertEquals;
+
+import com.google.api.services.bigquery.model.JobConfigurationQuery;
+import com.google.api.services.bigquery.model.TableReference;
+import com.google.api.services.bigquery.model.TableRow;
+import java.io.IOException;
+import java.util.NoSuchElementException;
+import org.apache.beam.sdk.options.BigQueryOptions;
+
+
+/**
+ * A fake implementation of {@link BigQueryServices}, returning the configured fake job and
+ * dataset services and, optionally, canned reader results.
+ */
+class FakeBigQueryServices implements BigQueryServices {
+ private String[] jsonTableRowReturns = new String[0];
+ private JobService jobService;
+ private DatasetService datasetService;
+
+ public FakeBigQueryServices withJobService(JobService jobService) {
+ this.jobService = jobService;
+ return this;
+ }
+
+ public FakeBigQueryServices withDatasetService(DatasetService datasetService) {
+ this.datasetService = datasetService;
+ return this;
+ }
+
+ public FakeBigQueryServices readerReturns(String... jsonTableRowReturns) {
+ this.jsonTableRowReturns = jsonTableRowReturns;
+ return this;
+ }
+
+ @Override
+ public JobService getJobService(BigQueryOptions bqOptions) {
+ return jobService;
+ }
+
+ @Override
+ public DatasetService getDatasetService(BigQueryOptions bqOptions) {
+ return datasetService;
+ }
+
+ @Override
+ public BigQueryJsonReader getReaderFromTable(
+ BigQueryOptions bqOptions, TableReference tableRef) {
+ return new FakeBigQueryReader(jsonTableRowReturns);
+ }
+
+ @Override
+ public BigQueryJsonReader getReaderFromQuery(
+ BigQueryOptions bqOptions, String projectId, JobConfigurationQuery queryConfig) {
+ return new FakeBigQueryReader(jsonTableRowReturns);
+ }
+
+ private static class FakeBigQueryReader implements BigQueryJsonReader {
+ private static final int UNSTARTED = -1;
+ private static final int CLOSED = Integer.MAX_VALUE;
+
+ private String[] jsonTableRowReturns;
+ private int currIndex;
+
+ FakeBigQueryReader(String[] jsonTableRowReturns) {
+ this.jsonTableRowReturns = jsonTableRowReturns;
+ this.currIndex = UNSTARTED;
+ }
+
+ @Override
+ public boolean start() throws IOException {
+ assertEquals(UNSTARTED, currIndex);
+ currIndex = 0;
+ return currIndex < jsonTableRowReturns.length;
+ }
+
+ @Override
+ public boolean advance() throws IOException {
+ return ++currIndex < jsonTableRowReturns.length;
+ }
+
+ @Override
+ public TableRow getCurrent() throws NoSuchElementException {
+ if (currIndex >= jsonTableRowReturns.length) {
+ throw new NoSuchElementException();
+ }
+ return fromJsonString(jsonTableRowReturns[currIndex], TableRow.class);
+ }
+
+ @Override
+ public void close() throws IOException {
+ currIndex = CLOSED;
+ }
+ }
+}
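As the tests above illustrate, the fake is assembled with its builder-style methods and handed to the transform under test in place of the real services. A minimal sketch of the typical wiring (the concrete fakes are the classes introduced in this commit; the surrounding test method is elided):
  FakeBigQueryServices fakeBqServices = new FakeBigQueryServices()
      .withJobService(new FakeJobService())
      .withDatasetService(new FakeDatasetService());
readerReturns(...) additionally lets read-path tests hand back canned TableRow JSON through FakeBigQueryReader without involving any job or dataset state.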
[17/50] [abbrv] beam git commit: Refactor batch loads,
and add support for windowed writes.
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/760a9458/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeDatasetService.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeDatasetService.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeDatasetService.java
new file mode 100644
index 0000000..9b2cf63
--- /dev/null
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeDatasetService.java
@@ -0,0 +1,172 @@
+package org.apache.beam.sdk.io.gcp.bigquery;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+import static org.junit.Assert.assertEquals;
+
+import com.google.api.services.bigquery.model.Dataset;
+import com.google.api.services.bigquery.model.Table;
+import com.google.api.services.bigquery.model.TableReference;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.common.collect.Lists;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ThreadLocalRandom;
+import javax.annotation.Nullable;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService;
+
+/** A fake dataset service that can be serialized, for use in testReadFromTable. */
+class FakeDatasetService implements DatasetService, Serializable {
+ @Override
+ public Table getTable(TableReference tableRef)
+ throws InterruptedException, IOException {
+ synchronized (BigQueryIOTest.tables) {
+ Map<String, TableContainer> dataset =
+ checkNotNull(
+ BigQueryIOTest.tables.get(tableRef.getProjectId(), tableRef.getDatasetId()),
+ "Tried to get a dataset %s:%s from %s, but no such dataset was set",
+ tableRef.getProjectId(),
+ tableRef.getDatasetId(),
+ FakeDatasetService.class.getSimpleName());
+ TableContainer tableContainer = dataset.get(tableRef.getTableId());
+ return tableContainer == null ? null : tableContainer.getTable();
+ }
+ }
+
+ List<TableRow> getAllRows(String projectId, String datasetId, String tableId)
+ throws InterruptedException, IOException {
+ synchronized (BigQueryIOTest.tables) {
+ return getTableContainer(projectId, datasetId, tableId).getRows();
+ }
+ }
+
+ private TableContainer getTableContainer(String projectId, String datasetId, String tableId)
+ throws InterruptedException, IOException {
+ synchronized (BigQueryIOTest.tables) {
+ Map<String, TableContainer> dataset =
+ checkNotNull(
+ BigQueryIOTest.tables.get(projectId, datasetId),
+ "Tried to get a dataset %s:%s from %s, but no such dataset was set",
+ projectId,
+ datasetId,
+ FakeDatasetService.class.getSimpleName());
+ return checkNotNull(dataset.get(tableId),
+ "Tried to get a table %s:%s.%s from %s, but no such table was set",
+ projectId,
+ datasetId,
+ tableId,
+ FakeDatasetService.class.getSimpleName());
+ }
+ }
+
+ @Override
+ public void deleteTable(TableReference tableRef) throws IOException, InterruptedException {
+ throw new UnsupportedOperationException("Unsupported");
+ }
+
+
+ @Override
+ public void createTable(Table table) throws IOException {
+ TableReference tableReference = table.getTableReference();
+ synchronized (BigQueryIOTest.tables) {
+ Map<String, TableContainer> dataset =
+ checkNotNull(
+ BigQueryIOTest.tables.get(tableReference.getProjectId(),
+ tableReference.getDatasetId()),
+ "Tried to get a dataset %s:%s from %s, but no such table was set",
+ tableReference.getProjectId(),
+ tableReference.getDatasetId(),
+ FakeDatasetService.class.getSimpleName());
+ TableContainer tableContainer = dataset.get(tableReference.getTableId());
+ if (tableContainer == null) {
+ tableContainer = new TableContainer(table);
+ dataset.put(tableReference.getTableId(), tableContainer);
+ }
+ }
+ }
+
+ @Override
+ public boolean isTableEmpty(TableReference tableRef)
+ throws IOException, InterruptedException {
+ Long numBytes = getTable(tableRef).getNumBytes();
+ return numBytes == null || numBytes == 0L;
+ }
+
+ @Override
+ public Dataset getDataset(
+ String projectId, String datasetId) throws IOException, InterruptedException {
+ throw new UnsupportedOperationException("Unsupported");
+ }
+
+ @Override
+ public void createDataset(
+ String projectId, String datasetId, String location, String description)
+ throws IOException, InterruptedException {
+ synchronized (BigQueryIOTest.tables) {
+ Map<String, TableContainer> dataset = BigQueryIOTest.tables.get(projectId, datasetId);
+ if (dataset == null) {
+ dataset = new HashMap<>();
+ BigQueryIOTest.tables.put(projectId, datasetId, dataset);
+ }
+ }
+ }
+
+ @Override
+ public void deleteDataset(String projectId, String datasetId)
+ throws IOException, InterruptedException {
+ throw new UnsupportedOperationException("Unsupported");
+ }
+
+ @Override
+ public long insertAll(
+ TableReference ref, List<TableRow> rowList, @Nullable List<String> insertIdList)
+ throws IOException, InterruptedException {
+ synchronized (BigQueryIOTest.tables) {
+ if (insertIdList != null) {
+ assertEquals(rowList.size(), insertIdList.size());
+ } else {
+ insertIdList = Lists.newArrayListWithExpectedSize(rowList.size());
+ for (int i = 0; i < rowList.size(); ++i) {
+ insertIdList.add(Integer.toString(ThreadLocalRandom.current().nextInt()));
+ }
+ }
+
+ long dataSize = 0;
+ TableContainer tableContainer = getTableContainer(
+ ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
+ for (int i = 0; i < rowList.size(); ++i) {
+ tableContainer.addRow(rowList.get(i), insertIdList.get(i));
+ dataSize += rowList.get(i).toString().length();
+ }
+ return dataSize;
+ }
+ }
+
+ @Override
+ public Table patchTableDescription(TableReference tableReference,
+ @Nullable String tableDescription)
+ throws IOException, InterruptedException {
+ synchronized (BigQueryIOTest.tables) {
+ Map<String, TableContainer> dataset =
+ checkNotNull(
+ BigQueryIOTest.tables.get(tableReference.getProjectId(),
+ tableReference.getDatasetId()),
+ "Tried to get a dataset %s:%s from %s, but no such dataset was set",
+ tableReference.getProjectId(),
+ tableReference.getDatasetId(),
+ FakeDatasetService.class.getSimpleName());
+ TableContainer tableContainer = checkNotNull(dataset.get(tableReference.getTableId()),
+ "Tried to patch a table %s:%s.%s from %s, but no such table was set",
+ tableReference.getProjectId(),
+ tableReference.getDatasetId(),
+ tableReference.getTableId(),
+ FakeDatasetService.class.getSimpleName());
+ tableContainer.getTable().setDescription(tableDescription);
+ return tableContainer.getTable();
+ }
+ }
+}
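Because the fake keeps its state in the shared BigQueryIOTest.tables table, tests can seed and inspect data directly through the service. A rough sketch, assuming it runs inside the org.apache.beam.sdk.io.gcp.bigquery test package and that the enclosing test method declares the checked exceptions:
  FakeDatasetService datasetService = new FakeDatasetService();
  datasetService.createDataset("project-id", "dataset-id", "", "");
  TableReference ref = new TableReference()
      .setProjectId("project-id").setDatasetId("dataset-id").setTableId("table-id");
  datasetService.createTable(new Table().setTableReference(ref));
  // insertAll generates insert ids itself when the third argument is null.
  datasetService.insertAll(ref, Lists.newArrayList(new TableRow().set("name", "a")), null);
  assertEquals(1, datasetService.getAllRows("project-id", "dataset-id", "table-id").size());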
http://git-wip-us.apache.org/repos/asf/beam/blob/760a9458/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeJobService.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeJobService.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeJobService.java
new file mode 100644
index 0000000..3c67c3d
--- /dev/null
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FakeJobService.java
@@ -0,0 +1,273 @@
+package org.apache.beam.sdk.io.gcp.bigquery;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkState;
+
+import com.google.api.client.json.JsonFactory;
+import com.google.api.client.util.BackOff;
+import com.google.api.client.util.BackOffUtils;
+import com.google.api.client.util.Sleeper;
+import com.google.api.services.bigquery.model.Job;
+import com.google.api.services.bigquery.model.JobConfiguration;
+import com.google.api.services.bigquery.model.JobConfigurationExtract;
+import com.google.api.services.bigquery.model.JobConfigurationLoad;
+import com.google.api.services.bigquery.model.JobConfigurationQuery;
+import com.google.api.services.bigquery.model.JobConfigurationTableCopy;
+import com.google.api.services.bigquery.model.JobReference;
+import com.google.api.services.bigquery.model.JobStatistics;
+import com.google.api.services.bigquery.model.JobStatistics4;
+import com.google.api.services.bigquery.model.JobStatus;
+import com.google.api.services.bigquery.model.Table;
+import com.google.api.services.bigquery.model.TableReference;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.TableSchema;
+import com.google.common.collect.HashBasedTable;
+import com.google.common.collect.Lists;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Serializable;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.Coder.Context;
+import org.apache.beam.sdk.coders.TableRowJsonCoder;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.JobService;
+import org.apache.beam.sdk.util.FluentBackoff;
+
+import org.apache.beam.sdk.util.Transport;
+import org.joda.time.Duration;
+
+/**
+ * A fake implementation of {@link JobService} that records started jobs and, once a job has
+ * transitioned to DONE, executes load, copy, and extract jobs against {@link FakeDatasetService}.
+ */
+class FakeJobService implements JobService, Serializable {
+ static final JsonFactory JSON_FACTORY = Transport.getJsonFactory();
+
+ // Whenever a job is started, the first 5 calls to GetJob will report the job as pending,
+ // the next 5 will return the job as running, and only then will the job report as done.
+ private static final int GET_JOBS_TRANSITION_INTERVAL = 5;
+
+ private FakeDatasetService datasetService;
+
+ private static class JobInfo {
+ Job job;
+ int getJobCount = 0;
+
+ JobInfo(Job job) {
+ this.job = job;
+ }
+ }
+
+ private static final com.google.common.collect.Table<String, String, JobInfo> allJobs =
+ HashBasedTable.create();
+
+ private static final com.google.common.collect.Table<String, String, JobStatistics>
+ dryRunQueryResults = HashBasedTable.create();
+
+ FakeJobService() {
+ this.datasetService = new FakeDatasetService();
+ }
+
+ @Override
+ public void startLoadJob(JobReference jobRef, JobConfigurationLoad loadConfig)
+ throws InterruptedException, IOException {
+ synchronized (allJobs) {
+ Job job = new Job();
+ job.setJobReference(jobRef);
+ job.setConfiguration(new JobConfiguration().setLoad(loadConfig));
+ job.setKind(" bigquery#job");
+ job.setStatus(new JobStatus().setState("PENDING"));
+ allJobs.put(jobRef.getProjectId(), jobRef.getJobId(), new JobInfo(job));
+ }
+ }
+
+ @Override
+ public void startExtractJob(JobReference jobRef, JobConfigurationExtract extractConfig)
+ throws InterruptedException, IOException {
+ checkArgument(extractConfig.getDestinationFormat().equals("AVRO"),
+ "Only extract to AVRO is supported");
+ checkArgument(extractConfig.getDestinationUris().size() == 1,
+ "Must specify exactly one destination URI.");
+ synchronized (allJobs) {
+ Job job = new Job();
+ job.setJobReference(jobRef);
+ job.setConfiguration(new JobConfiguration().setExtract(extractConfig));
+ job.setKind(" bigquery#job");
+ job.setStatus(new JobStatus().setState("PENDING"));
+ allJobs.put(jobRef.getProjectId(), jobRef.getJobId(), new JobInfo(job));
+ }
+ }
+
+ @Override
+ public void startQueryJob(JobReference jobRef, JobConfigurationQuery query)
+ throws IOException, InterruptedException {
+ }
+
+ @Override
+ public void startCopyJob(JobReference jobRef, JobConfigurationTableCopy copyConfig)
+ throws IOException, InterruptedException {
+ synchronized (allJobs) {
+ Job job = new Job();
+ job.setJobReference(jobRef);
+ job.setConfiguration(new JobConfiguration().setCopy(copyConfig));
+ job.setKind(" bigquery#job");
+ job.setStatus(new JobStatus().setState("PENDING"));
+ allJobs.put(jobRef.getProjectId(), jobRef.getJobId(), new JobInfo(job));
+ }
+ }
+
+ @Override
+ public Job pollJob(JobReference jobRef, int maxAttempts)
+ throws InterruptedException {
+ BackOff backoff =
+ FluentBackoff.DEFAULT
+ .withMaxRetries(maxAttempts)
+ .withInitialBackoff(Duration.millis(50))
+ .withMaxBackoff(Duration.standardMinutes(1))
+ .backoff();
+ Sleeper sleeper = Sleeper.DEFAULT;
+ try {
+ do {
+ Job job = getJob(jobRef);
+ if (job != null) {
+ JobStatus status = job.getStatus();
+ if (status != null && status.getState() != null && status.getState().equals("DONE")) {
+ return job;
+ }
+ }
+ } while (BackOffUtils.next(sleeper, backoff));
+ } catch (IOException e) {
+ return null;
+ }
+ return null;
+ }
+
+ public void expectDryRunQuery(String projectId, String query, JobStatistics result) {
+ synchronized (dryRunQueryResults) {
+ dryRunQueryResults.put(projectId, query, result);
+ }
+ }
+
+ @Override
+ public JobStatistics dryRunQuery(String projectId, JobConfigurationQuery query)
+ throws InterruptedException, IOException {
+ synchronized (dryRunQueryResults) {
+ JobStatistics result = dryRunQueryResults.get(projectId, query.getQuery());
+ if (result != null) {
+ return result;
+ }
+ }
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public Job getJob(JobReference jobRef) throws InterruptedException {
+ try {
+ synchronized (allJobs) {
+ JobInfo job = allJobs.get(jobRef.getProjectId(), jobRef.getJobId());
+ if (job == null) {
+ return null;
+ }
+ ++job.getJobCount;
+ if (job.getJobCount == GET_JOBS_TRANSITION_INTERVAL + 1) {
+ job.job.getStatus().setState("RUNNING");
+ } else if (job.getJobCount == 2 * GET_JOBS_TRANSITION_INTERVAL + 1) {
+ runJob(job.job);
+ job.job.getStatus().setState("DONE");
+ }
+ return JSON_FACTORY.fromString(JSON_FACTORY.toString(job.job), Job.class);
+ }
+ } catch (IOException e) {
+ return null;
+ }
+ }
+
+ private void runJob(Job job) throws InterruptedException, IOException {
+ if (job.getConfiguration().getLoad() != null) {
+ runLoadJob(job.getConfiguration().getLoad());
+ } else if (job.getConfiguration().getCopy() != null) {
+ runCopyJob(job.getConfiguration().getCopy());
+ } else if (job.getConfiguration().getExtract() != null) {
+ runExtractJob(job, job.getConfiguration().getExtract());
+ }
+ }
+
+ private void validateDispositions(Table table, CreateDisposition createDisposition,
+ WriteDisposition writeDisposition)
+ throws InterruptedException, IOException {
+ if (table == null) {
+ checkState(createDisposition != CreateDisposition.CREATE_NEVER,
+ "CreateDisposition == CREATE_NEVER but the table doesn't exist.");
+ } else if (writeDisposition == WriteDisposition.WRITE_TRUNCATE) {
+ datasetService.deleteTable(table.getTableReference());
+ } else if (writeDisposition == WriteDisposition.WRITE_EMPTY) {
+ List<TableRow> allRows = datasetService.getAllRows(table.getTableReference().getProjectId(),
+ table.getTableReference().getDatasetId(), table.getTableReference().getTableId());
+ checkState(allRows.isEmpty(), "Write disposition was set to WRITE_EMPTY,"
+ + " but the table was not empty.");
+ }
+ }
+ private void runLoadJob(JobConfigurationLoad load)
+ throws InterruptedException, IOException {
+ TableReference destination = load.getDestinationTable();
+ TableSchema schema = load.getSchema();
+ List<String> sourceFiles = load.getSourceUris();
+ WriteDisposition writeDisposition = WriteDisposition.valueOf(load.getWriteDisposition());
+ CreateDisposition createDisposition = CreateDisposition.valueOf(load.getCreateDisposition());
+ checkArgument(load.getSourceFormat().equals("NEWLINE_DELIMITED_JSON"));
+ Table existingTable = datasetService.getTable(destination);
+ validateDispositions(existingTable, createDisposition, writeDisposition);
+
+ datasetService.createTable(new Table().setTableReference(destination).setSchema(schema));
+
+ List<TableRow> rows = Lists.newArrayList();
+ for (String filename : sourceFiles) {
+ rows.addAll(readRows(filename));
+ }
+ datasetService.insertAll(destination, rows, null);
+ }
+
+ private void runCopyJob(JobConfigurationTableCopy copy)
+ throws InterruptedException, IOException {
+ List<TableReference> sources = copy.getSourceTables();
+ TableReference destination = copy.getDestinationTable();
+ WriteDisposition writeDisposition = WriteDisposition.valueOf(copy.getWriteDisposition());
+ CreateDisposition createDisposition = CreateDisposition.valueOf(copy.getCreateDisposition());
+ Table existingTable = datasetService.getTable(destination);
+ validateDispositions(existingTable, createDisposition, writeDisposition);
+
+ List<TableRow> allRows = Lists.newArrayList();
+ for (TableReference source : sources) {
+ allRows.addAll(datasetService.getAllRows(
+ source.getProjectId(), source.getDatasetId(), source.getTableId()));
+ }
+ datasetService.insertAll(destination, allRows, null);
+ }
+
+ private void runExtractJob(Job job, JobConfigurationExtract extract) {
+ TableReference sourceTable = extract.getSourceTable();
+ extract.getDestinationUris().get(0);
+ List<Long> destinationFileCounts = Lists.newArrayList(0L);
+ job.setStatistics(new JobStatistics().setExtract(
+ new JobStatistics4().setDestinationUriFileCounts(destinationFileCounts)));
+ }
+
+ private List<TableRow> readRows(String filename) throws IOException {
+ Coder<TableRow> coder = TableRowJsonCoder.of();
+ List<TableRow> tableRows = Lists.newArrayList();
+ try (BufferedReader reader = new BufferedReader(new FileReader(filename))) {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ TableRow tableRow = coder.decode(
+ new ByteArrayInputStream(line.getBytes(StandardCharsets.UTF_8)), Context.OUTER);
+ tableRows.add(tableRow);
+ }
+ }
+ return tableRows;
+ }
+}
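The GET_JOBS_TRANSITION_INTERVAL logic above means a started job reports PENDING for the first five getJob calls, RUNNING for the next five, and DONE after that, at which point runJob applies the job's side effects to FakeDatasetService. A hedged sketch of driving it from a test (loadConfig stands in for a JobConfigurationLoad the test would build; exception handling elided):
  FakeJobService jobService = new FakeJobService();
  JobReference jobRef = new JobReference().setProjectId("project-id").setJobId("job-id");
  jobService.startLoadJob(jobRef, loadConfig);
  // pollJob keeps calling getJob with backoff, so by the time it returns the fake has moved
  // the job through PENDING -> RUNNING -> DONE and executed the load against the dataset fake.
  Job job = jobService.pollJob(jobRef, 30);
  assertEquals("DONE", job.getStatus().getState());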
http://git-wip-us.apache.org/repos/asf/beam/blob/760a9458/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/TableContainer.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/TableContainer.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/TableContainer.java
new file mode 100644
index 0000000..b2fc170
--- /dev/null
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/TableContainer.java
@@ -0,0 +1,36 @@
+package org.apache.beam.sdk.io.gcp.bigquery;
+
+import com.google.api.services.bigquery.model.Table;
+import com.google.api.services.bigquery.model.TableRow;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * An in-memory stand-in for a BigQuery table, pairing the {@link Table} metadata with the rows
+ * and insert ids accumulated by {@link FakeDatasetService}.
+ */
+class TableContainer {
+ Table table;
+ List<TableRow> rows;
+ List<String> ids;
+
+ TableContainer(Table table) {
+ this.table = table;
+ this.rows = new ArrayList<>();
+ this.ids = new ArrayList<>();
+ }
+
+ TableContainer addRow(TableRow row, String id) {
+ rows.add(row);
+ ids.add(id);
+ return this;
+ }
+
+ Table getTable() {
+ return table;
+ }
+
+ List<TableRow> getRows() {
+ return rows;
+ }
+}
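TableContainer is just the in-memory pairing of a Table with the rows and insert ids that FakeDatasetService.insertAll accumulates, for example:
  TableContainer container = new TableContainer(new Table().setTableReference(ref));
  container.addRow(new TableRow().set("name", "a"), "insert-id-0");
  List<TableRow> rows = container.getRows();  // a single row: {name=a}
(Here ref is an assumed TableReference built by the test.)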
[11/50] [abbrv] beam git commit: Use tableRefFunction throughout
BigQueryIO. Constant table writes use ConstantTableSpecFunction.
Posted by dh...@apache.org.
Use tableRefFunction throughout BigQueryIO. Constant table writes use ConstantTableSpecFunction.
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/c939a436
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/c939a436
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/c939a436
Branch: refs/heads/DSL_SQL
Commit: c939a43617cdb37228625a34b3545377b142fc8a
Parents: e0df7d8
Author: Reuven Lax <re...@google.com>
Authored: Tue Mar 28 11:21:59 2017 -0700
Committer: Eugene Kirpichov <ki...@google.com>
Committed: Tue Apr 18 21:12:49 2017 -0700
----------------------------------------------------------------------
.../beam/sdk/io/gcp/bigquery/BigQueryIO.java | 57 ++++++++++----------
.../sdk/io/gcp/bigquery/StreamWithDeDup.java | 4 +-
.../gcp/bigquery/TagWithUniqueIdsAndTable.java | 57 ++++++--------------
.../sdk/io/gcp/bigquery/BigQueryIOTest.java | 19 ++-----
4 files changed, 50 insertions(+), 87 deletions(-)
----------------------------------------------------------------------
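In effect, after this change every write goes through a table function, and a constant to(tableSpec) is simply wrapped in ConstantTableSpecFunction. A sketch of the per-element form of the API introduced here, with an illustrative routing scheme (events is an assumed PCollection<TableRow>, and the "type" field, table names, and schema are assumptions of the example):
  events.apply(BigQueryIO.writeTableRows()
      .to(new SerializableFunction<ValueInSingleWindow<TableRow>, String>() {
        @Override
        public String apply(ValueInSingleWindow<TableRow> input) {
          // Route each element to a table named after one of its fields.
          return "project-id:dataset-id.events_" + input.getValue().get("type");
        }
      })
      .withSchema(schema)
      .withWriteDisposition(WriteDisposition.WRITE_APPEND));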
http://git-wip-us.apache.org/repos/asf/beam/blob/c939a436/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
index 9753da5..af0d561 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java
@@ -700,7 +700,8 @@ public class BigQueryIO {
abstract Builder<T> setJsonTableRef(ValueProvider<String> jsonTableRef);
abstract Builder<T> setTableRefFunction(
SerializableFunction<ValueInSingleWindow<T>, TableReference> tableRefFunction);
- abstract Builder<T> setFormatFunction(SerializableFunction<T, TableRow> formatFunction);
+ abstract Builder<T> setFormatFunction(
+ SerializableFunction<T, TableRow> formatFunction);
abstract Builder<T> setJsonSchema(ValueProvider<String> jsonSchema);
abstract Builder<T> setCreateDisposition(CreateDisposition createDisposition);
abstract Builder<T> setWriteDisposition(WriteDisposition writeDisposition);
@@ -781,7 +782,8 @@ public class BigQueryIO {
/** Ensures that methods of the to() family are called at most once. */
private void ensureToNotCalledYet() {
checkState(
- getJsonTableRef() == null && getTable() == null, "to() already called");
+ getJsonTableRef() == null && getTable() == null
+ && getTableRefFunction() == null, "to() already called");
}
/**
@@ -805,6 +807,8 @@ public class BigQueryIO {
NestedValueProvider.of(
NestedValueProvider.of(tableSpec, new TableSpecToTableRef()),
new TableRefToJson()))
+ .setTableRefFunction(new TranslateTableSpecFunction<T>(
+ new ConstantTableSpecFunction<T>(tableSpec)))
.build();
}
@@ -812,7 +816,8 @@ public class BigQueryIO {
* Writes to table specified by the specified table function. The table is a function of
* {@link ValueInSingleWindow}, so can be determined by the value or by the window.
*/
- public Write<T> to(SerializableFunction<ValueInSingleWindow<T>, String> tableSpecFunction) {
+ public Write<T> to(
+ SerializableFunction<ValueInSingleWindow<T>, String> tableSpecFunction) {
return toTableReference(new TranslateTableSpecFunction<T>(tableSpecFunction));
}
@@ -848,6 +853,20 @@ public class BigQueryIO {
}
}
+ static class ConstantTableSpecFunction<T> implements
+ SerializableFunction<ValueInSingleWindow<T>, String> {
+ private ValueProvider<String> tableSpec;
+
+ ConstantTableSpecFunction(ValueProvider<String> tableSpec) {
+ this.tableSpec = tableSpec;
+ }
+
+ @Override
+ public String apply(ValueInSingleWindow<T> value) {
+ return tableSpec.get();
+ }
+ }
+
/**
* Uses the specified schema for rows to be written.
*
@@ -900,13 +919,8 @@ public class BigQueryIO {
BigQueryOptions options = input.getPipeline().getOptions().as(BigQueryOptions.class);
// Exactly one of the table and table reference can be configured.
- checkState(
- getJsonTableRef() != null || getTableRefFunction() != null,
+ checkState(getTableRefFunction() != null,
"must set the table reference of a BigQueryIO.Write transform");
- checkState(
- getJsonTableRef() == null || getTableRefFunction() == null,
- "Cannot set both a table reference and a table function for a BigQueryIO.Write"
- + " transform");
checkArgument(getFormatFunction() != null,
"A function must be provided to convert type into a TableRow. "
@@ -920,6 +934,7 @@ public class BigQueryIO {
// The user specified a table.
if (getJsonTableRef() != null && getValidate()) {
TableReference table = getTableWithDefaultProject(options).get();
+ // TODO: This seems wrong - what if the ValueProvider is not accessible?
DatasetService datasetService = getBigQueryServices().getDatasetService(options);
// Check for destination table presence and emptiness for early failure notification.
@@ -935,24 +950,12 @@ public class BigQueryIO {
}
}
- if (input.isBounded() == PCollection.IsBounded.UNBOUNDED || getTableRefFunction() != null) {
+ if (input.isBounded() == PCollection.IsBounded.UNBOUNDED) {
// We will use BigQuery's streaming write API -- validate supported dispositions.
- if (getTableRefFunction() != null) {
- checkArgument(
- getCreateDisposition() != CreateDisposition.CREATE_NEVER,
- "CreateDisposition.CREATE_NEVER is not supported when using a tablespec"
- + " function.");
- }
- if (getJsonSchema() == null) {
- checkArgument(
- getCreateDisposition() == CreateDisposition.CREATE_NEVER,
- "CreateDisposition.CREATE_NEVER must be used if jsonSchema is null.");
- }
-
checkArgument(
getWriteDisposition() != WriteDisposition.WRITE_TRUNCATE,
- "WriteDisposition.WRITE_TRUNCATE is not supported for an unbounded PCollection or"
- + " when using a tablespec function.");
+ "WriteDisposition.WRITE_TRUNCATE is not supported for an unbounded"
+ + " PCollection.");
} else {
// We will use a BigQuery load job -- validate the temp location.
String tempLocation = options.getTempLocation();
@@ -977,7 +980,7 @@ public class BigQueryIO {
public WriteResult expand(PCollection<T> input) {
// When writing an Unbounded PCollection, or when a tablespec function is defined, we use
// StreamWithDeDup and BigQuery's streaming import API.
- if (input.isBounded() == IsBounded.UNBOUNDED || getTableRefFunction() != null) {
+ if (input.isBounded() == IsBounded.UNBOUNDED) {
return input.apply(new StreamWithDeDup<T>(this));
} else {
return input.apply(new BatchLoadBigQuery<T>(this));
@@ -1026,12 +1029,12 @@ public class BigQueryIO {
*
* <p>If the table's project is not specified, use the executing project.
*/
- @Nullable ValueProvider<TableReference> getTableWithDefaultProject(
- BigQueryOptions bqOptions) {
+ @Nullable ValueProvider<TableReference> getTableWithDefaultProject(BigQueryOptions bqOptions) {
ValueProvider<TableReference> table = getTable();
if (table == null) {
return table;
}
+
if (!table.isAccessible()) {
LOG.info("Using a dynamic value for table input. This must contain a project"
+ " in the table reference: {}", table);
http://git-wip-us.apache.org/repos/asf/beam/blob/c939a436/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamWithDeDup.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamWithDeDup.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamWithDeDup.java
index 1fa26d1..506a564 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamWithDeDup.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StreamWithDeDup.java
@@ -64,8 +64,7 @@ class StreamWithDeDup<T> extends PTransform<PCollection<T>, WriteResult> {
PCollection<KV<ShardedKey<String>, TableRowInfo>> tagged =
input.apply(ParDo.of(new TagWithUniqueIdsAndTable<T>(
- input.getPipeline().getOptions().as(BigQueryOptions.class), write.getTable(),
- write.getTableRefFunction(), write.getFormatFunction())));
+ input.getPipeline().getOptions().as(BigQueryOptions.class), write)));
// To prevent having the same TableRow processed more than once with regenerated
// different unique ids, this implementation relies on "checkpointing", which is
@@ -85,6 +84,7 @@ class StreamWithDeDup<T> extends PTransform<PCollection<T>, WriteResult> {
write.getCreateDisposition(),
write.getTableDescription(),
write.getBigQueryServices())));
+
return WriteResult.in(input.getPipeline());
}
}
http://git-wip-us.apache.org/repos/asf/beam/blob/c939a436/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIdsAndTable.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIdsAndTable.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIdsAndTable.java
index a6608e4..8d7d1e6 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIdsAndTable.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TagWithUniqueIdsAndTable.java
@@ -18,23 +18,18 @@
package org.apache.beam.sdk.io.gcp.bigquery;
-import static com.google.common.base.Preconditions.checkArgument;
-
import com.google.api.services.bigquery.model.TableReference;
-import com.google.api.services.bigquery.model.TableRow;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Strings;
import java.io.IOException;
import java.util.UUID;
import java.util.concurrent.ThreadLocalRandom;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonTableRefToTableRef;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.TableRefToTableSpec;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write;
import org.apache.beam.sdk.options.BigQueryOptions;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
-import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.values.KV;
@@ -49,39 +44,22 @@ import org.apache.beam.sdk.values.ValueInSingleWindow;
@VisibleForTesting
class TagWithUniqueIdsAndTable<T>
extends DoFn<T, KV<ShardedKey<String>, TableRowInfo>> {
- /** TableSpec to write to. */
- private final ValueProvider<String> tableSpec;
-
- /** User function mapping windowed values to {@link TableReference} in JSON. */
- private final SerializableFunction<ValueInSingleWindow<T>, TableReference> tableRefFunction;
+ /** TableSpec to write to in the case of a single static destination. */
+ private ValueProvider<String> tableSpec = null;
- /** User function mapping user type to a TableRow. */
- private final SerializableFunction<T, TableRow> formatFunction;
+ private final Write<T, ?> write;
private transient String randomUUID;
private transient long sequenceNo = 0L;
TagWithUniqueIdsAndTable(BigQueryOptions options,
- ValueProvider<TableReference> table,
- SerializableFunction<ValueInSingleWindow<T>, TableReference>
- tableRefFunction,
- SerializableFunction<T, TableRow> formatFunction) {
- checkArgument(table == null ^ tableRefFunction == null,
- "Exactly one of table or tableRefFunction should be set");
+ Write<T, ?> write) {
+ ValueProvider<TableReference> table = write.getTableWithDefaultProject(
+ options.as(BigQueryOptions.class));
if (table != null) {
- if (table.isAccessible() && Strings.isNullOrEmpty(table.get().getProjectId())) {
- TableReference tableRef = table.get()
- .setProjectId(options.as(BigQueryOptions.class).getProject());
- table = NestedValueProvider.of(
- StaticValueProvider.of(BigQueryHelpers.toJsonString(tableRef)),
- new JsonTableRefToTableRef());
- }
this.tableSpec = NestedValueProvider.of(table, new TableRefToTableSpec());
- } else {
- tableSpec = null;
}
- this.tableRefFunction = tableRefFunction;
- this.formatFunction = formatFunction;
+ this.write = write;
}
@@ -101,7 +79,7 @@ class TagWithUniqueIdsAndTable<T>
// We output on keys 0-50 to ensure that there's enough batching for
// BigQuery.
context.output(KV.of(ShardedKey.of(tableSpec, randomGenerator.nextInt(0, 50)),
- new TableRowInfo(formatFunction.apply(context.element()), uniqueId)));
+ new TableRowInfo(write.getFormatFunction().apply(context.element()), uniqueId)));
}
@Override
@@ -109,10 +87,8 @@ class TagWithUniqueIdsAndTable<T>
super.populateDisplayData(builder);
builder.addIfNotNull(DisplayData.item("table", tableSpec));
- if (tableRefFunction != null) {
- builder.add(DisplayData.item("tableFn", tableRefFunction.getClass())
+ builder.add(DisplayData.item("tableFn", write.getTableRefFunction().getClass())
.withLabel("Table Reference Function"));
- }
}
@VisibleForTesting
@@ -120,16 +96,13 @@ class TagWithUniqueIdsAndTable<T>
return tableSpec;
}
+
private String tableSpecFromWindowedValue(BigQueryOptions options,
ValueInSingleWindow<T> value) {
- if (tableSpec != null) {
- return tableSpec.get();
- } else {
- TableReference table = tableRefFunction.apply(value);
- if (table.getProjectId() == null) {
- table.setProjectId(options.getProject());
- }
- return BigQueryHelpers.toTableSpec(table);
+ TableReference table = write.getTableRefFunction().apply(value);
+ if (Strings.isNullOrEmpty(table.getProjectId())) {
+ table.setProjectId(options.getProject());
}
+ return BigQueryHelpers.toTableSpec(table);
}
}
http://git-wip-us.apache.org/repos/asf/beam/blob/c939a436/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
index 83fd8d9..499aa74 100644
--- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOTest.java
@@ -26,7 +26,6 @@ import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisp
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.hasItem;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
@@ -103,7 +102,6 @@ import org.apache.beam.sdk.io.CountingInput;
import org.apache.beam.sdk.io.CountingSource;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.Status;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.TableSpecToTableRef;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService;
@@ -150,6 +148,7 @@ import org.apache.beam.sdk.util.WindowingStrategy;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.beam.sdk.values.PDone;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TypeDescriptor;
import org.apache.beam.sdk.values.ValueInSingleWindow;
@@ -1375,7 +1374,8 @@ public class BigQueryIOTest implements Serializable {
@Test
public void testBuildWriteDefaultProject() {
- BigQueryIO.Write<TableRow> write = BigQueryIO.writeTableRows().to("somedataset.sometable");
+ BigQueryIO.Write<TableRow> write = BigQueryIO.writeTableRows()
+ .to("somedataset" + ".sometable");
checkWriteObject(
write, null, "somedataset", "sometable",
null, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY,
@@ -2350,19 +2350,6 @@ public class BigQueryIOTest implements Serializable {
DisplayData.from(write);
}
- @Test
- public void testTagWithUniqueIdsAndTableProjectNotNullWithNvp() {
- BigQueryOptions bqOptions = PipelineOptionsFactory.as(BigQueryOptions.class);
- bqOptions.setProject("project");
- TagWithUniqueIdsAndTable<TableRow> tag =
- new TagWithUniqueIdsAndTable<TableRow>(
- bqOptions, NestedValueProvider.of(
- StaticValueProvider.of("data_set.table_name"),
- new TableSpecToTableRef()), null, null);
- TableReference table = BigQueryHelpers.parseTableSpec(tag.getTableSpec().get());
- assertNotNull(table.getProjectId());
- }
-
private static void testNumFiles(File tempDir, int expectedNumFiles) {
assertEquals(expectedNumFiles, tempDir.listFiles(new FileFilter() {
@Override
[38/50] [abbrv] beam git commit: [BEAM-1994] Remove Flink examples
package
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkRunnerRegistrar.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkRunnerRegistrar.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkRunnerRegistrar.java
deleted file mode 100644
index 681459a..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkRunnerRegistrar.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.beam.runners.flink;
-
-import com.google.auto.service.AutoService;
-import com.google.common.collect.ImmutableList;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.options.PipelineOptionsRegistrar;
-import org.apache.beam.sdk.runners.PipelineRunner;
-import org.apache.beam.sdk.runners.PipelineRunnerRegistrar;
-
-
-/**
- * AutoService registrar - will register FlinkRunner and FlinkOptions
- * as possible pipeline runner services.
- *
- * <p>It ends up in META-INF/services and gets picked up by Beam.
- *
- */
-public class FlinkRunnerRegistrar {
- private FlinkRunnerRegistrar() { }
-
- /**
- * Pipeline runner registrar.
- */
- @AutoService(PipelineRunnerRegistrar.class)
- public static class Runner implements PipelineRunnerRegistrar {
- @Override
- public Iterable<Class<? extends PipelineRunner<?>>> getPipelineRunners() {
- return ImmutableList.<Class<? extends PipelineRunner<?>>>of(
- FlinkRunner.class,
- TestFlinkRunner.class);
- }
- }
-
- /**
- * Pipeline options registrar.
- */
- @AutoService(PipelineOptionsRegistrar.class)
- public static class Options implements PipelineOptionsRegistrar {
- @Override
- public Iterable<Class<? extends PipelineOptions>> getPipelineOptions() {
- return ImmutableList.<Class<? extends PipelineOptions>>of(FlinkPipelineOptions.class);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkRunnerResult.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkRunnerResult.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkRunnerResult.java
deleted file mode 100644
index 0682b56..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkRunnerResult.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import java.io.IOException;
-import java.util.Collections;
-import java.util.Map;
-import org.apache.beam.sdk.AggregatorRetrievalException;
-import org.apache.beam.sdk.AggregatorValues;
-import org.apache.beam.sdk.PipelineResult;
-import org.apache.beam.sdk.metrics.MetricResults;
-import org.apache.beam.sdk.transforms.Aggregator;
-import org.joda.time.Duration;
-
-/**
- * Result of executing a {@link org.apache.beam.sdk.Pipeline} with Flink. This
- * has methods to query to job runtime and the final values of
- * {@link org.apache.beam.sdk.transforms.Aggregator}s.
- */
-public class FlinkRunnerResult implements PipelineResult {
-
- private final Map<String, Object> aggregators;
-
- private final long runtime;
-
- FlinkRunnerResult(Map<String, Object> aggregators, long runtime) {
- this.aggregators = (aggregators == null || aggregators.isEmpty())
- ? Collections.<String, Object>emptyMap()
- : Collections.unmodifiableMap(aggregators);
- this.runtime = runtime;
- }
-
- @Override
- public State getState() {
- return State.DONE;
- }
-
- @Override
- public <T> AggregatorValues<T> getAggregatorValues(final Aggregator<?, T> aggregator)
- throws AggregatorRetrievalException {
- // TODO provide a list of all accumulator step values
- Object value = aggregators.get(aggregator.getName());
- if (value != null) {
- return new AggregatorValues<T>() {
- @Override
- public Map<String, T> getValuesAtSteps() {
- return (Map<String, T>) aggregators;
- }
- };
- } else {
- throw new AggregatorRetrievalException("Accumulator results not found.",
- new RuntimeException("Accumulator does not exist."));
- }
- }
-
- @Override
- public String toString() {
- return "FlinkRunnerResult{"
- + "aggregators=" + aggregators
- + ", runtime=" + runtime
- + '}';
- }
-
- @Override
- public State cancel() throws IOException {
- throw new UnsupportedOperationException("FlinkRunnerResult does not support cancel.");
- }
-
- @Override
- public State waitUntilFinish() {
- return State.DONE;
- }
-
- @Override
- public State waitUntilFinish(Duration duration) {
- return State.DONE;
- }
-
- @Override
- public MetricResults metrics() {
- throw new UnsupportedOperationException("The FlinkRunner does not currently support metrics.");
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkStreamingPipelineTranslator.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkStreamingPipelineTranslator.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkStreamingPipelineTranslator.java
deleted file mode 100644
index 0459ef7..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkStreamingPipelineTranslator.java
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import com.google.common.collect.ImmutableList;
-import java.util.List;
-import java.util.Map;
-import org.apache.beam.runners.core.SplittableParDo;
-import org.apache.beam.runners.core.construction.PTransformMatchers;
-import org.apache.beam.runners.core.construction.PTransformReplacements;
-import org.apache.beam.runners.core.construction.ReplacementOutputs;
-import org.apache.beam.runners.core.construction.SingleInputOutputOverrideFactory;
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.runners.PTransformOverride;
-import org.apache.beam.sdk.runners.PTransformOverrideFactory;
-import org.apache.beam.sdk.runners.TransformHierarchy;
-import org.apache.beam.sdk.transforms.AppliedPTransform;
-import org.apache.beam.sdk.transforms.Combine;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.transforms.ParDo.MultiOutput;
-import org.apache.beam.sdk.transforms.View;
-import org.apache.beam.sdk.util.InstanceBuilder;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.beam.sdk.values.PCollectionTuple;
-import org.apache.beam.sdk.values.PValue;
-import org.apache.beam.sdk.values.TupleTag;
-import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * This is a {@link FlinkPipelineTranslator} for streaming jobs. Its role is to translate
- * the user-provided {@link org.apache.beam.sdk.values.PCollection}-based job into a
- * {@link org.apache.flink.streaming.api.datastream.DataStream} one.
- *
- */
-class FlinkStreamingPipelineTranslator extends FlinkPipelineTranslator {
-
- private static final Logger LOG = LoggerFactory.getLogger(FlinkStreamingPipelineTranslator.class);
-
- /** The necessary context in the case of a streaming job. */
- private final FlinkStreamingTranslationContext streamingContext;
-
- private int depth = 0;
-
- private FlinkRunner flinkRunner;
-
- public FlinkStreamingPipelineTranslator(
- FlinkRunner flinkRunner,
- StreamExecutionEnvironment env,
- PipelineOptions options) {
- this.streamingContext = new FlinkStreamingTranslationContext(env, options);
- this.flinkRunner = flinkRunner;
- }
-
- @Override
- public void translate(Pipeline pipeline) {
- List<PTransformOverride> transformOverrides =
- ImmutableList.<PTransformOverride>builder()
- .add(
- PTransformOverride.of(
- PTransformMatchers.splittableParDoMulti(),
- new SplittableParDoOverrideFactory()))
- .add(
- PTransformOverride.of(
- PTransformMatchers.classEqualTo(View.AsIterable.class),
- new ReflectiveOneToOneOverrideFactory(
- FlinkStreamingViewOverrides.StreamingViewAsIterable.class, flinkRunner)))
- .add(
- PTransformOverride.of(
- PTransformMatchers.classEqualTo(View.AsList.class),
- new ReflectiveOneToOneOverrideFactory(
- FlinkStreamingViewOverrides.StreamingViewAsList.class, flinkRunner)))
- .add(
- PTransformOverride.of(
- PTransformMatchers.classEqualTo(View.AsMap.class),
- new ReflectiveOneToOneOverrideFactory(
- FlinkStreamingViewOverrides.StreamingViewAsMap.class, flinkRunner)))
- .add(
- PTransformOverride.of(
- PTransformMatchers.classEqualTo(View.AsMultimap.class),
- new ReflectiveOneToOneOverrideFactory(
- FlinkStreamingViewOverrides.StreamingViewAsMultimap.class, flinkRunner)))
- .add(
- PTransformOverride.of(
- PTransformMatchers.classEqualTo(View.AsSingleton.class),
- new ReflectiveOneToOneOverrideFactory(
- FlinkStreamingViewOverrides.StreamingViewAsSingleton.class, flinkRunner)))
- // this has to be last since the ViewAsSingleton override
- // can expand to a Combine.GloballyAsSingletonView
- .add(
- PTransformOverride.of(
- PTransformMatchers.classEqualTo(Combine.GloballyAsSingletonView.class),
- new ReflectiveOneToOneOverrideFactory(
- FlinkStreamingViewOverrides.StreamingCombineGloballyAsSingletonView.class,
- flinkRunner)))
- .build();
-
- pipeline.replaceAll(transformOverrides);
- super.translate(pipeline);
- }
-
- // --------------------------------------------------------------------------------------------
- // Pipeline Visitor Methods
- // --------------------------------------------------------------------------------------------
-
- @Override
- public CompositeBehavior enterCompositeTransform(TransformHierarchy.Node node) {
- LOG.info("{} enterCompositeTransform- {}", genSpaces(this.depth), node.getFullName());
- this.depth++;
-
- PTransform<?, ?> transform = node.getTransform();
- if (transform != null) {
- StreamTransformTranslator<?> translator =
- FlinkStreamingTransformTranslators.getTranslator(transform);
-
- if (translator != null && applyCanTranslate(transform, node, translator)) {
- applyStreamingTransform(transform, node, translator);
- LOG.info("{} translated- {}", genSpaces(this.depth), node.getFullName());
- return CompositeBehavior.DO_NOT_ENTER_TRANSFORM;
- }
- }
- return CompositeBehavior.ENTER_TRANSFORM;
- }
-
- @Override
- public void leaveCompositeTransform(TransformHierarchy.Node node) {
- this.depth--;
- LOG.info("{} leaveCompositeTransform- {}", genSpaces(this.depth), node.getFullName());
- }
-
- @Override
- public void visitPrimitiveTransform(TransformHierarchy.Node node) {
- LOG.info("{} visitPrimitiveTransform- {}", genSpaces(this.depth), node.getFullName());
- // get the transformation corresponding to the node we are
- // currently visiting and translate it into its Flink alternative.
-
- PTransform<?, ?> transform = node.getTransform();
- StreamTransformTranslator<?> translator =
- FlinkStreamingTransformTranslators.getTranslator(transform);
-
- if (translator == null || !applyCanTranslate(transform, node, translator)) {
- LOG.info(node.getTransform().getClass().toString());
- throw new UnsupportedOperationException(
- "The transform " + transform + " is currently not supported.");
- }
- applyStreamingTransform(transform, node, translator);
- }
-
- @Override
- public void visitValue(PValue value, TransformHierarchy.Node producer) {
- // do nothing here
- }
-
- private <T extends PTransform<?, ?>> void applyStreamingTransform(
- PTransform<?, ?> transform,
- TransformHierarchy.Node node,
- StreamTransformTranslator<?> translator) {
-
- @SuppressWarnings("unchecked")
- T typedTransform = (T) transform;
-
- @SuppressWarnings("unchecked")
- StreamTransformTranslator<T> typedTranslator = (StreamTransformTranslator<T>) translator;
-
- // create the applied PTransform on the streamingContext
- streamingContext.setCurrentTransform(node.toAppliedPTransform());
- typedTranslator.translateNode(typedTransform, streamingContext);
- }
-
- private <T extends PTransform<?, ?>> boolean applyCanTranslate(
- PTransform<?, ?> transform,
- TransformHierarchy.Node node,
- StreamTransformTranslator<?> translator) {
-
- @SuppressWarnings("unchecked")
- T typedTransform = (T) transform;
-
- @SuppressWarnings("unchecked")
- StreamTransformTranslator<T> typedTranslator = (StreamTransformTranslator<T>) translator;
-
- streamingContext.setCurrentTransform(node.toAppliedPTransform());
-
- return typedTranslator.canTranslate(typedTransform, streamingContext);
- }
-
- /**
- * The base class that every Flink translator of a Beam operator should extend.
- * It is used for <b>streaming</b> jobs. For examples of such translators see
- * {@link FlinkStreamingTransformTranslators}.
- */
- abstract static class StreamTransformTranslator<T extends PTransform> {
-
- /**
- * Translate the given transform.
- */
- abstract void translateNode(T transform, FlinkStreamingTranslationContext context);
-
- /**
- * Returns true iff this translator can translate the given transform.
- */
- boolean canTranslate(T transform, FlinkStreamingTranslationContext context) {
- return true;
- }
- }
-
- private static class ReflectiveOneToOneOverrideFactory<
- InputT, OutputT, TransformT extends PTransform<PCollection<InputT>, PCollection<OutputT>>>
- extends SingleInputOutputOverrideFactory<
- PCollection<InputT>, PCollection<OutputT>, TransformT> {
- private final Class<PTransform<PCollection<InputT>, PCollection<OutputT>>> replacement;
- private final FlinkRunner runner;
-
- private ReflectiveOneToOneOverrideFactory(
- Class<PTransform<PCollection<InputT>, PCollection<OutputT>>> replacement,
- FlinkRunner runner) {
- this.replacement = replacement;
- this.runner = runner;
- }
-
- @Override
- public PTransformReplacement<PCollection<InputT>, PCollection<OutputT>> getReplacementTransform(
- AppliedPTransform<PCollection<InputT>, PCollection<OutputT>, TransformT> transform) {
- return PTransformReplacement.of(
- PTransformReplacements.getSingletonMainInput(transform),
- InstanceBuilder.ofType(replacement)
- .withArg(FlinkRunner.class, runner)
- .withArg(
- (Class<PTransform<PCollection<InputT>, PCollection<OutputT>>>)
- transform.getTransform().getClass(),
- transform.getTransform())
- .build());
- }
- }
-
- /**
- * A {@link PTransformOverrideFactory} that overrides a <a
- * href="https://s.apache.org/splittable-do-fn">Splittable DoFn</a> with {@link SplittableParDo}.
- */
- static class SplittableParDoOverrideFactory<InputT, OutputT>
- implements PTransformOverrideFactory<
- PCollection<InputT>, PCollectionTuple, MultiOutput<InputT, OutputT>> {
- @Override
- public PTransformReplacement<PCollection<InputT>, PCollectionTuple>
- getReplacementTransform(
- AppliedPTransform<
- PCollection<InputT>, PCollectionTuple, MultiOutput<InputT, OutputT>>
- transform) {
- return PTransformReplacement.of(
- PTransformReplacements.getSingletonMainInput(transform),
- new SplittableParDo<>(transform.getTransform()));
- }
-
- @Override
- public Map<PValue, ReplacementOutput> mapOutputs(
- Map<TupleTag<?>, PValue> outputs, PCollectionTuple newOutput) {
- return ReplacementOutputs.tagged(outputs, newOutput);
- }
- }
-}
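A rough sketch, not part of this commit, of what a translator extending the StreamTransformTranslator base class above might look like; MyTransform is a hypothetical PTransform used only for illustration.

// Hypothetical translator shape; MyTransform and the body are illustrative only.
static class MyTransformTranslator
    extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<MyTransform> {

  @Override
  void translateNode(MyTransform transform, FlinkStreamingTranslationContext context) {
    // resolve the Flink DataStream that was registered for the transform's input ...
    DataStream<WindowedValue<String>> input =
        context.getInputDataStream(context.getInput(transform));
    // ... apply the equivalent Flink operation here, then register the output stream
    context.setOutputDataStream(context.getOutput(transform), input);
  }

  @Override
  boolean canTranslate(MyTransform transform, FlinkStreamingTranslationContext context) {
    return true; // this sketch claims to handle every instance of MyTransform
  }
}

Such a translator would additionally be registered in the TRANSLATORS map of FlinkStreamingTransformTranslators (deleted below) so that getTranslator() can find it.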
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java
deleted file mode 100644
index 123d5e7..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java
+++ /dev/null
@@ -1,1044 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.beam.runners.flink;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import org.apache.beam.runners.core.ElementAndRestriction;
-import org.apache.beam.runners.core.KeyedWorkItem;
-import org.apache.beam.runners.core.SplittableParDo;
-import org.apache.beam.runners.core.SystemReduceFn;
-import org.apache.beam.runners.flink.translation.functions.FlinkAssignWindows;
-import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.DoFnOperator;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.KvToByteBufferKeySelector;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.SingletonKeyedWorkItem;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.SingletonKeyedWorkItemCoder;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.SplittableDoFnOperator;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.WindowDoFnOperator;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.WorkItemKeySelector;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.io.BoundedSourceWrapper;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.io.UnboundedSourceWrapper;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.coders.KvCoder;
-import org.apache.beam.sdk.coders.StringUtf8Coder;
-import org.apache.beam.sdk.coders.VoidCoder;
-import org.apache.beam.sdk.io.Read;
-import org.apache.beam.sdk.io.TextIO;
-import org.apache.beam.sdk.transforms.Combine;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.Flatten;
-import org.apache.beam.sdk.transforms.GroupByKey;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.transforms.join.RawUnionValue;
-import org.apache.beam.sdk.transforms.join.UnionCoder;
-import org.apache.beam.sdk.transforms.reflect.DoFnSignature;
-import org.apache.beam.sdk.transforms.reflect.DoFnSignatures;
-import org.apache.beam.sdk.transforms.splittabledofn.RestrictionTracker;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
-import org.apache.beam.sdk.transforms.windowing.Window;
-import org.apache.beam.sdk.transforms.windowing.WindowFn;
-import org.apache.beam.sdk.util.AppliedCombineFn;
-import org.apache.beam.sdk.util.Reshuffle;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.util.WindowingStrategy;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.beam.sdk.values.PValue;
-import org.apache.beam.sdk.values.TupleTag;
-import org.apache.flink.api.common.functions.FlatMapFunction;
-import org.apache.flink.api.common.functions.MapFunction;
-import org.apache.flink.api.common.functions.RichFlatMapFunction;
-import org.apache.flink.api.common.typeinfo.TypeInformation;
-import org.apache.flink.api.java.tuple.Tuple2;
-import org.apache.flink.core.fs.FileSystem;
-import org.apache.flink.streaming.api.collector.selector.OutputSelector;
-import org.apache.flink.streaming.api.datastream.DataStream;
-import org.apache.flink.streaming.api.datastream.DataStreamSink;
-import org.apache.flink.streaming.api.datastream.DataStreamSource;
-import org.apache.flink.streaming.api.datastream.KeyedStream;
-import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
-import org.apache.flink.streaming.api.datastream.SplitStream;
-import org.apache.flink.streaming.api.operators.OneInputStreamOperator;
-import org.apache.flink.streaming.api.operators.TwoInputStreamOperator;
-import org.apache.flink.streaming.api.transformations.TwoInputTransformation;
-import org.apache.flink.util.Collector;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * This class contains all the mappings between Beam and Flink
- * <b>streaming</b> transformations. The {@link FlinkStreamingPipelineTranslator}
- * traverses the Beam job and comes here to translate the encountered Beam transformations
- * into their Flink counterparts, based on the mappings available in this class.
- */
-class FlinkStreamingTransformTranslators {
-
- // --------------------------------------------------------------------------------------------
- // Transform Translator Registry
- // --------------------------------------------------------------------------------------------
-
- @SuppressWarnings("rawtypes")
- private static final Map<
- Class<? extends PTransform>,
- FlinkStreamingPipelineTranslator.StreamTransformTranslator> TRANSLATORS = new HashMap<>();
-
- // here you can find all the available translators.
- static {
- TRANSLATORS.put(Read.Bounded.class, new BoundedReadSourceTranslator());
- TRANSLATORS.put(Read.Unbounded.class, new UnboundedReadSourceTranslator());
- TRANSLATORS.put(TextIO.Write.Bound.class, new TextIOWriteBoundStreamingTranslator());
-
- TRANSLATORS.put(ParDo.MultiOutput.class, new ParDoStreamingTranslator());
- TRANSLATORS.put(
- SplittableParDo.ProcessElements.class, new SplittableProcessElementsStreamingTranslator());
- TRANSLATORS.put(
- SplittableParDo.GBKIntoKeyedWorkItems.class, new GBKIntoKeyedWorkItemsTranslator());
-
-
- TRANSLATORS.put(Window.Assign.class, new WindowAssignTranslator());
- TRANSLATORS.put(Flatten.PCollections.class, new FlattenPCollectionTranslator());
- TRANSLATORS.put(
- FlinkStreamingViewOverrides.CreateFlinkPCollectionView.class,
- new CreateViewStreamingTranslator());
-
- TRANSLATORS.put(Reshuffle.class, new ReshuffleTranslatorStreaming());
- TRANSLATORS.put(GroupByKey.class, new GroupByKeyTranslator());
- TRANSLATORS.put(Combine.PerKey.class, new CombinePerKeyTranslator());
- }
-
- public static FlinkStreamingPipelineTranslator.StreamTransformTranslator<?> getTranslator(
- PTransform<?, ?> transform) {
- return TRANSLATORS.get(transform.getClass());
- }
-
- // --------------------------------------------------------------------------------------------
- // Transformation Implementations
- // --------------------------------------------------------------------------------------------
-
- private static class TextIOWriteBoundStreamingTranslator
- extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<TextIO.Write.Bound> {
-
- private static final Logger LOG =
- LoggerFactory.getLogger(TextIOWriteBoundStreamingTranslator.class);
-
- @Override
- public void translateNode(
- TextIO.Write.Bound transform,
- FlinkStreamingTranslationContext context) {
- PValue input = context.getInput(transform);
- DataStream<WindowedValue<String>> inputDataStream = context.getInputDataStream(input);
-
- String filenamePrefix = transform.getFilenamePrefix();
- String filenameSuffix = transform.getFilenameSuffix();
- boolean needsValidation = transform.needsValidation();
- int numShards = transform.getNumShards();
- String shardNameTemplate = transform.getShardNameTemplate();
-
- // TODO: Implement these. We need Flink support for this.
- LOG.warn(
- "Translation of TextIO.Write.needsValidation not yet supported. Is: {}.",
- needsValidation);
- LOG.warn(
- "Translation of TextIO.Write.filenameSuffix not yet supported. Is: {}.",
- filenameSuffix);
- LOG.warn(
- "Translation of TextIO.Write.shardNameTemplate not yet supported. Is: {}.",
- shardNameTemplate);
-
- DataStream<String> dataSink = inputDataStream
- .flatMap(new FlatMapFunction<WindowedValue<String>, String>() {
- @Override
- public void flatMap(
- WindowedValue<String> value,
- Collector<String> out)
- throws Exception {
- out.collect(value.getValue());
- }
- });
- DataStreamSink<String> output =
- dataSink.writeAsText(filenamePrefix, FileSystem.WriteMode.OVERWRITE);
-
- if (numShards > 0) {
- output.setParallelism(numShards);
- }
- }
- }
-
- private static class UnboundedReadSourceTranslator<T>
- extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<Read.Unbounded<T>> {
-
- @Override
- public void translateNode(
- Read.Unbounded<T> transform,
- FlinkStreamingTranslationContext context) {
- PCollection<T> output = context.getOutput(transform);
-
- TypeInformation<WindowedValue<T>> outputTypeInfo =
- context.getTypeInfo(context.getOutput(transform));
-
- DataStream<WindowedValue<T>> source;
- try {
- UnboundedSourceWrapper<T, ?> sourceWrapper =
- new UnboundedSourceWrapper<>(
- context.getPipelineOptions(),
- transform.getSource(),
- context.getExecutionEnvironment().getParallelism());
- source = context
- .getExecutionEnvironment()
- .addSource(sourceWrapper).name(transform.getName()).returns(outputTypeInfo);
- } catch (Exception e) {
- throw new RuntimeException(
- "Error while translating UnboundedSource: " + transform.getSource(), e);
- }
-
- context.setOutputDataStream(output, source);
- }
- }
-
- private static class BoundedReadSourceTranslator<T>
- extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<Read.Bounded<T>> {
-
- @Override
- public void translateNode(
- Read.Bounded<T> transform,
- FlinkStreamingTranslationContext context) {
- PCollection<T> output = context.getOutput(transform);
-
- TypeInformation<WindowedValue<T>> outputTypeInfo =
- context.getTypeInfo(context.getOutput(transform));
-
-
- DataStream<WindowedValue<T>> source;
- try {
- BoundedSourceWrapper<T> sourceWrapper =
- new BoundedSourceWrapper<>(
- context.getPipelineOptions(),
- transform.getSource(),
- context.getExecutionEnvironment().getParallelism());
- source = context
- .getExecutionEnvironment()
- .addSource(sourceWrapper).name(transform.getName()).returns(outputTypeInfo);
- } catch (Exception e) {
- throw new RuntimeException(
- "Error while translating BoundedSource: " + transform.getSource(), e);
- }
-
- context.setOutputDataStream(output, source);
- }
- }
-
- /**
- * Wraps each element in a {@link RawUnionValue} with the given tag id.
- */
- private static class ToRawUnion<T> implements MapFunction<T, RawUnionValue> {
- private final int intTag;
-
- public ToRawUnion(int intTag) {
- this.intTag = intTag;
- }
-
- @Override
- public RawUnionValue map(T o) throws Exception {
- return new RawUnionValue(intTag, o);
- }
- }
-
- private static Tuple2<Map<Integer, PCollectionView<?>>, DataStream<RawUnionValue>>
- transformSideInputs(
- Collection<PCollectionView<?>> sideInputs,
- FlinkStreamingTranslationContext context) {
-
- // collect all side inputs
- Map<TupleTag<?>, Integer> tagToIntMapping = new HashMap<>();
- Map<Integer, PCollectionView<?>> intToViewMapping = new HashMap<>();
- int count = 0;
- for (PCollectionView<?> sideInput: sideInputs) {
- TupleTag<?> tag = sideInput.getTagInternal();
- intToViewMapping.put(count, sideInput);
- tagToIntMapping.put(tag, count);
- count++;
- Coder<Iterable<WindowedValue<?>>> coder = sideInput.getCoderInternal();
- }
-
-
- List<Coder<?>> inputCoders = new ArrayList<>();
- for (PCollectionView<?> sideInput: sideInputs) {
- DataStream<Object> sideInputStream = context.getInputDataStream(sideInput);
- TypeInformation<Object> tpe = sideInputStream.getType();
- if (!(tpe instanceof CoderTypeInformation)) {
- throw new IllegalStateException(
- "Input Stream TypeInformation is no CoderTypeInformation.");
- }
-
- Coder<?> coder = ((CoderTypeInformation) tpe).getCoder();
- inputCoders.add(coder);
- }
-
- UnionCoder unionCoder = UnionCoder.of(inputCoders);
-
- CoderTypeInformation<RawUnionValue> unionTypeInformation =
- new CoderTypeInformation<>(unionCoder);
-
- // transform each side input to RawUnionValue and union them
- DataStream<RawUnionValue> sideInputUnion = null;
-
- for (PCollectionView<?> sideInput: sideInputs) {
- TupleTag<?> tag = sideInput.getTagInternal();
- final int intTag = tagToIntMapping.get(tag);
- DataStream<Object> sideInputStream = context.getInputDataStream(sideInput);
- DataStream<RawUnionValue> unionValueStream =
- sideInputStream.map(new ToRawUnion<>(intTag)).returns(unionTypeInformation);
-
- if (sideInputUnion == null) {
- sideInputUnion = unionValueStream;
- } else {
- sideInputUnion = sideInputUnion.union(unionValueStream);
- }
- }
-
- if (sideInputUnion == null) {
- throw new IllegalStateException("No unioned side inputs, this indicates a bug.");
- }
-
- return new Tuple2<>(intToViewMapping, sideInputUnion);
- }
-
- /**
- * Helper for translating {@link ParDo.MultiOutput} and {@link SplittableParDo.ProcessElements}.
- */
- static class ParDoTranslationHelper {
-
- interface DoFnOperatorFactory<InputT, OutputT> {
- DoFnOperator<InputT, OutputT, RawUnionValue> createDoFnOperator(
- DoFn<InputT, OutputT> doFn,
- List<PCollectionView<?>> sideInputs,
- TupleTag<OutputT> mainOutputTag,
- List<TupleTag<?>> additionalOutputTags,
- FlinkStreamingTranslationContext context,
- WindowingStrategy<?, ?> windowingStrategy,
- Map<TupleTag<?>, Integer> tagsToLabels,
- Coder<WindowedValue<InputT>> inputCoder,
- Coder keyCoder,
- Map<Integer, PCollectionView<?>> transformedSideInputs);
- }
-
- static <InputT, OutputT> void translateParDo(
- String transformName,
- DoFn<InputT, OutputT> doFn,
- PCollection<InputT> input,
- List<PCollectionView<?>> sideInputs,
- Map<TupleTag<?>, PValue> outputs,
- TupleTag<OutputT> mainOutputTag,
- List<TupleTag<?>> additionalOutputTags,
- FlinkStreamingTranslationContext context,
- DoFnOperatorFactory<InputT, OutputT> doFnOperatorFactory) {
-
- // we assume that the transformation does not change the windowing strategy.
- WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
-
- Map<TupleTag<?>, Integer> tagsToLabels =
- transformTupleTagsToLabels(mainOutputTag, outputs);
-
- SingleOutputStreamOperator<RawUnionValue> unionOutputStream;
-
- Coder<WindowedValue<InputT>> inputCoder = context.getCoder(input);
-
- DataStream<WindowedValue<InputT>> inputDataStream = context.getInputDataStream(input);
-
- Coder keyCoder = null;
- boolean stateful = false;
- DoFnSignature signature = DoFnSignatures.getSignature(doFn.getClass());
- if (signature.stateDeclarations().size() > 0
- || signature.timerDeclarations().size() > 0) {
- // Based on the fact that the signature is stateful, DoFnSignatures ensures
- // that it is also keyed
- keyCoder = ((KvCoder) input.getCoder()).getKeyCoder();
- inputDataStream = inputDataStream.keyBy(new KvToByteBufferKeySelector(keyCoder));
- stateful = true;
- } else if (doFn instanceof SplittableParDo.ProcessFn) {
- // we know that it is keyed on String
- keyCoder = StringUtf8Coder.of();
- stateful = true;
- }
-
- if (sideInputs.isEmpty()) {
- DoFnOperator<InputT, OutputT, RawUnionValue> doFnOperator =
- doFnOperatorFactory.createDoFnOperator(
- doFn,
- sideInputs,
- mainOutputTag,
- additionalOutputTags,
- context,
- windowingStrategy,
- tagsToLabels,
- inputCoder,
- keyCoder,
- new HashMap<Integer, PCollectionView<?>>() /* side-input mapping */);
-
- UnionCoder outputUnionCoder = createUnionCoder(outputs);
-
- CoderTypeInformation<RawUnionValue> outputUnionTypeInformation =
- new CoderTypeInformation<>(outputUnionCoder);
-
- unionOutputStream = inputDataStream
- .transform(transformName, outputUnionTypeInformation, doFnOperator);
-
- } else {
- Tuple2<Map<Integer, PCollectionView<?>>, DataStream<RawUnionValue>> transformedSideInputs =
- transformSideInputs(sideInputs, context);
-
- DoFnOperator<InputT, OutputT, RawUnionValue> doFnOperator =
- doFnOperatorFactory.createDoFnOperator(
- doFn,
- sideInputs,
- mainOutputTag,
- additionalOutputTags,
- context,
- windowingStrategy,
- tagsToLabels,
- inputCoder,
- keyCoder,
- transformedSideInputs.f0);
-
- UnionCoder outputUnionCoder = createUnionCoder(outputs);
-
- CoderTypeInformation<RawUnionValue> outputUnionTypeInformation =
- new CoderTypeInformation<>(outputUnionCoder);
-
- if (stateful) {
- // we have to manually construct the two-input transform because, normally,
- // we're not allowed to have only one of the two inputs keyed.
- KeyedStream keyedStream = (KeyedStream<?, InputT>) inputDataStream;
- TwoInputTransformation<
- WindowedValue<KV<?, InputT>>,
- RawUnionValue,
- WindowedValue<OutputT>> rawFlinkTransform = new TwoInputTransformation(
- keyedStream.getTransformation(),
- transformedSideInputs.f1.broadcast().getTransformation(),
- transformName,
- (TwoInputStreamOperator) doFnOperator,
- outputUnionTypeInformation,
- keyedStream.getParallelism());
-
- rawFlinkTransform.setStateKeyType(keyedStream.getKeyType());
- rawFlinkTransform.setStateKeySelectors(keyedStream.getKeySelector(), null);
-
- unionOutputStream = new SingleOutputStreamOperator(
- keyedStream.getExecutionEnvironment(),
- rawFlinkTransform) {}; // we have to cheat around the ctor being protected
-
- keyedStream.getExecutionEnvironment().addOperator(rawFlinkTransform);
-
- } else {
- unionOutputStream = inputDataStream
- .connect(transformedSideInputs.f1.broadcast())
- .transform(transformName, outputUnionTypeInformation, doFnOperator);
- }
- }
-
- SplitStream<RawUnionValue> splitStream = unionOutputStream
- .split(new OutputSelector<RawUnionValue>() {
- @Override
- public Iterable<String> select(RawUnionValue value) {
- return Collections.singletonList(Integer.toString(value.getUnionTag()));
- }
- });
-
- for (Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
- final int outputTag = tagsToLabels.get(output.getKey());
-
- TypeInformation outputTypeInfo = context.getTypeInfo((PCollection<?>) output.getValue());
-
- @SuppressWarnings("unchecked")
- DataStream unwrapped = splitStream.select(String.valueOf(outputTag))
- .flatMap(new FlatMapFunction<RawUnionValue, Object>() {
- @Override
- public void flatMap(RawUnionValue value, Collector<Object> out) throws Exception {
- out.collect(value.getValue());
- }
- }).returns(outputTypeInfo);
-
- context.setOutputDataStream(output.getValue(), unwrapped);
- }
- }
-
- private static Map<TupleTag<?>, Integer> transformTupleTagsToLabels(
- TupleTag<?> mainTag,
- Map<TupleTag<?>, PValue> allTaggedValues) {
-
- Map<TupleTag<?>, Integer> tagToLabelMap = Maps.newHashMap();
- int count = 0;
- tagToLabelMap.put(mainTag, count++);
- for (TupleTag<?> key : allTaggedValues.keySet()) {
- if (!tagToLabelMap.containsKey(key)) {
- tagToLabelMap.put(key, count++);
- }
- }
- return tagToLabelMap;
- }
-
- private static UnionCoder createUnionCoder(Map<TupleTag<?>, PValue> taggedCollections) {
- List<Coder<?>> outputCoders = Lists.newArrayList();
- for (PValue taggedColl : taggedCollections.values()) {
- checkArgument(
- taggedColl instanceof PCollection,
- "A Union Coder can only be created for a Collection of Tagged %s. Got %s",
- PCollection.class.getSimpleName(),
- taggedColl.getClass().getSimpleName());
- PCollection<?> coll = (PCollection<?>) taggedColl;
- WindowedValue.FullWindowedValueCoder<?> windowedValueCoder =
- WindowedValue.getFullCoder(
- coll.getCoder(),
- coll.getWindowingStrategy().getWindowFn().windowCoder());
- outputCoders.add(windowedValueCoder);
- }
- return UnionCoder.of(outputCoders);
- }
- }
-
- private static class ParDoStreamingTranslator<InputT, OutputT>
- extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<
- ParDo.MultiOutput<InputT, OutputT>> {
-
- @Override
- public void translateNode(
- ParDo.MultiOutput<InputT, OutputT> transform,
- FlinkStreamingTranslationContext context) {
-
- ParDoTranslationHelper.translateParDo(
- transform.getName(),
- transform.getFn(),
- (PCollection<InputT>) context.getInput(transform),
- transform.getSideInputs(),
- context.getOutputs(transform),
- transform.getMainOutputTag(),
- transform.getAdditionalOutputTags().getAll(),
- context,
- new ParDoTranslationHelper.DoFnOperatorFactory<InputT, OutputT>() {
- @Override
- public DoFnOperator<InputT, OutputT, RawUnionValue> createDoFnOperator(
- DoFn<InputT, OutputT> doFn,
- List<PCollectionView<?>> sideInputs,
- TupleTag<OutputT> mainOutputTag,
- List<TupleTag<?>> additionalOutputTags,
- FlinkStreamingTranslationContext context,
- WindowingStrategy<?, ?> windowingStrategy,
- Map<TupleTag<?>, Integer> tagsToLabels,
- Coder<WindowedValue<InputT>> inputCoder,
- Coder keyCoder,
- Map<Integer, PCollectionView<?>> transformedSideInputs) {
- return new DoFnOperator<>(
- doFn,
- inputCoder,
- mainOutputTag,
- additionalOutputTags,
- new DoFnOperator.MultiOutputOutputManagerFactory(tagsToLabels),
- windowingStrategy,
- transformedSideInputs,
- sideInputs,
- context.getPipelineOptions(),
- keyCoder);
- }
- });
- }
- }
-
- private static class SplittableProcessElementsStreamingTranslator<
- InputT, OutputT, RestrictionT, TrackerT extends RestrictionTracker<RestrictionT>>
- extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<
- SplittableParDo.ProcessElements<InputT, OutputT, RestrictionT, TrackerT>> {
-
- @Override
- public void translateNode(
- SplittableParDo.ProcessElements<InputT, OutputT, RestrictionT, TrackerT> transform,
- FlinkStreamingTranslationContext context) {
-
- ParDoTranslationHelper.translateParDo(
- transform.getName(),
- transform.newProcessFn(transform.getFn()),
- (PCollection<KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>>)
- context.getInput(transform),
- transform.getSideInputs(),
- context.getOutputs(transform),
- transform.getMainOutputTag(),
- transform.getAdditionalOutputTags().getAll(),
- context,
- new ParDoTranslationHelper.DoFnOperatorFactory<
- KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>, OutputT>() {
- @Override
- public DoFnOperator<
- KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>,
- OutputT,
- RawUnionValue> createDoFnOperator(
- DoFn<
- KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>,
- OutputT> doFn,
- List<PCollectionView<?>> sideInputs,
- TupleTag<OutputT> mainOutputTag,
- List<TupleTag<?>> additionalOutputTags,
- FlinkStreamingTranslationContext context,
- WindowingStrategy<?, ?> windowingStrategy,
- Map<TupleTag<?>, Integer> tagsToLabels,
- Coder<
- WindowedValue<
- KeyedWorkItem<
- String,
- ElementAndRestriction<InputT, RestrictionT>>>> inputCoder,
- Coder keyCoder,
- Map<Integer, PCollectionView<?>> transformedSideInputs) {
- return new SplittableDoFnOperator<>(
- doFn,
- inputCoder,
- mainOutputTag,
- additionalOutputTags,
- new DoFnOperator.MultiOutputOutputManagerFactory(tagsToLabels),
- windowingStrategy,
- transformedSideInputs,
- sideInputs,
- context.getPipelineOptions(),
- keyCoder);
- }
- });
- }
- }
-
- private static class CreateViewStreamingTranslator<ElemT, ViewT>
- extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<
- FlinkStreamingViewOverrides.CreateFlinkPCollectionView<ElemT, ViewT>> {
-
- @Override
- public void translateNode(
- FlinkStreamingViewOverrides.CreateFlinkPCollectionView<ElemT, ViewT> transform,
- FlinkStreamingTranslationContext context) {
- // just forward
- DataStream<WindowedValue<List<ElemT>>> inputDataSet =
- context.getInputDataStream(context.getInput(transform));
-
- PCollectionView<ViewT> view = context.getOutput(transform);
-
- context.setOutputDataStream(view, inputDataSet);
- }
- }
-
- private static class WindowAssignTranslator<T>
- extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<Window.Assign<T>> {
-
- @Override
- public void translateNode(
- Window.Assign<T> transform,
- FlinkStreamingTranslationContext context) {
-
- @SuppressWarnings("unchecked")
- WindowingStrategy<T, BoundedWindow> windowingStrategy =
- (WindowingStrategy<T, BoundedWindow>)
- context.getOutput(transform).getWindowingStrategy();
-
- TypeInformation<WindowedValue<T>> typeInfo =
- context.getTypeInfo(context.getOutput(transform));
-
- DataStream<WindowedValue<T>> inputDataStream =
- context.getInputDataStream(context.getInput(transform));
-
- WindowFn<T, ? extends BoundedWindow> windowFn = windowingStrategy.getWindowFn();
-
- FlinkAssignWindows<T, ? extends BoundedWindow> assignWindowsFunction =
- new FlinkAssignWindows<>(windowFn);
-
- SingleOutputStreamOperator<WindowedValue<T>> outputDataStream = inputDataStream
- .flatMap(assignWindowsFunction)
- .name(context.getOutput(transform).getName())
- .returns(typeInfo);
-
- context.setOutputDataStream(context.getOutput(transform), outputDataStream);
- }
- }
-
- private static class ReshuffleTranslatorStreaming<K, InputT>
- extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<Reshuffle<K, InputT>> {
-
- @Override
- public void translateNode(
- Reshuffle<K, InputT> transform,
- FlinkStreamingTranslationContext context) {
-
- DataStream<WindowedValue<KV<K, InputT>>> inputDataSet =
- context.getInputDataStream(context.getInput(transform));
-
- context.setOutputDataStream(context.getOutput(transform), inputDataSet.rebalance());
-
- }
- }
-
-
- private static class GroupByKeyTranslator<K, InputT>
- extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<GroupByKey<K, InputT>> {
-
- @Override
- public void translateNode(
- GroupByKey<K, InputT> transform,
- FlinkStreamingTranslationContext context) {
-
- PCollection<KV<K, InputT>> input = context.getInput(transform);
-
- @SuppressWarnings("unchecked")
- WindowingStrategy<?, BoundedWindow> windowingStrategy =
- (WindowingStrategy<?, BoundedWindow>) input.getWindowingStrategy();
-
- KvCoder<K, InputT> inputKvCoder = (KvCoder<K, InputT>) input.getCoder();
-
- SingletonKeyedWorkItemCoder<K, InputT> workItemCoder = SingletonKeyedWorkItemCoder.of(
- inputKvCoder.getKeyCoder(),
- inputKvCoder.getValueCoder(),
- input.getWindowingStrategy().getWindowFn().windowCoder());
-
- DataStream<WindowedValue<KV<K, InputT>>> inputDataStream = context.getInputDataStream(input);
-
- WindowedValue.
- FullWindowedValueCoder<SingletonKeyedWorkItem<K, InputT>> windowedWorkItemCoder =
- WindowedValue.getFullCoder(
- workItemCoder,
- input.getWindowingStrategy().getWindowFn().windowCoder());
-
- CoderTypeInformation<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> workItemTypeInfo =
- new CoderTypeInformation<>(windowedWorkItemCoder);
-
- DataStream<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> workItemStream =
- inputDataStream
- .flatMap(new ToKeyedWorkItem<K, InputT>())
- .returns(workItemTypeInfo).name("ToKeyedWorkItem");
-
- KeyedStream<
- WindowedValue<
- SingletonKeyedWorkItem<K, InputT>>, ByteBuffer> keyedWorkItemStream = workItemStream
- .keyBy(new WorkItemKeySelector<K, InputT>(inputKvCoder.getKeyCoder()));
-
- SystemReduceFn<K, InputT, Iterable<InputT>, Iterable<InputT>, BoundedWindow> reduceFn =
- SystemReduceFn.buffering(inputKvCoder.getValueCoder());
-
- TypeInformation<WindowedValue<KV<K, Iterable<InputT>>>> outputTypeInfo =
- context.getTypeInfo(context.getOutput(transform));
-
- DoFnOperator.DefaultOutputManagerFactory<
- WindowedValue<KV<K, Iterable<InputT>>>> outputManagerFactory =
- new DoFnOperator.DefaultOutputManagerFactory<>();
-
- WindowDoFnOperator<K, InputT, Iterable<InputT>> doFnOperator =
- new WindowDoFnOperator<>(
- reduceFn,
- (Coder) windowedWorkItemCoder,
- new TupleTag<KV<K, Iterable<InputT>>>("main output"),
- Collections.<TupleTag<?>>emptyList(),
- outputManagerFactory,
- windowingStrategy,
- new HashMap<Integer, PCollectionView<?>>(), /* side-input mapping */
- Collections.<PCollectionView<?>>emptyList(), /* side inputs */
- context.getPipelineOptions(),
- inputKvCoder.getKeyCoder());
-
- // our operator expects WindowedValue<KeyedWorkItem> while our input stream
- // is WindowedValue<SingletonKeyedWorkItem>, which is fine but Java doesn't like it ...
- @SuppressWarnings("unchecked")
- SingleOutputStreamOperator<WindowedValue<KV<K, Iterable<InputT>>>> outDataStream =
- keyedWorkItemStream
- .transform(
- transform.getName(),
- outputTypeInfo,
- (OneInputStreamOperator) doFnOperator);
-
- context.setOutputDataStream(context.getOutput(transform), outDataStream);
-
- }
- }
-
- private static class CombinePerKeyTranslator<K, InputT, OutputT>
- extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<
- Combine.PerKey<K, InputT, OutputT>> {
-
- @Override
- boolean canTranslate(
- Combine.PerKey<K, InputT, OutputT> transform,
- FlinkStreamingTranslationContext context) {
-
- // if we have a merging window strategy and side inputs we cannot
- // translate as a proper combine. We have to group and then run the combine
- // over the final grouped values.
- PCollection<KV<K, InputT>> input = context.getInput(transform);
-
- @SuppressWarnings("unchecked")
- WindowingStrategy<?, BoundedWindow> windowingStrategy =
- (WindowingStrategy<?, BoundedWindow>) input.getWindowingStrategy();
-
- return windowingStrategy.getWindowFn().isNonMerging() || transform.getSideInputs().isEmpty();
- }
-
- @Override
- public void translateNode(
- Combine.PerKey<K, InputT, OutputT> transform,
- FlinkStreamingTranslationContext context) {
-
- PCollection<KV<K, InputT>> input = context.getInput(transform);
-
- @SuppressWarnings("unchecked")
- WindowingStrategy<?, BoundedWindow> windowingStrategy =
- (WindowingStrategy<?, BoundedWindow>) input.getWindowingStrategy();
-
- KvCoder<K, InputT> inputKvCoder = (KvCoder<K, InputT>) input.getCoder();
-
- SingletonKeyedWorkItemCoder<K, InputT> workItemCoder = SingletonKeyedWorkItemCoder.of(
- inputKvCoder.getKeyCoder(),
- inputKvCoder.getValueCoder(),
- input.getWindowingStrategy().getWindowFn().windowCoder());
-
- DataStream<WindowedValue<KV<K, InputT>>> inputDataStream = context.getInputDataStream(input);
-
- WindowedValue.
- FullWindowedValueCoder<SingletonKeyedWorkItem<K, InputT>> windowedWorkItemCoder =
- WindowedValue.getFullCoder(
- workItemCoder,
- input.getWindowingStrategy().getWindowFn().windowCoder());
-
- CoderTypeInformation<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> workItemTypeInfo =
- new CoderTypeInformation<>(windowedWorkItemCoder);
-
- DataStream<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> workItemStream =
- inputDataStream
- .flatMap(new ToKeyedWorkItem<K, InputT>())
- .returns(workItemTypeInfo).name("ToKeyedWorkItem");
-
- KeyedStream<
- WindowedValue<
- SingletonKeyedWorkItem<K, InputT>>, ByteBuffer> keyedWorkItemStream = workItemStream
- .keyBy(new WorkItemKeySelector<K, InputT>(inputKvCoder.getKeyCoder()));
-
- SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow> reduceFn = SystemReduceFn.combining(
- inputKvCoder.getKeyCoder(),
- AppliedCombineFn.withInputCoder(
- transform.getFn(), input.getPipeline().getCoderRegistry(), inputKvCoder));
-
- TypeInformation<WindowedValue<KV<K, OutputT>>> outputTypeInfo =
- context.getTypeInfo(context.getOutput(transform));
-
- List<PCollectionView<?>> sideInputs = transform.getSideInputs();
-
- if (sideInputs.isEmpty()) {
-
- WindowDoFnOperator<K, InputT, OutputT> doFnOperator =
- new WindowDoFnOperator<>(
- reduceFn,
- (Coder) windowedWorkItemCoder,
- new TupleTag<KV<K, OutputT>>("main output"),
- Collections.<TupleTag<?>>emptyList(),
- new DoFnOperator.DefaultOutputManagerFactory<WindowedValue<KV<K, OutputT>>>(),
- windowingStrategy,
- new HashMap<Integer, PCollectionView<?>>(), /* side-input mapping */
- Collections.<PCollectionView<?>>emptyList(), /* side inputs */
- context.getPipelineOptions(),
- inputKvCoder.getKeyCoder());
-
- // our operator expects WindowedValue<KeyedWorkItem> while our input stream
- // is WindowedValue<SingletonKeyedWorkItem>, which is fine but Java doesn't like it ...
- @SuppressWarnings("unchecked")
- SingleOutputStreamOperator<WindowedValue<KV<K, OutputT>>> outDataStream =
- keyedWorkItemStream.transform(
- transform.getName(), outputTypeInfo, (OneInputStreamOperator) doFnOperator);
-
- context.setOutputDataStream(context.getOutput(transform), outDataStream);
- } else {
- Tuple2<Map<Integer, PCollectionView<?>>, DataStream<RawUnionValue>> transformSideInputs =
- transformSideInputs(sideInputs, context);
-
- WindowDoFnOperator<K, InputT, OutputT> doFnOperator =
- new WindowDoFnOperator<>(
- reduceFn,
- (Coder) windowedWorkItemCoder,
- new TupleTag<KV<K, OutputT>>("main output"),
- Collections.<TupleTag<?>>emptyList(),
- new DoFnOperator.DefaultOutputManagerFactory<WindowedValue<KV<K, OutputT>>>(),
- windowingStrategy,
- transformSideInputs.f0,
- sideInputs,
- context.getPipelineOptions(),
- inputKvCoder.getKeyCoder());
-
- // we have to manually construct the two-input transform because, normally,
- // we're not allowed to have only one of the two inputs keyed.
-
- TwoInputTransformation<
- WindowedValue<SingletonKeyedWorkItem<K, InputT>>,
- RawUnionValue,
- WindowedValue<KV<K, OutputT>>> rawFlinkTransform = new TwoInputTransformation<>(
- keyedWorkItemStream.getTransformation(),
- transformSideInputs.f1.broadcast().getTransformation(),
- transform.getName(),
- (TwoInputStreamOperator) doFnOperator,
- outputTypeInfo,
- keyedWorkItemStream.getParallelism());
-
- rawFlinkTransform.setStateKeyType(keyedWorkItemStream.getKeyType());
- rawFlinkTransform.setStateKeySelectors(keyedWorkItemStream.getKeySelector(), null);
-
- @SuppressWarnings({ "unchecked", "rawtypes" })
- SingleOutputStreamOperator<WindowedValue<KV<K, OutputT>>> outDataStream =
- new SingleOutputStreamOperator(
- keyedWorkItemStream.getExecutionEnvironment(),
- rawFlinkTransform) {}; // we have to cheat around the ctor being protected
-
- keyedWorkItemStream.getExecutionEnvironment().addOperator(rawFlinkTransform);
-
- context.setOutputDataStream(context.getOutput(transform), outDataStream);
- }
- }
- }
-
- private static class GBKIntoKeyedWorkItemsTranslator<K, InputT>
- extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<
- SplittableParDo.GBKIntoKeyedWorkItems<K, InputT>> {
-
- @Override
- boolean canTranslate(
- SplittableParDo.GBKIntoKeyedWorkItems<K, InputT> transform,
- FlinkStreamingTranslationContext context) {
- return true;
- }
-
- @Override
- public void translateNode(
- SplittableParDo.GBKIntoKeyedWorkItems<K, InputT> transform,
- FlinkStreamingTranslationContext context) {
-
- PCollection<KV<K, InputT>> input = context.getInput(transform);
-
- KvCoder<K, InputT> inputKvCoder = (KvCoder<K, InputT>) input.getCoder();
-
- SingletonKeyedWorkItemCoder<K, InputT> workItemCoder = SingletonKeyedWorkItemCoder.of(
- inputKvCoder.getKeyCoder(),
- inputKvCoder.getValueCoder(),
- input.getWindowingStrategy().getWindowFn().windowCoder());
-
-
- WindowedValue.
- FullWindowedValueCoder<SingletonKeyedWorkItem<K, InputT>> windowedWorkItemCoder =
- WindowedValue.getFullCoder(
- workItemCoder,
- input.getWindowingStrategy().getWindowFn().windowCoder());
-
- CoderTypeInformation<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> workItemTypeInfo =
- new CoderTypeInformation<>(windowedWorkItemCoder);
-
- DataStream<WindowedValue<KV<K, InputT>>> inputDataStream = context.getInputDataStream(input);
-
- DataStream<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> workItemStream =
- inputDataStream
- .flatMap(new ToKeyedWorkItem<K, InputT>())
- .returns(workItemTypeInfo).name("ToKeyedWorkItem");
-
- KeyedStream<
- WindowedValue<
- SingletonKeyedWorkItem<K, InputT>>, ByteBuffer> keyedWorkItemStream = workItemStream
- .keyBy(new WorkItemKeySelector<K, InputT>(inputKvCoder.getKeyCoder()));
-
- context.setOutputDataStream(context.getOutput(transform), keyedWorkItemStream);
- }
- }
-
- private static class FlattenPCollectionTranslator<T>
- extends FlinkStreamingPipelineTranslator.StreamTransformTranslator<
- Flatten.PCollections<T>> {
-
- @Override
- public void translateNode(
- Flatten.PCollections<T> transform,
- FlinkStreamingTranslationContext context) {
- Map<TupleTag<?>, PValue> allInputs = context.getInputs(transform);
-
- if (allInputs.isEmpty()) {
-
- // create an empty dummy source to satisfy downstream operations;
- // we cannot create an empty source in Flink, therefore we have to
- // add a flatMap that simply never forwards the single element
- DataStreamSource<String> dummySource =
- context.getExecutionEnvironment().fromElements("dummy");
-
- DataStream<WindowedValue<T>> result = dummySource.flatMap(
- new FlatMapFunction<String, WindowedValue<T>>() {
- @Override
- public void flatMap(
- String s,
- Collector<WindowedValue<T>> collector) throws Exception {
- // never return anything
- }
- }).returns(
- new CoderTypeInformation<>(
- WindowedValue.getFullCoder(
- (Coder<T>) VoidCoder.of(),
- GlobalWindow.Coder.INSTANCE)));
- context.setOutputDataStream(context.getOutput(transform), result);
-
- } else {
- DataStream<T> result = null;
- for (PValue input : allInputs.values()) {
- DataStream<T> current = context.getInputDataStream(input);
- result = (result == null) ? current : result.union(current);
- }
- context.setOutputDataStream(context.getOutput(transform), result);
- }
- }
- }
-
- private static class ToKeyedWorkItem<K, InputT>
- extends RichFlatMapFunction<
- WindowedValue<KV<K, InputT>>,
- WindowedValue<SingletonKeyedWorkItem<K, InputT>>> {
-
- @Override
- public void flatMap(
- WindowedValue<KV<K, InputT>> inWithMultipleWindows,
- Collector<WindowedValue<SingletonKeyedWorkItem<K, InputT>>> out) throws Exception {
-
- // for now we need to emit one work item per window,
- // since otherwise the PushbackSideInputRunner will not correctly
- // determine whether side inputs are ready
- //
- // this is tracked as https://issues.apache.org/jira/browse/BEAM-1850
- for (WindowedValue<KV<K, InputT>> in : inWithMultipleWindows.explodeWindows()) {
- SingletonKeyedWorkItem<K, InputT> workItem =
- new SingletonKeyedWorkItem<>(
- in.getValue().getKey(),
- in.withValue(in.getValue().getValue()));
-
- out.collect(in.withValue(workItem));
- }
- }
- }
-
-}
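To make the per-window wrapping in ToKeyedWorkItem (end of the file above) concrete, here is a small illustration of WindowedValue.explodeWindows(); the element and the two windows are made-up values, not taken from the commit.

// Illustration only: explodeWindows() turns one value assigned to N windows into
// N single-window copies, which is what ToKeyedWorkItem relies on so that the
// pushback runner can check side-input readiness per window.
Collection<IntervalWindow> windows = ImmutableList.of(
    new IntervalWindow(new Instant(0), new Instant(10)),     // hypothetical windows
    new IntervalWindow(new Instant(10), new Instant(20)));
WindowedValue<KV<String, Integer>> multi =
    WindowedValue.of(KV.of("key", 1), new Instant(5), windows, PaneInfo.NO_FIRING);
for (WindowedValue<KV<String, Integer>> single : multi.explodeWindows()) {
  System.out.println(single.getWindows());                   // exactly one window each
}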
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTranslationContext.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTranslationContext.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTranslationContext.java
deleted file mode 100644
index 1a943a3..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTranslationContext.java
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.google.common.collect.Iterables;
-import java.util.HashMap;
-import java.util.Map;
-import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.transforms.AppliedPTransform;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.beam.sdk.values.PInput;
-import org.apache.beam.sdk.values.POutput;
-import org.apache.beam.sdk.values.PValue;
-import org.apache.beam.sdk.values.TupleTag;
-import org.apache.flink.api.common.typeinfo.TypeInformation;
-import org.apache.flink.streaming.api.datastream.DataStream;
-import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
-
-/**
- * Helper for keeping track of which {@link DataStream DataStreams} map
- * to which {@link PTransform PTransforms}.
- */
-class FlinkStreamingTranslationContext {
-
- private final StreamExecutionEnvironment env;
- private final PipelineOptions options;
-
- /**
- * Keeps a mapping between the output value of the PTransform (in Beam) and the
- * Flink Operator that produced it, after the translation of the corresponding PTransform
- * to its Flink equivalent.
- */
- private final Map<PValue, DataStream<?>> dataStreams;
-
- private AppliedPTransform<?, ?, ?> currentTransform;
-
- public FlinkStreamingTranslationContext(StreamExecutionEnvironment env, PipelineOptions options) {
- this.env = checkNotNull(env);
- this.options = checkNotNull(options);
- this.dataStreams = new HashMap<>();
- }
-
- public StreamExecutionEnvironment getExecutionEnvironment() {
- return env;
- }
-
- public PipelineOptions getPipelineOptions() {
- return options;
- }
-
- @SuppressWarnings("unchecked")
- public <T> DataStream<T> getInputDataStream(PValue value) {
- return (DataStream<T>) dataStreams.get(value);
- }
-
- public void setOutputDataStream(PValue value, DataStream<?> set) {
- if (!dataStreams.containsKey(value)) {
- dataStreams.put(value, set);
- }
- }
-
- /**
- * Sets the AppliedPTransform which carries input/output.
- * @param currentTransform the {@link AppliedPTransform} currently being translated
- */
- public void setCurrentTransform(AppliedPTransform<?, ?, ?> currentTransform) {
- this.currentTransform = currentTransform;
- }
-
- public <T> Coder<WindowedValue<T>> getCoder(PCollection<T> collection) {
- Coder<T> valueCoder = collection.getCoder();
-
- return WindowedValue.getFullCoder(
- valueCoder,
- collection.getWindowingStrategy().getWindowFn().windowCoder());
- }
-
- @SuppressWarnings("unchecked")
- public <T> TypeInformation<WindowedValue<T>> getTypeInfo(PCollection<T> collection) {
- Coder<T> valueCoder = collection.getCoder();
- WindowedValue.FullWindowedValueCoder<T> windowedValueCoder =
- WindowedValue.getFullCoder(
- valueCoder,
- collection.getWindowingStrategy().getWindowFn().windowCoder());
-
- return new CoderTypeInformation<>(windowedValueCoder);
- }
-
-
- @SuppressWarnings("unchecked")
- public <T extends PValue> T getInput(PTransform<T, ?> transform) {
- return (T) Iterables.getOnlyElement(currentTransform.getInputs().values());
- }
-
- public <T extends PInput> Map<TupleTag<?>, PValue> getInputs(PTransform<T, ?> transform) {
- return currentTransform.getInputs();
- }
-
- @SuppressWarnings("unchecked")
- public <T extends PValue> T getOutput(PTransform<?, T> transform) {
- return (T) Iterables.getOnlyElement(currentTransform.getOutputs().values());
- }
-
- public <OutputT extends POutput> Map<TupleTag<?>, PValue> getOutputs(
- PTransform<?, OutputT> transform) {
- return currentTransform.getOutputs();
- }
-
-}
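As a hedged illustration of how the context above is driven, the following sketch shows the usual call order inside a translator once the pipeline visitor has bound the current transform; 'node' and 'myTransform' (a hypothetical PTransform<PCollection<String>, PCollection<String>>) are assumptions.

// Sketch only; the visitor binds the current transform, then the translator
// resolves its input stream and registers the translated output stream.
context.setCurrentTransform(node.toAppliedPTransform());    // done by the translator driver
PCollection<String> input = context.getInput(myTransform);  // resolved from the current transform
DataStream<WindowedValue<String>> stream = context.getInputDataStream(input);
TypeInformation<WindowedValue<String>> typeInfo = context.getTypeInfo(input);
DataStream<WindowedValue<String>> translated =
    stream.map(value -> value).returns(typeInfo);            // stand-in for the real translation work
context.setOutputDataStream(context.getOutput(myTransform), translated);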
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkStreamingViewOverrides.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkStreamingViewOverrides.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkStreamingViewOverrides.java
deleted file mode 100644
index f955f2a..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkStreamingViewOverrides.java
+++ /dev/null
@@ -1,372 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.coders.CoderRegistry;
-import org.apache.beam.sdk.coders.KvCoder;
-import org.apache.beam.sdk.coders.ListCoder;
-import org.apache.beam.sdk.transforms.Combine;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.transforms.View;
-import org.apache.beam.sdk.util.PCollectionViews;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.beam.sdk.values.PCollectionView;
-
-/**
- * Flink streaming overrides for various view (side input) transforms.
- */
-class FlinkStreamingViewOverrides {
-
- /**
- * Specialized implementation for
- * {@link org.apache.beam.sdk.transforms.View.AsMap View.AsMap}
- * for the Flink runner in streaming mode.
- */
- static class StreamingViewAsMap<K, V>
- extends PTransform<PCollection<KV<K, V>>, PCollectionView<Map<K, V>>> {
-
- private final transient FlinkRunner runner;
-
- @SuppressWarnings("unused") // used via reflection in FlinkRunner#apply()
- public StreamingViewAsMap(FlinkRunner runner, View.AsMap<K, V> transform) {
- this.runner = runner;
- }
-
- @Override
- public PCollectionView<Map<K, V>> expand(PCollection<KV<K, V>> input) {
- PCollectionView<Map<K, V>> view =
- PCollectionViews.mapView(
- input,
- input.getWindowingStrategy(),
- input.getCoder());
-
- @SuppressWarnings({"rawtypes", "unchecked"})
- KvCoder<K, V> inputCoder = (KvCoder) input.getCoder();
- try {
- inputCoder.getKeyCoder().verifyDeterministic();
- } catch (Coder.NonDeterministicException e) {
- runner.recordViewUsesNonDeterministicKeyCoder(this);
- }
-
- return input
- .apply(Combine.globally(new Concatenate<KV<K, V>>()).withoutDefaults())
- .apply(CreateFlinkPCollectionView.<KV<K, V>, Map<K, V>>of(view));
- }
-
- @Override
- protected String getKindString() {
- return "StreamingViewAsMap";
- }
- }
-
- /**
- * Specialized expansion for {@link
- * View.AsMultimap View.AsMultimap} for the
- * Flink runner in streaming mode.
- */
- static class StreamingViewAsMultimap<K, V>
- extends PTransform<PCollection<KV<K, V>>, PCollectionView<Map<K, Iterable<V>>>> {
-
- private final transient FlinkRunner runner;
-
- /**
- * Builds an instance of this class from the overridden transform.
- */
- @SuppressWarnings("unused") // used via reflection in FlinkRunner#apply()
- public StreamingViewAsMultimap(FlinkRunner runner, View.AsMultimap<K, V> transform) {
- this.runner = runner;
- }
-
- @Override
- public PCollectionView<Map<K, Iterable<V>>> expand(PCollection<KV<K, V>> input) {
- PCollectionView<Map<K, Iterable<V>>> view =
- PCollectionViews.multimapView(
- input,
- input.getWindowingStrategy(),
- input.getCoder());
-
- @SuppressWarnings({"rawtypes", "unchecked"})
- KvCoder<K, V> inputCoder = (KvCoder) input.getCoder();
- try {
- inputCoder.getKeyCoder().verifyDeterministic();
- } catch (Coder.NonDeterministicException e) {
- runner.recordViewUsesNonDeterministicKeyCoder(this);
- }
-
- return input
- .apply(Combine.globally(new Concatenate<KV<K, V>>()).withoutDefaults())
- .apply(CreateFlinkPCollectionView.<KV<K, V>, Map<K, Iterable<V>>>of(view));
- }
-
- @Override
- protected String getKindString() {
- return "StreamingViewAsMultimap";
- }
- }
-
- /**
- * Specialized implementation for
- * {@link View.AsList View.AsList} for the
- * Flink runner in streaming mode.
- */
- static class StreamingViewAsList<T>
- extends PTransform<PCollection<T>, PCollectionView<List<T>>> {
- /**
- * Builds an instance of this class from the overridden transform.
- */
- @SuppressWarnings("unused") // used via reflection in FlinkRunner#apply()
- public StreamingViewAsList(FlinkRunner runner, View.AsList<T> transform) {}
-
- @Override
- public PCollectionView<List<T>> expand(PCollection<T> input) {
- PCollectionView<List<T>> view =
- PCollectionViews.listView(
- input,
- input.getWindowingStrategy(),
- input.getCoder());
-
- return input.apply(Combine.globally(new Concatenate<T>()).withoutDefaults())
- .apply(CreateFlinkPCollectionView.<T, List<T>>of(view));
- }
-
- @Override
- protected String getKindString() {
- return "StreamingViewAsList";
- }
- }
-
- /**
- * Specialized implementation for
- * {@link View.AsIterable View.AsIterable} for the
- * Flink runner in streaming mode.
- */
- static class StreamingViewAsIterable<T>
- extends PTransform<PCollection<T>, PCollectionView<Iterable<T>>> {
- /**
- * Builds an instance of this class from the overridden transform.
- */
- @SuppressWarnings("unused") // used via reflection in FlinkRunner#apply()
- public StreamingViewAsIterable(FlinkRunner runner, View.AsIterable<T> transform) { }
-
- @Override
- public PCollectionView<Iterable<T>> expand(PCollection<T> input) {
- PCollectionView<Iterable<T>> view =
- PCollectionViews.iterableView(
- input,
- input.getWindowingStrategy(),
- input.getCoder());
-
- return input.apply(Combine.globally(new Concatenate<T>()).withoutDefaults())
- .apply(CreateFlinkPCollectionView.<T, Iterable<T>>of(view));
- }
-
- @Override
- protected String getKindString() {
- return "StreamingViewAsIterable";
- }
- }
-
- /**
- * Specialized expansion for
- * {@link View.AsSingleton View.AsSingleton} for the
- * Flink runner in streaming mode.
- */
- static class StreamingViewAsSingleton<T>
- extends PTransform<PCollection<T>, PCollectionView<T>> {
- private View.AsSingleton<T> transform;
-
- /**
- * Builds an instance of this class from the overridden transform.
- */
- @SuppressWarnings("unused") // used via reflection in FlinkRunner#apply()
- public StreamingViewAsSingleton(FlinkRunner runner, View.AsSingleton<T> transform) {
- this.transform = transform;
- }
-
- @Override
- public PCollectionView<T> expand(PCollection<T> input) {
- Combine.Globally<T, T> combine = Combine.globally(
- new SingletonCombine<>(transform.hasDefaultValue(), transform.defaultValue()));
- if (!transform.hasDefaultValue()) {
- combine = combine.withoutDefaults();
- }
- return input.apply(combine.asSingletonView());
- }
-
- @Override
- protected String getKindString() {
- return "StreamingViewAsSingleton";
- }
-
- private static class SingletonCombine<T> extends Combine.BinaryCombineFn<T> {
- private boolean hasDefaultValue;
- private T defaultValue;
-
- SingletonCombine(boolean hasDefaultValue, T defaultValue) {
- this.hasDefaultValue = hasDefaultValue;
- this.defaultValue = defaultValue;
- }
-
- @Override
- public T apply(T left, T right) {
- throw new IllegalArgumentException("PCollection with more than one element "
- + "accessed as a singleton view. Consider using Combine.globally().asSingleton() to "
- + "combine the PCollection into a single value");
- }
-
- @Override
- public T identity() {
- if (hasDefaultValue) {
- return defaultValue;
- } else {
- throw new IllegalArgumentException(
- "Empty PCollection accessed as a singleton view. "
- + "Consider setting withDefault to provide a default value");
- }
- }
- }
- }
-
- static class StreamingCombineGloballyAsSingletonView<InputT, OutputT>
- extends PTransform<PCollection<InputT>, PCollectionView<OutputT>> {
- Combine.GloballyAsSingletonView<InputT, OutputT> transform;
-
- /**
- * Builds an instance of this class from the overridden transform.
- */
- @SuppressWarnings("unused") // used via reflection in FlinkRunner#apply()
- public StreamingCombineGloballyAsSingletonView(
- FlinkRunner runner,
- Combine.GloballyAsSingletonView<InputT, OutputT> transform) {
- this.transform = transform;
- }
-
- @Override
- public PCollectionView<OutputT> expand(PCollection<InputT> input) {
- PCollection<OutputT> combined =
- input.apply(Combine.globally(transform.getCombineFn())
- .withoutDefaults()
- .withFanout(transform.getFanout()));
-
- PCollectionView<OutputT> view = PCollectionViews.singletonView(
- combined,
- combined.getWindowingStrategy(),
- transform.getInsertDefault(),
- transform.getInsertDefault()
- ? transform.getCombineFn().defaultValue() : null,
- combined.getCoder());
- return combined
- .apply(ParDo.of(new WrapAsList<OutputT>()))
- .apply(CreateFlinkPCollectionView.<OutputT, OutputT>of(view));
- }
-
- @Override
- protected String getKindString() {
- return "StreamingCombineGloballyAsSingletonView";
- }
- }
-
- private static class WrapAsList<T> extends DoFn<T, List<T>> {
- @ProcessElement
- public void processElement(ProcessContext c) {
- c.output(Collections.singletonList(c.element()));
- }
- }
-
- /**
- * Combiner that combines {@code T}s into a single {@code List<T>} containing all inputs.
- *
- * <p>For internal use by {@link StreamingViewAsMap}, {@link StreamingViewAsMultimap},
- * {@link StreamingViewAsList}, {@link StreamingViewAsIterable}.
- * They require that the input {@link PCollection} fit in memory.
- * For a large {@link PCollection} this is expected to crash!
- *
- * @param <T> the type of elements to concatenate.
- */
- private static class Concatenate<T> extends Combine.CombineFn<T, List<T>, List<T>> {
- @Override
- public List<T> createAccumulator() {
- return new ArrayList<T>();
- }
-
- @Override
- public List<T> addInput(List<T> accumulator, T input) {
- accumulator.add(input);
- return accumulator;
- }
-
- @Override
- public List<T> mergeAccumulators(Iterable<List<T>> accumulators) {
- List<T> result = createAccumulator();
- for (List<T> accumulator : accumulators) {
- result.addAll(accumulator);
- }
- return result;
- }
-
- @Override
- public List<T> extractOutput(List<T> accumulator) {
- return accumulator;
- }
-
- @Override
- public Coder<List<T>> getAccumulatorCoder(CoderRegistry registry, Coder<T> inputCoder) {
- return ListCoder.of(inputCoder);
- }
-
- @Override
- public Coder<List<T>> getDefaultOutputCoder(CoderRegistry registry, Coder<T> inputCoder) {
- return ListCoder.of(inputCoder);
- }
- }
-
- /**
- * Creates a primitive {@link PCollectionView}.
- *
- * <p>For internal use only by runner implementors.
- *
- * @param <ElemT> The type of the elements of the input PCollection
- * @param <ViewT> The type associated with the {@link PCollectionView} used as a side input
- */
- public static class CreateFlinkPCollectionView<ElemT, ViewT>
- extends PTransform<PCollection<List<ElemT>>, PCollectionView<ViewT>> {
- private PCollectionView<ViewT> view;
-
- private CreateFlinkPCollectionView(PCollectionView<ViewT> view) {
- this.view = view;
- }
-
- public static <ElemT, ViewT> CreateFlinkPCollectionView<ElemT, ViewT> of(
- PCollectionView<ViewT> view) {
- return new CreateFlinkPCollectionView<>(view);
- }
-
- @Override
- public PCollectionView<ViewT> expand(PCollection<List<ElemT>> input) {
- return view;
- }
- }
-}
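A minimal usage sketch of the Concatenate pattern above (not part of this commit): the whole PCollection is folded into one in-memory List, which is what the streaming view overrides do before materializing a side input. It assumes a visible copy of the Concatenate CombineFn from the deleted class and the 2017-era Beam Java SDK.

import java.util.List;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;

public class ConcatenateSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
    PCollection<String> words = p.apply(Create.of("a", "b", "c"));
    // Fold every element into a single List<String>; as the javadoc above warns,
    // this only works when the whole PCollection fits in memory.
    PCollection<List<String>> all =
        words.apply(Combine.globally(new Concatenate<String>()).withoutDefaults());
    p.run();
  }
}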
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/PipelineTranslationOptimizer.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/PipelineTranslationOptimizer.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/PipelineTranslationOptimizer.java
deleted file mode 100644
index 3acc3ea..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/PipelineTranslationOptimizer.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import org.apache.beam.sdk.io.Read;
-import org.apache.beam.sdk.runners.TransformHierarchy;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.values.PValue;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Traverses the Pipeline to determine the {@link TranslationMode} for this pipeline.
- */
-class PipelineTranslationOptimizer extends FlinkPipelineTranslator {
-
- private static final Logger LOG = LoggerFactory.getLogger(PipelineTranslationOptimizer.class);
-
- private TranslationMode translationMode;
-
- private final FlinkPipelineOptions options;
-
- public PipelineTranslationOptimizer(TranslationMode defaultMode, FlinkPipelineOptions options) {
- this.translationMode = defaultMode;
- this.options = options;
- }
-
- public TranslationMode getTranslationMode() {
-
- // override user-specified translation mode
- if (options.isStreaming()) {
- return TranslationMode.STREAMING;
- }
-
- return translationMode;
- }
-
- @Override
- public CompositeBehavior enterCompositeTransform(TransformHierarchy.Node node) {
- return CompositeBehavior.ENTER_TRANSFORM;
- }
-
- @Override
- public void leaveCompositeTransform(TransformHierarchy.Node node) {}
-
- @Override
- public void visitPrimitiveTransform(TransformHierarchy.Node node) {
- Class<? extends PTransform> transformClass = node.getTransform().getClass();
- if (transformClass == Read.Unbounded.class) {
- LOG.info("Found {}. Switching to streaming execution.", transformClass);
- translationMode = TranslationMode.STREAMING;
- }
- }
-
- @Override
- public void visitValue(PValue value, TransformHierarchy.Node producer) {}
-}
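A standalone sketch of the same unbounded-read check (not part of this commit), written against the SDK's generic Pipeline.PipelineVisitor.Defaults instead of FlinkPipelineTranslator; the class and method names below are illustrative only.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.Read;
import org.apache.beam.sdk.runners.TransformHierarchy;

class UnboundedReadDetector extends Pipeline.PipelineVisitor.Defaults {
  private boolean sawUnboundedRead = false;

  @Override
  public void visitPrimitiveTransform(TransformHierarchy.Node node) {
    // Any unbounded read forces streaming execution, mirroring the optimizer above.
    if (node.getTransform() instanceof Read.Unbounded) {
      sawUnboundedRead = true;
    }
  }

  static boolean requiresStreaming(Pipeline pipeline) {
    UnboundedReadDetector detector = new UnboundedReadDetector();
    pipeline.traverseTopologically(detector);
    return detector.sawUnboundedRead;
  }
}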
[33/50] [abbrv] beam git commit: [BEAM-1994] Remove Flink examples package
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/ReadSourceITCase.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/ReadSourceITCase.java b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/ReadSourceITCase.java
deleted file mode 100644
index 44c9017..0000000
--- a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/ReadSourceITCase.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import com.google.common.base.Joiner;
-import java.io.File;
-import java.net.URI;
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.io.CountingInput;
-import org.apache.beam.sdk.io.TextIO;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.flink.test.util.JavaProgramTestBase;
-
-/**
- * Reads from a bounded source in batch execution.
- */
-public class ReadSourceITCase extends JavaProgramTestBase {
-
- protected String resultPath;
-
- public ReadSourceITCase(){
- }
-
- private static final String[] EXPECTED_RESULT = new String[] {
- "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"};
-
- @Override
- protected void preSubmit() throws Exception {
- resultPath = getTempDirPath("result");
-
- // need to create the dir, otherwise Beam sinks don't
- // work for these tests
-
- if (!new File(new URI(resultPath)).mkdirs()) {
- throw new RuntimeException("Could not create output dir.");
- }
- }
-
- @Override
- protected void postSubmit() throws Exception {
- compareResultsByLinesInMemory(Joiner.on('\n').join(EXPECTED_RESULT), resultPath);
- }
-
- @Override
- protected void testProgram() throws Exception {
- runProgram(resultPath);
- }
-
- private static void runProgram(String resultPath) throws Exception {
-
- Pipeline p = FlinkTestPipeline.createForBatch();
-
- PCollection<String> result = p
- .apply(CountingInput.upTo(10))
- .apply(ParDo.of(new DoFn<Long, String>() {
- @ProcessElement
- public void processElement(ProcessContext c) throws Exception {
- c.output(c.element().toString());
- }
- }));
-
- result.apply(TextIO.Write.to(new URI(resultPath).getPath() + "/part"));
-
- p.run();
- }
-}
-
-
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/ReadSourceStreamingITCase.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/ReadSourceStreamingITCase.java b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/ReadSourceStreamingITCase.java
deleted file mode 100644
index 79b7882..0000000
--- a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/ReadSourceStreamingITCase.java
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import com.google.common.base.Joiner;
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.io.CountingInput;
-import org.apache.beam.sdk.io.TextIO;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.flink.streaming.util.StreamingProgramTestBase;
-
-/**
- * Reads from a bounded source in streaming.
- */
-public class ReadSourceStreamingITCase extends StreamingProgramTestBase {
-
- protected String resultPath;
-
- public ReadSourceStreamingITCase(){
- }
-
- private static final String[] EXPECTED_RESULT = new String[] {
- "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"};
-
- @Override
- protected void preSubmit() throws Exception {
- resultPath = getTempDirPath("result");
- }
-
- @Override
- protected void postSubmit() throws Exception {
- compareResultsByLinesInMemory(Joiner.on('\n').join(EXPECTED_RESULT), resultPath);
- }
-
- @Override
- protected void testProgram() throws Exception {
- runProgram(resultPath);
- }
-
- private static void runProgram(String resultPath) {
-
- Pipeline p = FlinkTestPipeline.createForStreaming();
-
- p
- .apply(CountingInput.upTo(10))
- .apply(ParDo.of(new DoFn<Long, String>() {
- @ProcessElement
- public void processElement(ProcessContext c) throws Exception {
- c.output(c.element().toString());
- }
- }))
- .apply(TextIO.Write.to(resultPath));
-
- p.run();
- }
-}
-
-
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/WriteSinkITCase.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/WriteSinkITCase.java b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/WriteSinkITCase.java
deleted file mode 100644
index 38b790e..0000000
--- a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/WriteSinkITCase.java
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.beam.runners.flink;
-
-import static org.junit.Assert.assertNotNull;
-
-import com.google.common.base.Joiner;
-import com.google.common.collect.ImmutableList;
-import java.io.File;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.net.URI;
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.coders.StringUtf8Coder;
-import org.apache.beam.sdk.io.Sink;
-import org.apache.beam.sdk.io.Write;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.transforms.Create;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.PaneInfo;
-import org.apache.flink.core.fs.FileSystem;
-import org.apache.flink.core.fs.Path;
-import org.apache.flink.test.util.JavaProgramTestBase;
-
-/**
- * Tests the translation of custom Write sinks.
- */
-public class WriteSinkITCase extends JavaProgramTestBase {
-
- protected String resultPath;
-
- public WriteSinkITCase(){
- }
-
- static final String[] EXPECTED_RESULT = new String[] {
- "Joe red 3", "Mary blue 4", "Max yellow 23"};
-
- @Override
- protected void preSubmit() throws Exception {
- resultPath = getTempDirPath("result-" + System.nanoTime());
- }
-
- @Override
- protected void postSubmit() throws Exception {
- compareResultsByLinesInMemory(Joiner.on('\n').join(EXPECTED_RESULT), resultPath);
- }
-
- @Override
- protected void testProgram() throws Exception {
- runProgram(resultPath);
- }
-
- @Override
- public void stopCluster() throws Exception {
- try {
- super.stopCluster();
- } catch (final IOException ioe) {
- if (ioe.getMessage().startsWith("Unable to delete file")) {
- // that's ok for the test itself, just the OS playing with us on cleanup phase
- }
- }
- }
-
- private static void runProgram(String resultPath) {
- Pipeline p = FlinkTestPipeline.createForBatch();
-
- p.apply(Create.of(ImmutableList.copyOf(EXPECTED_RESULT))).setCoder(StringUtf8Coder.of())
- .apply("CustomSink", Write.to(new MyCustomSink(resultPath)));
-
- p.run();
- }
-
- /**
- * Simple custom sink which writes to a file.
- */
- private static class MyCustomSink extends Sink<String> {
-
- private final String resultPath;
-
- public MyCustomSink(String resultPath) {
- this.resultPath = resultPath;
- }
-
- @Override
- public void validate(PipelineOptions options) {
- assertNotNull(options);
- }
-
- @Override
- public WriteOperation<String, ?> createWriteOperation(PipelineOptions options) {
- return new MyWriteOperation();
- }
-
- private class MyWriteOperation extends WriteOperation<String, String> {
-
- @Override
- public Coder<String> getWriterResultCoder() {
- return StringUtf8Coder.of();
- }
-
- @Override
- public void initialize(PipelineOptions options) throws Exception {
-
- }
-
- @Override
- public void setWindowedWrites(boolean windowedWrites) {
-
- }
-
- @Override
- public void finalize(Iterable<String> writerResults, PipelineOptions options)
- throws Exception {
-
- }
-
- @Override
- public Writer<String, String> createWriter(PipelineOptions options) throws Exception {
- return new MyWriter();
- }
-
- @Override
- public Sink<String> getSink() {
- return MyCustomSink.this;
- }
-
- /**
- * Simple Writer which writes to a file.
- */
- private class MyWriter extends Writer<String, String> {
-
- private PrintWriter internalWriter;
-
- @Override
- public final void openWindowed(String uId,
- BoundedWindow window,
- PaneInfo paneInfo,
- int shard,
- int numShards) throws Exception {
- throw new UnsupportedOperationException("Windowed writes not supported.");
- }
-
- @Override
- public final void openUnwindowed(String uId, int shard, int numShards) throws Exception {
- Path path = new Path(resultPath + "/" + uId);
- FileSystem.get(new URI("file:///")).create(path, false);
- internalWriter = new PrintWriter(new File(path.toUri()));
- }
-
- @Override
- public void cleanup() throws Exception {
-
- }
-
- @Override
- public void write(String value) throws Exception {
- internalWriter.println(value);
- }
-
- @Override
- public String close() throws Exception {
- internalWriter.close();
- return resultPath;
- }
-
- @Override
- public WriteOperation<String, String> getWriteOperation() {
- return MyWriteOperation.this;
- }
- }
- }
- }
-
-}
-
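For reference, a linear sketch (not part of this commit) of the lifecycle that the Write transform drives against a Sink such as MyCustomSink above. It assumes MyCustomSink is visible, uses the method signatures shown in the deleted test, and simplifies bundle ids and sharding to a single shard; the output path is hypothetical.

import java.util.Collections;
import org.apache.beam.sdk.io.Sink;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

// Illustrative only: drives one writer for one bundle, the way the Write transform would.
public class SinkLifecycleSketch {
  public static void main(String[] args) throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    Sink<String> sink = new MyCustomSink("/tmp/write-sink-sketch");  // hypothetical path
    Sink.WriteOperation<String, String> op =
        (Sink.WriteOperation<String, String>) sink.createWriteOperation(options);
    op.initialize(options);
    Sink.Writer<String, String> writer = op.createWriter(options);
    writer.openUnwindowed("bundle-0", 0, 1);   // one bundle, single shard
    writer.write("Joe red 3");
    String result = writer.close();            // returns the per-writer result
    op.finalize(Collections.singletonList(result), options);
  }
}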
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/DoFnOperatorTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/DoFnOperatorTest.java b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/DoFnOperatorTest.java
deleted file mode 100644
index 4c826d1..0000000
--- a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/DoFnOperatorTest.java
+++ /dev/null
@@ -1,600 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.streaming;
-
-import static org.hamcrest.Matchers.emptyIterable;
-import static org.hamcrest.collection.IsIterableContainingInOrder.contains;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertThat;
-
-import com.google.common.base.Function;
-import com.google.common.base.Predicate;
-import com.google.common.collect.FluentIterable;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableMap;
-import java.util.Collections;
-import java.util.HashMap;
-import javax.annotation.Nullable;
-import org.apache.beam.runners.core.StatefulDoFnRunner;
-import org.apache.beam.runners.flink.FlinkPipelineOptions;
-import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.DoFnOperator;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.coders.KvCoder;
-import org.apache.beam.sdk.coders.StringUtf8Coder;
-import org.apache.beam.sdk.coders.VarIntCoder;
-import org.apache.beam.sdk.options.PipelineOptionsFactory;
-import org.apache.beam.sdk.testing.PCollectionViewTesting;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.join.RawUnionValue;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.FixedWindows;
-import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
-import org.apache.beam.sdk.transforms.windowing.PaneInfo;
-import org.apache.beam.sdk.util.TimeDomain;
-import org.apache.beam.sdk.util.Timer;
-import org.apache.beam.sdk.util.TimerSpec;
-import org.apache.beam.sdk.util.TimerSpecs;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.util.WindowingStrategy;
-import org.apache.beam.sdk.util.state.StateSpec;
-import org.apache.beam.sdk.util.state.StateSpecs;
-import org.apache.beam.sdk.util.state.ValueState;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.beam.sdk.values.TupleTag;
-import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
-import org.apache.flink.api.java.functions.KeySelector;
-import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
-import org.apache.flink.streaming.util.KeyedOneInputStreamOperatorTestHarness;
-import org.apache.flink.streaming.util.KeyedTwoInputStreamOperatorTestHarness;
-import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness;
-import org.apache.flink.streaming.util.TwoInputStreamOperatorTestHarness;
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-/**
- * Tests for {@link DoFnOperator}.
- */
-@RunWith(JUnit4.class)
-public class DoFnOperatorTest {
-
- // views and windows for testing side inputs
- private static final long WINDOW_MSECS_1 = 100;
- private static final long WINDOW_MSECS_2 = 500;
-
- private WindowingStrategy<Object, IntervalWindow> windowingStrategy1 =
- WindowingStrategy.of(FixedWindows.of(new Duration(WINDOW_MSECS_1)));
-
- private PCollectionView<Iterable<String>> view1 =
- PCollectionViewTesting.testingView(
- new TupleTag<Iterable<WindowedValue<String>>>() {},
- new PCollectionViewTesting.IdentityViewFn<String>(),
- StringUtf8Coder.of(),
- windowingStrategy1);
-
- private WindowingStrategy<Object, IntervalWindow> windowingStrategy2 =
- WindowingStrategy.of(FixedWindows.of(new Duration(WINDOW_MSECS_2)));
-
- private PCollectionView<Iterable<String>> view2 =
- PCollectionViewTesting.testingView(
- new TupleTag<Iterable<WindowedValue<String>>>() {},
- new PCollectionViewTesting.IdentityViewFn<String>(),
- StringUtf8Coder.of(),
- windowingStrategy2);
-
- @Test
- @SuppressWarnings("unchecked")
- public void testSingleOutput() throws Exception {
-
- WindowedValue.ValueOnlyWindowedValueCoder<String> windowedValueCoder =
- WindowedValue.getValueOnlyCoder(StringUtf8Coder.of());
-
- TupleTag<String> outputTag = new TupleTag<>("main-output");
-
- DoFnOperator<String, String, String> doFnOperator = new DoFnOperator<>(
- new IdentityDoFn<String>(),
- windowedValueCoder,
- outputTag,
- Collections.<TupleTag<?>>emptyList(),
- new DoFnOperator.DefaultOutputManagerFactory(),
- WindowingStrategy.globalDefault(),
- new HashMap<Integer, PCollectionView<?>>(), /* side-input mapping */
- Collections.<PCollectionView<?>>emptyList(), /* side inputs */
- PipelineOptionsFactory.as(FlinkPipelineOptions.class),
- null);
-
- OneInputStreamOperatorTestHarness<WindowedValue<String>, String> testHarness =
- new OneInputStreamOperatorTestHarness<>(doFnOperator);
-
- testHarness.open();
-
- testHarness.processElement(new StreamRecord<>(WindowedValue.valueInGlobalWindow("Hello")));
-
- assertThat(
- this.<String>stripStreamRecordFromWindowedValue(testHarness.getOutput()),
- contains(WindowedValue.valueInGlobalWindow("Hello")));
-
- testHarness.close();
- }
-
- @Test
- @SuppressWarnings("unchecked")
- public void testMultiOutputOutput() throws Exception {
-
- WindowedValue.ValueOnlyWindowedValueCoder<String> windowedValueCoder =
- WindowedValue.getValueOnlyCoder(StringUtf8Coder.of());
-
- TupleTag<String> mainOutput = new TupleTag<>("main-output");
- TupleTag<String> additionalOutput1 = new TupleTag<>("output-1");
- TupleTag<String> additionalOutput2 = new TupleTag<>("output-2");
- ImmutableMap<TupleTag<?>, Integer> outputMapping = ImmutableMap.<TupleTag<?>, Integer>builder()
- .put(mainOutput, 1)
- .put(additionalOutput1, 2)
- .put(additionalOutput2, 3)
- .build();
-
- DoFnOperator<String, String, RawUnionValue> doFnOperator = new DoFnOperator<>(
- new MultiOutputDoFn(additionalOutput1, additionalOutput2),
- windowedValueCoder,
- mainOutput,
- ImmutableList.<TupleTag<?>>of(additionalOutput1, additionalOutput2),
- new DoFnOperator.MultiOutputOutputManagerFactory(outputMapping),
- WindowingStrategy.globalDefault(),
- new HashMap<Integer, PCollectionView<?>>(), /* side-input mapping */
- Collections.<PCollectionView<?>>emptyList(), /* side inputs */
- PipelineOptionsFactory.as(FlinkPipelineOptions.class),
- null);
-
- OneInputStreamOperatorTestHarness<WindowedValue<String>, RawUnionValue> testHarness =
- new OneInputStreamOperatorTestHarness<>(doFnOperator);
-
- testHarness.open();
-
- testHarness.processElement(new StreamRecord<>(WindowedValue.valueInGlobalWindow("one")));
- testHarness.processElement(new StreamRecord<>(WindowedValue.valueInGlobalWindow("two")));
- testHarness.processElement(new StreamRecord<>(WindowedValue.valueInGlobalWindow("hello")));
-
- assertThat(
- this.stripStreamRecordFromRawUnion(testHarness.getOutput()),
- contains(
- new RawUnionValue(2, WindowedValue.valueInGlobalWindow("extra: one")),
- new RawUnionValue(3, WindowedValue.valueInGlobalWindow("extra: two")),
- new RawUnionValue(1, WindowedValue.valueInGlobalWindow("got: hello")),
- new RawUnionValue(2, WindowedValue.valueInGlobalWindow("got: hello")),
- new RawUnionValue(3, WindowedValue.valueInGlobalWindow("got: hello"))));
-
- testHarness.close();
- }
-
- @Test
- public void testLateDroppingForStatefulFn() throws Exception {
-
- WindowingStrategy<Object, IntervalWindow> windowingStrategy =
- WindowingStrategy.of(FixedWindows.of(new Duration(10)));
-
- DoFn<Integer, String> fn = new DoFn<Integer, String>() {
-
- @StateId("state")
- private final StateSpec<Object, ValueState<String>> stateSpec =
- StateSpecs.value(StringUtf8Coder.of());
-
- @ProcessElement
- public void processElement(ProcessContext context) {
- context.output(context.element().toString());
- }
- };
-
- WindowedValue.FullWindowedValueCoder<Integer> windowedValueCoder =
- WindowedValue.getFullCoder(
- VarIntCoder.of(),
- windowingStrategy.getWindowFn().windowCoder());
-
- TupleTag<String> outputTag = new TupleTag<>("main-output");
-
- DoFnOperator<Integer, String, WindowedValue<String>> doFnOperator = new DoFnOperator<>(
- fn,
- windowedValueCoder,
- outputTag,
- Collections.<TupleTag<?>>emptyList(),
- new DoFnOperator.DefaultOutputManagerFactory<WindowedValue<String>>(),
- windowingStrategy,
- new HashMap<Integer, PCollectionView<?>>(), /* side-input mapping */
- Collections.<PCollectionView<?>>emptyList(), /* side inputs */
- PipelineOptionsFactory.as(FlinkPipelineOptions.class),
- VarIntCoder.of() /* key coder */);
-
- OneInputStreamOperatorTestHarness<WindowedValue<Integer>, WindowedValue<String>> testHarness =
- new KeyedOneInputStreamOperatorTestHarness<>(
- doFnOperator,
- new KeySelector<WindowedValue<Integer>, Integer>() {
- @Override
- public Integer getKey(WindowedValue<Integer> integerWindowedValue) throws Exception {
- return integerWindowedValue.getValue();
- }
- },
- new CoderTypeInformation<>(VarIntCoder.of()));
-
- testHarness.open();
-
- testHarness.processWatermark(0);
-
- IntervalWindow window1 = new IntervalWindow(new Instant(0), Duration.millis(10));
-
- // this should not be late
- testHarness.processElement(
- new StreamRecord<>(WindowedValue.of(13, new Instant(0), window1, PaneInfo.NO_FIRING)));
-
- assertThat(
- this.<String>stripStreamRecordFromWindowedValue(testHarness.getOutput()),
- contains(WindowedValue.of("13", new Instant(0), window1, PaneInfo.NO_FIRING)));
-
- testHarness.getOutput().clear();
-
- testHarness.processWatermark(9);
-
- // this should still not be considered late
- testHarness.processElement(
- new StreamRecord<>(WindowedValue.of(17, new Instant(0), window1, PaneInfo.NO_FIRING)));
-
- assertThat(
- this.<String>stripStreamRecordFromWindowedValue(testHarness.getOutput()),
- contains(WindowedValue.of("17", new Instant(0), window1, PaneInfo.NO_FIRING)));
-
- testHarness.getOutput().clear();
-
- testHarness.processWatermark(10);
-
- // this should now be considered late
- testHarness.processElement(
- new StreamRecord<>(WindowedValue.of(17, new Instant(0), window1, PaneInfo.NO_FIRING)));
-
- assertThat(
- this.<String>stripStreamRecordFromWindowedValue(testHarness.getOutput()),
- emptyIterable());
-
- testHarness.close();
- }
-
- @Test
- public void testStateGCForStatefulFn() throws Exception {
-
- WindowingStrategy<Object, IntervalWindow> windowingStrategy =
- WindowingStrategy.of(FixedWindows.of(new Duration(10))).withAllowedLateness(Duration.ZERO);
-
- final String timerId = "boo";
- final String stateId = "dazzle";
-
- final int offset = 5000;
- final int timerOutput = 4093;
-
- DoFn<KV<String, Integer>, KV<String, Integer>> fn =
- new DoFn<KV<String, Integer>, KV<String, Integer>>() {
-
- @TimerId(timerId)
- private final TimerSpec spec = TimerSpecs.timer(TimeDomain.EVENT_TIME);
-
- @StateId(stateId)
- private final StateSpec<Object, ValueState<String>> stateSpec =
- StateSpecs.value(StringUtf8Coder.of());
-
- @ProcessElement
- public void processElement(
- ProcessContext context,
- @TimerId(timerId) Timer timer,
- @StateId(stateId) ValueState<String> state,
- BoundedWindow window) {
- timer.set(window.maxTimestamp());
- state.write(context.element().getKey());
- context.output(
- KV.of(context.element().getKey(), context.element().getValue() + offset));
- }
-
- @OnTimer(timerId)
- public void onTimer(OnTimerContext context, @StateId(stateId) ValueState<String> state) {
- context.output(KV.of(state.read(), timerOutput));
- }
- };
-
- WindowedValue.FullWindowedValueCoder<KV<String, Integer>> windowedValueCoder =
- WindowedValue.getFullCoder(
- KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()),
- windowingStrategy.getWindowFn().windowCoder());
-
- TupleTag<KV<String, Integer>> outputTag = new TupleTag<>("main-output");
-
- DoFnOperator<
- KV<String, Integer>, KV<String, Integer>, WindowedValue<KV<String, Integer>>> doFnOperator =
- new DoFnOperator<>(
- fn,
- windowedValueCoder,
- outputTag,
- Collections.<TupleTag<?>>emptyList(),
- new DoFnOperator.DefaultOutputManagerFactory<WindowedValue<KV<String, Integer>>>(),
- windowingStrategy,
- new HashMap<Integer, PCollectionView<?>>(), /* side-input mapping */
- Collections.<PCollectionView<?>>emptyList(), /* side inputs */
- PipelineOptionsFactory.as(FlinkPipelineOptions.class),
- StringUtf8Coder.of() /* key coder */);
-
- KeyedOneInputStreamOperatorTestHarness<
- String,
- WindowedValue<KV<String, Integer>>,
- WindowedValue<KV<String, Integer>>> testHarness =
- new KeyedOneInputStreamOperatorTestHarness<>(
- doFnOperator,
- new KeySelector<WindowedValue<KV<String, Integer>>, String>() {
- @Override
- public String getKey(
- WindowedValue<KV<String, Integer>> kvWindowedValue) throws Exception {
- return kvWindowedValue.getValue().getKey();
- }
- },
- new CoderTypeInformation<>(StringUtf8Coder.of()));
-
- testHarness.open();
-
- testHarness.processWatermark(0);
-
- assertEquals(0, testHarness.numKeyedStateEntries());
-
- IntervalWindow window1 = new IntervalWindow(new Instant(0), Duration.millis(10));
-
- testHarness.processElement(
- new StreamRecord<>(
- WindowedValue.of(KV.of("key1", 5), new Instant(1), window1, PaneInfo.NO_FIRING)));
-
- testHarness.processElement(
- new StreamRecord<>(
- WindowedValue.of(KV.of("key2", 7), new Instant(3), window1, PaneInfo.NO_FIRING)));
-
- assertThat(
- this.<KV<String, Integer>>stripStreamRecordFromWindowedValue(testHarness.getOutput()),
- contains(
- WindowedValue.of(
- KV.of("key1", 5 + offset), new Instant(1), window1, PaneInfo.NO_FIRING),
- WindowedValue.of(
- KV.of("key2", 7 + offset), new Instant(3), window1, PaneInfo.NO_FIRING)));
-
- assertEquals(2, testHarness.numKeyedStateEntries());
-
- testHarness.getOutput().clear();
-
- // this should trigger both the window.maxTimestamp() timer and the GC timer
- // this tests that the GC timer fires after the user timer
- testHarness.processWatermark(
- window1.maxTimestamp()
- .plus(windowingStrategy.getAllowedLateness())
- .plus(StatefulDoFnRunner.TimeInternalsCleanupTimer.GC_DELAY_MS)
- .getMillis());
-
- assertThat(
- this.<KV<String, Integer>>stripStreamRecordFromWindowedValue(testHarness.getOutput()),
- contains(
- WindowedValue.of(
- KV.of("key1", timerOutput), new Instant(9), window1, PaneInfo.NO_FIRING),
- WindowedValue.of(
- KV.of("key2", timerOutput), new Instant(9), window1, PaneInfo.NO_FIRING)));
-
- // ensure the state was garbage collected
- assertEquals(0, testHarness.numKeyedStateEntries());
-
- testHarness.close();
- }
-
- public void testSideInputs(boolean keyed) throws Exception {
-
- WindowedValue.ValueOnlyWindowedValueCoder<String> windowedValueCoder =
- WindowedValue.getValueOnlyCoder(StringUtf8Coder.of());
-
- TupleTag<String> outputTag = new TupleTag<>("main-output");
-
- ImmutableMap<Integer, PCollectionView<?>> sideInputMapping =
- ImmutableMap.<Integer, PCollectionView<?>>builder()
- .put(1, view1)
- .put(2, view2)
- .build();
-
- Coder<String> keyCoder = null;
- if (keyed) {
- keyCoder = StringUtf8Coder.of();
- }
-
- DoFnOperator<String, String, String> doFnOperator = new DoFnOperator<>(
- new IdentityDoFn<String>(),
- windowedValueCoder,
- outputTag,
- Collections.<TupleTag<?>>emptyList(),
- new DoFnOperator.DefaultOutputManagerFactory<String>(),
- WindowingStrategy.globalDefault(),
- sideInputMapping, /* side-input mapping */
- ImmutableList.<PCollectionView<?>>of(view1, view2), /* side inputs */
- PipelineOptionsFactory.as(FlinkPipelineOptions.class),
- keyCoder);
-
- TwoInputStreamOperatorTestHarness<WindowedValue<String>, RawUnionValue, String> testHarness =
- new TwoInputStreamOperatorTestHarness<>(doFnOperator);
-
- if (keyed) {
- // we use a dummy key for the second input since it is considered to be broadcast
- testHarness = new KeyedTwoInputStreamOperatorTestHarness<>(
- doFnOperator,
- new StringKeySelector(),
- new DummyKeySelector(),
- BasicTypeInfo.STRING_TYPE_INFO);
- }
-
- testHarness.open();
-
- IntervalWindow firstWindow = new IntervalWindow(new Instant(0), new Instant(100));
- IntervalWindow secondWindow = new IntervalWindow(new Instant(0), new Instant(500));
-
- // test that side-input events are kept
- testHarness.processElement2(
- new StreamRecord<>(
- new RawUnionValue(
- 1,
- valuesInWindow(ImmutableList.of("hello", "ciao"), new Instant(0), firstWindow))));
- testHarness.processElement2(
- new StreamRecord<>(
- new RawUnionValue(
- 2,
- valuesInWindow(ImmutableList.of("foo", "bar"), new Instant(0), secondWindow))));
-
- // push in some regular elements
- WindowedValue<String> helloElement = valueInWindow("Hello", new Instant(0), firstWindow);
- WindowedValue<String> worldElement = valueInWindow("World", new Instant(1000), firstWindow);
- testHarness.processElement1(new StreamRecord<>(helloElement));
- testHarness.processElement1(new StreamRecord<>(worldElement));
-
- // test that pushed-back events are kept
- testHarness.processElement2(
- new StreamRecord<>(
- new RawUnionValue(
- 1,
- valuesInWindow(ImmutableList.of("hello", "ciao"),
- new Instant(1000), firstWindow))));
- testHarness.processElement2(
- new StreamRecord<>(
- new RawUnionValue(
- 2,
- valuesInWindow(ImmutableList.of("foo", "bar"), new Instant(1000), secondWindow))));
-
- assertThat(
- this.<String>stripStreamRecordFromWindowedValue(testHarness.getOutput()),
- contains(helloElement, worldElement));
-
- testHarness.close();
-
- }
-
- /**
- * {@link TwoInputStreamOperatorTestHarness} supports OperatorStateBackend,
- * but not KeyedStateBackend. So we only test side inputs for a normal ParDo.
- */
- @Test
- @SuppressWarnings("unchecked")
- public void testNormalParDoSideInputs() throws Exception {
- testSideInputs(false);
- }
-
- @Test
- public void testKeyedSideInputs() throws Exception {
- testSideInputs(true);
- }
-
- private <T> Iterable<WindowedValue<T>> stripStreamRecordFromWindowedValue(
- Iterable<Object> input) {
-
- return FluentIterable.from(input).filter(new Predicate<Object>() {
- @Override
- public boolean apply(@Nullable Object o) {
- return o instanceof StreamRecord && ((StreamRecord) o).getValue() instanceof WindowedValue;
- }
- }).transform(new Function<Object, WindowedValue<T>>() {
- @Nullable
- @Override
- @SuppressWarnings({"unchecked", "rawtypes"})
- public WindowedValue<T> apply(@Nullable Object o) {
- if (o instanceof StreamRecord && ((StreamRecord) o).getValue() instanceof WindowedValue) {
- return (WindowedValue) ((StreamRecord) o).getValue();
- }
- throw new RuntimeException("unreachable");
- }
- });
- }
-
- private Iterable<RawUnionValue> stripStreamRecordFromRawUnion(Iterable<Object> input) {
- return FluentIterable.from(input).filter(new Predicate<Object>() {
- @Override
- public boolean apply(@Nullable Object o) {
- return o instanceof StreamRecord && ((StreamRecord) o).getValue() instanceof RawUnionValue;
- }
- }).transform(new Function<Object, RawUnionValue>() {
- @Nullable
- @Override
- @SuppressWarnings({"unchecked", "rawtypes"})
- public RawUnionValue apply(@Nullable Object o) {
- if (o instanceof StreamRecord && ((StreamRecord) o).getValue() instanceof RawUnionValue) {
- return (RawUnionValue) ((StreamRecord) o).getValue();
- }
- throw new RuntimeException("unreachable");
- }
- });
- }
-
- private static class MultiOutputDoFn extends DoFn<String, String> {
- private TupleTag<String> additionalOutput1;
- private TupleTag<String> additionalOutput2;
-
- public MultiOutputDoFn(TupleTag<String> additionalOutput1, TupleTag<String> additionalOutput2) {
- this.additionalOutput1 = additionalOutput1;
- this.additionalOutput2 = additionalOutput2;
- }
-
- @ProcessElement
- public void processElement(ProcessContext c) throws Exception {
- if (c.element().equals("one")) {
- c.output(additionalOutput1, "extra: one");
- } else if (c.element().equals("two")) {
- c.output(additionalOutput2, "extra: two");
- } else {
- c.output("got: " + c.element());
- c.output(additionalOutput1, "got: " + c.element());
- c.output(additionalOutput2, "got: " + c.element());
- }
- }
- }
-
- private static class IdentityDoFn<T> extends DoFn<T, T> {
- @ProcessElement
- public void processElement(ProcessContext c) throws Exception {
- c.output(c.element());
- }
- }
-
- @SuppressWarnings({"unchecked", "rawtypes"})
- private WindowedValue<Iterable<?>> valuesInWindow(
- Iterable<?> values, Instant timestamp, BoundedWindow window) {
- return (WindowedValue) WindowedValue.of(values, timestamp, window, PaneInfo.NO_FIRING);
- }
-
- @SuppressWarnings({"unchecked", "rawtypes"})
- private <T> WindowedValue<T> valueInWindow(
- T value, Instant timestamp, BoundedWindow window) {
- return WindowedValue.of(value, timestamp, window, PaneInfo.NO_FIRING);
- }
-
-
- private static class DummyKeySelector implements KeySelector<RawUnionValue, String> {
- @Override
- public String getKey(RawUnionValue stringWindowedValue) throws Exception {
- return "dummy_key";
- }
- }
-
- private static class StringKeySelector implements KeySelector<WindowedValue<String>, String> {
- @Override
- public String getKey(WindowedValue<String> stringWindowedValue) throws Exception {
- return stringWindowedValue.getValue();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/FlinkBroadcastStateInternalsTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/FlinkBroadcastStateInternalsTest.java b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/FlinkBroadcastStateInternalsTest.java
deleted file mode 100644
index 7e7d1e1..0000000
--- a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/FlinkBroadcastStateInternalsTest.java
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.streaming;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotEquals;
-import static org.junit.Assert.assertThat;
-
-import java.util.Arrays;
-import org.apache.beam.runners.core.StateMerging;
-import org.apache.beam.runners.core.StateNamespace;
-import org.apache.beam.runners.core.StateNamespaceForTest;
-import org.apache.beam.runners.core.StateTag;
-import org.apache.beam.runners.core.StateTags;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkBroadcastStateInternals;
-import org.apache.beam.sdk.coders.StringUtf8Coder;
-import org.apache.beam.sdk.coders.VarIntCoder;
-import org.apache.beam.sdk.transforms.Sum;
-import org.apache.beam.sdk.util.state.BagState;
-import org.apache.beam.sdk.util.state.CombiningState;
-import org.apache.beam.sdk.util.state.GroupingState;
-import org.apache.beam.sdk.util.state.ReadableState;
-import org.apache.beam.sdk.util.state.ValueState;
-import org.apache.flink.runtime.operators.testutils.DummyEnvironment;
-import org.apache.flink.runtime.state.OperatorStateBackend;
-import org.apache.flink.runtime.state.memory.MemoryStateBackend;
-import org.hamcrest.Matchers;
-import org.junit.Before;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-/**
- * Tests for {@link FlinkBroadcastStateInternals}. This is based on the tests for
- * {@code InMemoryStateInternals}.
- */
-@RunWith(JUnit4.class)
-public class FlinkBroadcastStateInternalsTest {
- private static final StateNamespace NAMESPACE_1 = new StateNamespaceForTest("ns1");
- private static final StateNamespace NAMESPACE_2 = new StateNamespaceForTest("ns2");
- private static final StateNamespace NAMESPACE_3 = new StateNamespaceForTest("ns3");
-
- private static final StateTag<Object, ValueState<String>> STRING_VALUE_ADDR =
- StateTags.value("stringValue", StringUtf8Coder.of());
- private static final StateTag<Object, CombiningState<Integer, int[], Integer>>
- SUM_INTEGER_ADDR = StateTags.combiningValueFromInputInternal(
- "sumInteger", VarIntCoder.of(), Sum.ofIntegers());
- private static final StateTag<Object, BagState<String>> STRING_BAG_ADDR =
- StateTags.bag("stringBag", StringUtf8Coder.of());
-
- FlinkBroadcastStateInternals<String> underTest;
-
- @Before
- public void initStateInternals() {
- MemoryStateBackend backend = new MemoryStateBackend();
- try {
- OperatorStateBackend operatorStateBackend =
- backend.createOperatorStateBackend(new DummyEnvironment("test", 1, 0), "");
- underTest = new FlinkBroadcastStateInternals<>(1, operatorStateBackend);
-
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- }
-
- @Test
- public void testValue() throws Exception {
- ValueState<String> value = underTest.state(NAMESPACE_1, STRING_VALUE_ADDR);
-
- assertEquals(underTest.state(NAMESPACE_1, STRING_VALUE_ADDR), value);
- assertNotEquals(
- underTest.state(NAMESPACE_2, STRING_VALUE_ADDR),
- value);
-
- assertThat(value.read(), Matchers.nullValue());
- value.write("hello");
- assertThat(value.read(), Matchers.equalTo("hello"));
- value.write("world");
- assertThat(value.read(), Matchers.equalTo("world"));
-
- value.clear();
- assertThat(value.read(), Matchers.nullValue());
- assertEquals(underTest.state(NAMESPACE_1, STRING_VALUE_ADDR), value);
-
- }
-
- @Test
- public void testBag() throws Exception {
- BagState<String> value = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
-
- assertEquals(value, underTest.state(NAMESPACE_1, STRING_BAG_ADDR));
- assertFalse(value.equals(underTest.state(NAMESPACE_2, STRING_BAG_ADDR)));
-
- assertThat(value.read(), Matchers.emptyIterable());
- value.add("hello");
- assertThat(value.read(), Matchers.containsInAnyOrder("hello"));
-
- value.add("world");
- assertThat(value.read(), Matchers.containsInAnyOrder("hello", "world"));
-
- value.clear();
- assertThat(value.read(), Matchers.emptyIterable());
- assertEquals(underTest.state(NAMESPACE_1, STRING_BAG_ADDR), value);
-
- }
-
- @Test
- public void testBagIsEmpty() throws Exception {
- BagState<String> value = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
-
- assertThat(value.isEmpty().read(), Matchers.is(true));
- ReadableState<Boolean> readFuture = value.isEmpty();
- value.add("hello");
- assertThat(readFuture.read(), Matchers.is(false));
-
- value.clear();
- assertThat(readFuture.read(), Matchers.is(true));
- }
-
- @Test
- public void testMergeBagIntoSource() throws Exception {
- BagState<String> bag1 = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
- BagState<String> bag2 = underTest.state(NAMESPACE_2, STRING_BAG_ADDR);
-
- bag1.add("Hello");
- bag2.add("World");
- bag1.add("!");
-
- StateMerging.mergeBags(Arrays.asList(bag1, bag2), bag1);
-
- // Reading the merged bag gets both the contents
- assertThat(bag1.read(), Matchers.containsInAnyOrder("Hello", "World", "!"));
- assertThat(bag2.read(), Matchers.emptyIterable());
- }
-
- @Test
- public void testMergeBagIntoNewNamespace() throws Exception {
- BagState<String> bag1 = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
- BagState<String> bag2 = underTest.state(NAMESPACE_2, STRING_BAG_ADDR);
- BagState<String> bag3 = underTest.state(NAMESPACE_3, STRING_BAG_ADDR);
-
- bag1.add("Hello");
- bag2.add("World");
- bag1.add("!");
-
- StateMerging.mergeBags(Arrays.asList(bag1, bag2, bag3), bag3);
-
- // Reading the merged bag gets both the contents
- assertThat(bag3.read(), Matchers.containsInAnyOrder("Hello", "World", "!"));
- assertThat(bag1.read(), Matchers.emptyIterable());
- assertThat(bag2.read(), Matchers.emptyIterable());
- }
-
- @Test
- public void testCombiningValue() throws Exception {
- GroupingState<Integer, Integer> value = underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR);
-
- // State instances are cached, but depend on the namespace.
- assertEquals(value, underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR));
- assertFalse(value.equals(underTest.state(NAMESPACE_2, SUM_INTEGER_ADDR)));
-
- assertThat(value.read(), Matchers.equalTo(0));
- value.add(2);
- assertThat(value.read(), Matchers.equalTo(2));
-
- value.add(3);
- assertThat(value.read(), Matchers.equalTo(5));
-
- value.clear();
- assertThat(value.read(), Matchers.equalTo(0));
- assertEquals(underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR), value);
- }
-
- @Test
- public void testCombiningIsEmpty() throws Exception {
- GroupingState<Integer, Integer> value = underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR);
-
- assertThat(value.isEmpty().read(), Matchers.is(true));
- ReadableState<Boolean> readFuture = value.isEmpty();
- value.add(5);
- assertThat(readFuture.read(), Matchers.is(false));
-
- value.clear();
- assertThat(readFuture.read(), Matchers.is(true));
- }
-
- @Test
- public void testMergeCombiningValueIntoSource() throws Exception {
- CombiningState<Integer, int[], Integer> value1 =
- underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR);
- CombiningState<Integer, int[], Integer> value2 =
- underTest.state(NAMESPACE_2, SUM_INTEGER_ADDR);
-
- value1.add(5);
- value2.add(10);
- value1.add(6);
-
- assertThat(value1.read(), Matchers.equalTo(11));
- assertThat(value2.read(), Matchers.equalTo(10));
-
- // Merging clears the old values and updates the result value.
- StateMerging.mergeCombiningValues(Arrays.asList(value1, value2), value1);
-
- assertThat(value1.read(), Matchers.equalTo(21));
- assertThat(value2.read(), Matchers.equalTo(0));
- }
-
- @Test
- public void testMergeCombiningValueIntoNewNamespace() throws Exception {
- CombiningState<Integer, int[], Integer> value1 =
- underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR);
- CombiningState<Integer, int[], Integer> value2 =
- underTest.state(NAMESPACE_2, SUM_INTEGER_ADDR);
- CombiningState<Integer, int[], Integer> value3 =
- underTest.state(NAMESPACE_3, SUM_INTEGER_ADDR);
-
- value1.add(5);
- value2.add(10);
- value1.add(6);
-
- StateMerging.mergeCombiningValues(Arrays.asList(value1, value2), value3);
-
- // Merging clears the old values and updates the result value.
- assertThat(value1.read(), Matchers.equalTo(0));
- assertThat(value2.read(), Matchers.equalTo(0));
- assertThat(value3.read(), Matchers.equalTo(21));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/FlinkKeyGroupStateInternalsTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/FlinkKeyGroupStateInternalsTest.java b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/FlinkKeyGroupStateInternalsTest.java
deleted file mode 100644
index 5433d07..0000000
--- a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/FlinkKeyGroupStateInternalsTest.java
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.streaming;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertThat;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.nio.ByteBuffer;
-import java.util.Arrays;
-import org.apache.beam.runners.core.StateMerging;
-import org.apache.beam.runners.core.StateNamespace;
-import org.apache.beam.runners.core.StateNamespaceForTest;
-import org.apache.beam.runners.core.StateTag;
-import org.apache.beam.runners.core.StateTags;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkKeyGroupStateInternals;
-import org.apache.beam.sdk.coders.StringUtf8Coder;
-import org.apache.beam.sdk.util.CoderUtils;
-import org.apache.beam.sdk.util.state.BagState;
-import org.apache.beam.sdk.util.state.ReadableState;
-import org.apache.flink.api.common.ExecutionConfig;
-import org.apache.flink.api.common.JobID;
-import org.apache.flink.api.java.typeutils.GenericTypeInfo;
-import org.apache.flink.runtime.jobgraph.JobVertexID;
-import org.apache.flink.runtime.operators.testutils.DummyEnvironment;
-import org.apache.flink.runtime.query.KvStateRegistry;
-import org.apache.flink.runtime.state.AbstractKeyedStateBackend;
-import org.apache.flink.runtime.state.KeyGroupRange;
-import org.apache.flink.runtime.state.KeyedStateBackend;
-import org.apache.flink.runtime.state.memory.MemoryStateBackend;
-import org.apache.flink.streaming.api.operators.KeyContext;
-import org.hamcrest.Matchers;
-import org.junit.Before;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-/**
- * Tests for {@link FlinkKeyGroupStateInternals}. This is based on the tests for
- * {@code InMemoryStateInternals}.
- */
-@RunWith(JUnit4.class)
-public class FlinkKeyGroupStateInternalsTest {
- private static final StateNamespace NAMESPACE_1 = new StateNamespaceForTest("ns1");
- private static final StateNamespace NAMESPACE_2 = new StateNamespaceForTest("ns2");
- private static final StateNamespace NAMESPACE_3 = new StateNamespaceForTest("ns3");
-
- private static final StateTag<Object, BagState<String>> STRING_BAG_ADDR =
- StateTags.bag("stringBag", StringUtf8Coder.of());
-
- FlinkKeyGroupStateInternals<String> underTest;
- private KeyedStateBackend keyedStateBackend;
-
- @Before
- public void initStateInternals() {
- try {
- keyedStateBackend = getKeyedStateBackend(2, new KeyGroupRange(0, 1));
- underTest = new FlinkKeyGroupStateInternals<>(StringUtf8Coder.of(), keyedStateBackend);
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- }
-
- private KeyedStateBackend getKeyedStateBackend(int numberOfKeyGroups,
- KeyGroupRange keyGroupRange) {
- MemoryStateBackend backend = new MemoryStateBackend();
- try {
- AbstractKeyedStateBackend<ByteBuffer> keyedStateBackend = backend.createKeyedStateBackend(
- new DummyEnvironment("test", 1, 0),
- new JobID(),
- "test_op",
- new GenericTypeInfo<>(ByteBuffer.class).createSerializer(new ExecutionConfig()),
- numberOfKeyGroups,
- keyGroupRange,
- new KvStateRegistry().createTaskRegistry(new JobID(), new JobVertexID()));
- keyedStateBackend.setCurrentKey(ByteBuffer.wrap(
- CoderUtils.encodeToByteArray(StringUtf8Coder.of(), "1")));
- return keyedStateBackend;
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- }
-
- @Test
- public void testBag() throws Exception {
- BagState<String> value = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
-
- assertEquals(value, underTest.state(NAMESPACE_1, STRING_BAG_ADDR));
- assertFalse(value.equals(underTest.state(NAMESPACE_2, STRING_BAG_ADDR)));
-
- assertThat(value.read(), Matchers.emptyIterable());
- value.add("hello");
- assertThat(value.read(), Matchers.containsInAnyOrder("hello"));
-
- value.add("world");
- assertThat(value.read(), Matchers.containsInAnyOrder("hello", "world"));
-
- value.clear();
- assertThat(value.read(), Matchers.emptyIterable());
- assertEquals(underTest.state(NAMESPACE_1, STRING_BAG_ADDR), value);
-
- }
-
- @Test
- public void testBagIsEmpty() throws Exception {
- BagState<String> value = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
-
- assertThat(value.isEmpty().read(), Matchers.is(true));
- ReadableState<Boolean> readFuture = value.isEmpty();
- value.add("hello");
- assertThat(readFuture.read(), Matchers.is(false));
-
- value.clear();
- assertThat(readFuture.read(), Matchers.is(true));
- }
-
- @Test
- public void testMergeBagIntoSource() throws Exception {
- BagState<String> bag1 = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
- BagState<String> bag2 = underTest.state(NAMESPACE_2, STRING_BAG_ADDR);
-
- bag1.add("Hello");
- bag2.add("World");
- bag1.add("!");
-
- StateMerging.mergeBags(Arrays.asList(bag1, bag2), bag1);
-
- // Reading the merged bag gets both the contents
- assertThat(bag1.read(), Matchers.containsInAnyOrder("Hello", "World", "!"));
- assertThat(bag2.read(), Matchers.emptyIterable());
- }
-
- @Test
- public void testMergeBagIntoNewNamespace() throws Exception {
- BagState<String> bag1 = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
- BagState<String> bag2 = underTest.state(NAMESPACE_2, STRING_BAG_ADDR);
- BagState<String> bag3 = underTest.state(NAMESPACE_3, STRING_BAG_ADDR);
-
- bag1.add("Hello");
- bag2.add("World");
- bag1.add("!");
-
- StateMerging.mergeBags(Arrays.asList(bag1, bag2, bag3), bag3);
-
- // Reading the merged bag gets both the contents
- assertThat(bag3.read(), Matchers.containsInAnyOrder("Hello", "World", "!"));
- assertThat(bag1.read(), Matchers.emptyIterable());
- assertThat(bag2.read(), Matchers.emptyIterable());
- }
-
- @Test
- public void testKeyGroupAndCheckpoint() throws Exception {
- // assign to keyGroup 0
- ByteBuffer key0 = ByteBuffer.wrap(
- CoderUtils.encodeToByteArray(StringUtf8Coder.of(), "11111111"));
- // assign to keyGroup 1
- ByteBuffer key1 = ByteBuffer.wrap(
- CoderUtils.encodeToByteArray(StringUtf8Coder.of(), "22222222"));
- FlinkKeyGroupStateInternals<String> allState;
- {
- KeyedStateBackend keyedStateBackend = getKeyedStateBackend(2, new KeyGroupRange(0, 1));
- allState = new FlinkKeyGroupStateInternals<>(
- StringUtf8Coder.of(), keyedStateBackend);
- BagState<String> valueForNamespace0 = allState.state(NAMESPACE_1, STRING_BAG_ADDR);
- BagState<String> valueForNamespace1 = allState.state(NAMESPACE_2, STRING_BAG_ADDR);
- keyedStateBackend.setCurrentKey(key0);
- valueForNamespace0.add("0");
- valueForNamespace1.add("2");
- keyedStateBackend.setCurrentKey(key1);
- valueForNamespace0.add("1");
- valueForNamespace1.add("3");
- assertThat(valueForNamespace0.read(), Matchers.containsInAnyOrder("0", "1"));
- assertThat(valueForNamespace1.read(), Matchers.containsInAnyOrder("2", "3"));
- }
-
- ClassLoader classLoader = FlinkKeyGroupStateInternalsTest.class.getClassLoader();
-
- // 1. scale up
- ByteArrayOutputStream out0 = new ByteArrayOutputStream();
- allState.snapshotKeyGroupState(0, new DataOutputStream(out0));
- DataInputStream in0 = new DataInputStream(
- new ByteArrayInputStream(out0.toByteArray()));
- {
- KeyedStateBackend keyedStateBackend = getKeyedStateBackend(2, new KeyGroupRange(0, 0));
- FlinkKeyGroupStateInternals<String> state0 =
- new FlinkKeyGroupStateInternals<>(
- StringUtf8Coder.of(), keyedStateBackend);
- state0.restoreKeyGroupState(0, in0, classLoader);
- BagState<String> valueForNamespace0 = state0.state(NAMESPACE_1, STRING_BAG_ADDR);
- BagState<String> valueForNamespace1 = state0.state(NAMESPACE_2, STRING_BAG_ADDR);
- assertThat(valueForNamespace0.read(), Matchers.containsInAnyOrder("0"));
- assertThat(valueForNamespace1.read(), Matchers.containsInAnyOrder("2"));
- }
-
- ByteArrayOutputStream out1 = new ByteArrayOutputStream();
- allState.snapshotKeyGroupState(1, new DataOutputStream(out1));
- DataInputStream in1 = new DataInputStream(
- new ByteArrayInputStream(out1.toByteArray()));
- {
- KeyedStateBackend keyedStateBackend = getKeyedStateBackend(2, new KeyGroupRange(1, 1));
- FlinkKeyGroupStateInternals<String> state1 =
- new FlinkKeyGroupStateInternals<>(
- StringUtf8Coder.of(), keyedStateBackend);
- state1.restoreKeyGroupState(1, in1, classLoader);
- BagState<String> valueForNamespace0 = state1.state(NAMESPACE_1, STRING_BAG_ADDR);
- BagState<String> valueForNamespace1 = state1.state(NAMESPACE_2, STRING_BAG_ADDR);
- assertThat(valueForNamespace0.read(), Matchers.containsInAnyOrder("1"));
- assertThat(valueForNamespace1.read(), Matchers.containsInAnyOrder("3"));
- }
-
- // 2. scale down
- {
- KeyedStateBackend keyedStateBackend = getKeyedStateBackend(2, new KeyGroupRange(0, 1));
- FlinkKeyGroupStateInternals<String> newAllState = new FlinkKeyGroupStateInternals<>(
- StringUtf8Coder.of(), keyedStateBackend);
- in0.reset();
- in1.reset();
- newAllState.restoreKeyGroupState(0, in0, classLoader);
- newAllState.restoreKeyGroupState(1, in1, classLoader);
- BagState<String> valueForNamespace0 = newAllState.state(NAMESPACE_1, STRING_BAG_ADDR);
- BagState<String> valueForNamespace1 = newAllState.state(NAMESPACE_2, STRING_BAG_ADDR);
- assertThat(valueForNamespace0.read(), Matchers.containsInAnyOrder("0", "1"));
- assertThat(valueForNamespace1.read(), Matchers.containsInAnyOrder("2", "3"));
- }
-
- }
-
- private static class TestKeyContext implements KeyContext {
-
- private Object key;
-
- @Override
- public void setCurrentKey(Object key) {
- this.key = key;
- }
-
- @Override
- public Object getCurrentKey() {
- return key;
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/FlinkSplitStateInternalsTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/FlinkSplitStateInternalsTest.java b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/FlinkSplitStateInternalsTest.java
deleted file mode 100644
index 08ae0c4..0000000
--- a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/FlinkSplitStateInternalsTest.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.streaming;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertThat;
-
-import org.apache.beam.runners.core.StateNamespace;
-import org.apache.beam.runners.core.StateNamespaceForTest;
-import org.apache.beam.runners.core.StateTag;
-import org.apache.beam.runners.core.StateTags;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkSplitStateInternals;
-import org.apache.beam.sdk.coders.StringUtf8Coder;
-import org.apache.beam.sdk.util.state.BagState;
-import org.apache.beam.sdk.util.state.ReadableState;
-import org.apache.flink.runtime.operators.testutils.DummyEnvironment;
-import org.apache.flink.runtime.state.OperatorStateBackend;
-import org.apache.flink.runtime.state.memory.MemoryStateBackend;
-import org.hamcrest.Matchers;
-import org.junit.Before;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-/**
- * Tests for {@link FlinkSplitStateInternals}. This is based on the tests for
- * {@code InMemoryStateInternals}.
- */
-@RunWith(JUnit4.class)
-public class FlinkSplitStateInternalsTest {
- private static final StateNamespace NAMESPACE_1 = new StateNamespaceForTest("ns1");
- private static final StateNamespace NAMESPACE_2 = new StateNamespaceForTest("ns2");
-
- private static final StateTag<Object, BagState<String>> STRING_BAG_ADDR =
- StateTags.bag("stringBag", StringUtf8Coder.of());
-
- FlinkSplitStateInternals<String> underTest;
-
- @Before
- public void initStateInternals() {
- MemoryStateBackend backend = new MemoryStateBackend();
- try {
- OperatorStateBackend operatorStateBackend =
- backend.createOperatorStateBackend(new DummyEnvironment("test", 1, 0), "");
- underTest = new FlinkSplitStateInternals<>(operatorStateBackend);
-
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- }
-
- @Test
- public void testBag() throws Exception {
- BagState<String> value = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
-
- assertEquals(value, underTest.state(NAMESPACE_1, STRING_BAG_ADDR));
- assertFalse(value.equals(underTest.state(NAMESPACE_2, STRING_BAG_ADDR)));
-
- assertThat(value.read(), Matchers.emptyIterable());
- value.add("hello");
- assertThat(value.read(), Matchers.containsInAnyOrder("hello"));
-
- value.add("world");
- assertThat(value.read(), Matchers.containsInAnyOrder("hello", "world"));
-
- value.clear();
- assertThat(value.read(), Matchers.emptyIterable());
- assertEquals(underTest.state(NAMESPACE_1, STRING_BAG_ADDR), value);
-
- }
-
- @Test
- public void testBagIsEmpty() throws Exception {
- BagState<String> value = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
-
- assertThat(value.isEmpty().read(), Matchers.is(true));
- ReadableState<Boolean> readFuture = value.isEmpty();
- value.add("hello");
- assertThat(readFuture.read(), Matchers.is(false));
-
- value.clear();
- assertThat(readFuture.read(), Matchers.is(true));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/FlinkStateInternalsTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/FlinkStateInternalsTest.java b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/FlinkStateInternalsTest.java
deleted file mode 100644
index d140271..0000000
--- a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/FlinkStateInternalsTest.java
+++ /dev/null
@@ -1,395 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.streaming;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotEquals;
-import static org.junit.Assert.assertThat;
-
-import java.nio.ByteBuffer;
-import java.util.Arrays;
-import org.apache.beam.runners.core.StateMerging;
-import org.apache.beam.runners.core.StateNamespace;
-import org.apache.beam.runners.core.StateNamespaceForTest;
-import org.apache.beam.runners.core.StateTag;
-import org.apache.beam.runners.core.StateTags;
-import org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkStateInternals;
-import org.apache.beam.sdk.coders.StringUtf8Coder;
-import org.apache.beam.sdk.coders.VarIntCoder;
-import org.apache.beam.sdk.transforms.Sum;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
-import org.apache.beam.sdk.transforms.windowing.OutputTimeFns;
-import org.apache.beam.sdk.util.CoderUtils;
-import org.apache.beam.sdk.util.state.BagState;
-import org.apache.beam.sdk.util.state.CombiningState;
-import org.apache.beam.sdk.util.state.GroupingState;
-import org.apache.beam.sdk.util.state.ReadableState;
-import org.apache.beam.sdk.util.state.ValueState;
-import org.apache.beam.sdk.util.state.WatermarkHoldState;
-import org.apache.flink.api.common.ExecutionConfig;
-import org.apache.flink.api.common.JobID;
-import org.apache.flink.api.java.typeutils.GenericTypeInfo;
-import org.apache.flink.runtime.jobgraph.JobVertexID;
-import org.apache.flink.runtime.operators.testutils.DummyEnvironment;
-import org.apache.flink.runtime.query.KvStateRegistry;
-import org.apache.flink.runtime.state.AbstractKeyedStateBackend;
-import org.apache.flink.runtime.state.KeyGroupRange;
-import org.apache.flink.runtime.state.memory.MemoryStateBackend;
-import org.hamcrest.Matchers;
-import org.joda.time.Instant;
-import org.junit.Before;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-/**
- * Tests for {@link FlinkStateInternals}. This is based on the tests for
- * {@code InMemoryStateInternals}.
- */
-@RunWith(JUnit4.class)
-public class FlinkStateInternalsTest {
- private static final BoundedWindow WINDOW_1 = new IntervalWindow(new Instant(0), new Instant(10));
- private static final StateNamespace NAMESPACE_1 = new StateNamespaceForTest("ns1");
- private static final StateNamespace NAMESPACE_2 = new StateNamespaceForTest("ns2");
- private static final StateNamespace NAMESPACE_3 = new StateNamespaceForTest("ns3");
-
- private static final StateTag<Object, ValueState<String>> STRING_VALUE_ADDR =
- StateTags.value("stringValue", StringUtf8Coder.of());
- private static final StateTag<Object, CombiningState<Integer, int[], Integer>>
- SUM_INTEGER_ADDR = StateTags.combiningValueFromInputInternal(
- "sumInteger", VarIntCoder.of(), Sum.ofIntegers());
- private static final StateTag<Object, BagState<String>> STRING_BAG_ADDR =
- StateTags.bag("stringBag", StringUtf8Coder.of());
- private static final StateTag<Object, WatermarkHoldState<BoundedWindow>>
- WATERMARK_EARLIEST_ADDR =
- StateTags.watermarkStateInternal("watermark", OutputTimeFns.outputAtEarliestInputTimestamp());
- private static final StateTag<Object, WatermarkHoldState<BoundedWindow>>
- WATERMARK_LATEST_ADDR =
- StateTags.watermarkStateInternal("watermark", OutputTimeFns.outputAtLatestInputTimestamp());
- private static final StateTag<Object, WatermarkHoldState<BoundedWindow>> WATERMARK_EOW_ADDR =
- StateTags.watermarkStateInternal("watermark", OutputTimeFns.outputAtEndOfWindow());
-
- FlinkStateInternals<String> underTest;
-
- @Before
- public void initStateInternals() {
- MemoryStateBackend backend = new MemoryStateBackend();
- try {
- AbstractKeyedStateBackend<ByteBuffer> keyedStateBackend = backend.createKeyedStateBackend(
- new DummyEnvironment("test", 1, 0),
- new JobID(),
- "test_op",
- new GenericTypeInfo<>(ByteBuffer.class).createSerializer(new ExecutionConfig()),
- 1,
- new KeyGroupRange(0, 0),
- new KvStateRegistry().createTaskRegistry(new JobID(), new JobVertexID()));
- underTest = new FlinkStateInternals<>(keyedStateBackend, StringUtf8Coder.of());
-
- keyedStateBackend.setCurrentKey(
- ByteBuffer.wrap(CoderUtils.encodeToByteArray(StringUtf8Coder.of(), "Hello")));
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- }
-
- @Test
- public void testValue() throws Exception {
- ValueState<String> value = underTest.state(NAMESPACE_1, STRING_VALUE_ADDR);
-
- assertEquals(underTest.state(NAMESPACE_1, STRING_VALUE_ADDR), value);
- assertNotEquals(
- underTest.state(NAMESPACE_2, STRING_VALUE_ADDR),
- value);
-
- assertThat(value.read(), Matchers.nullValue());
- value.write("hello");
- assertThat(value.read(), Matchers.equalTo("hello"));
- value.write("world");
- assertThat(value.read(), Matchers.equalTo("world"));
-
- value.clear();
- assertThat(value.read(), Matchers.nullValue());
- assertEquals(underTest.state(NAMESPACE_1, STRING_VALUE_ADDR), value);
-
- }
-
- @Test
- public void testBag() throws Exception {
- BagState<String> value = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
-
- assertEquals(value, underTest.state(NAMESPACE_1, STRING_BAG_ADDR));
- assertFalse(value.equals(underTest.state(NAMESPACE_2, STRING_BAG_ADDR)));
-
- assertThat(value.read(), Matchers.emptyIterable());
- value.add("hello");
- assertThat(value.read(), Matchers.containsInAnyOrder("hello"));
-
- value.add("world");
- assertThat(value.read(), Matchers.containsInAnyOrder("hello", "world"));
-
- value.clear();
- assertThat(value.read(), Matchers.emptyIterable());
- assertEquals(underTest.state(NAMESPACE_1, STRING_BAG_ADDR), value);
-
- }
-
- @Test
- public void testBagIsEmpty() throws Exception {
- BagState<String> value = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
-
- assertThat(value.isEmpty().read(), Matchers.is(true));
- ReadableState<Boolean> readFuture = value.isEmpty();
- value.add("hello");
- assertThat(readFuture.read(), Matchers.is(false));
-
- value.clear();
- assertThat(readFuture.read(), Matchers.is(true));
- }
-
- @Test
- public void testMergeBagIntoSource() throws Exception {
- BagState<String> bag1 = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
- BagState<String> bag2 = underTest.state(NAMESPACE_2, STRING_BAG_ADDR);
-
- bag1.add("Hello");
- bag2.add("World");
- bag1.add("!");
-
- StateMerging.mergeBags(Arrays.asList(bag1, bag2), bag1);
-
- // Reading the merged bag gets both the contents
- assertThat(bag1.read(), Matchers.containsInAnyOrder("Hello", "World", "!"));
- assertThat(bag2.read(), Matchers.emptyIterable());
- }
-
- @Test
- public void testMergeBagIntoNewNamespace() throws Exception {
- BagState<String> bag1 = underTest.state(NAMESPACE_1, STRING_BAG_ADDR);
- BagState<String> bag2 = underTest.state(NAMESPACE_2, STRING_BAG_ADDR);
- BagState<String> bag3 = underTest.state(NAMESPACE_3, STRING_BAG_ADDR);
-
- bag1.add("Hello");
- bag2.add("World");
- bag1.add("!");
-
- StateMerging.mergeBags(Arrays.asList(bag1, bag2, bag3), bag3);
-
- // Reading the merged bag gets both the contents
- assertThat(bag3.read(), Matchers.containsInAnyOrder("Hello", "World", "!"));
- assertThat(bag1.read(), Matchers.emptyIterable());
- assertThat(bag2.read(), Matchers.emptyIterable());
- }
-
- @Test
- public void testCombiningValue() throws Exception {
- GroupingState<Integer, Integer> value = underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR);
-
- // State instances are cached, but depend on the namespace.
- assertEquals(value, underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR));
- assertFalse(value.equals(underTest.state(NAMESPACE_2, SUM_INTEGER_ADDR)));
-
- assertThat(value.read(), Matchers.equalTo(0));
- value.add(2);
- assertThat(value.read(), Matchers.equalTo(2));
-
- value.add(3);
- assertThat(value.read(), Matchers.equalTo(5));
-
- value.clear();
- assertThat(value.read(), Matchers.equalTo(0));
- assertEquals(underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR), value);
- }
-
- @Test
- public void testCombiningIsEmpty() throws Exception {
- GroupingState<Integer, Integer> value = underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR);
-
- assertThat(value.isEmpty().read(), Matchers.is(true));
- ReadableState<Boolean> readFuture = value.isEmpty();
- value.add(5);
- assertThat(readFuture.read(), Matchers.is(false));
-
- value.clear();
- assertThat(readFuture.read(), Matchers.is(true));
- }
-
- @Test
- public void testMergeCombiningValueIntoSource() throws Exception {
- CombiningState<Integer, int[], Integer> value1 =
- underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR);
- CombiningState<Integer, int[], Integer> value2 =
- underTest.state(NAMESPACE_2, SUM_INTEGER_ADDR);
-
- value1.add(5);
- value2.add(10);
- value1.add(6);
-
- assertThat(value1.read(), Matchers.equalTo(11));
- assertThat(value2.read(), Matchers.equalTo(10));
-
- // Merging clears the old values and updates the result value.
- StateMerging.mergeCombiningValues(Arrays.asList(value1, value2), value1);
-
- assertThat(value1.read(), Matchers.equalTo(21));
- assertThat(value2.read(), Matchers.equalTo(0));
- }
-
- @Test
- public void testMergeCombiningValueIntoNewNamespace() throws Exception {
- CombiningState<Integer, int[], Integer> value1 =
- underTest.state(NAMESPACE_1, SUM_INTEGER_ADDR);
- CombiningState<Integer, int[], Integer> value2 =
- underTest.state(NAMESPACE_2, SUM_INTEGER_ADDR);
- CombiningState<Integer, int[], Integer> value3 =
- underTest.state(NAMESPACE_3, SUM_INTEGER_ADDR);
-
- value1.add(5);
- value2.add(10);
- value1.add(6);
-
- StateMerging.mergeCombiningValues(Arrays.asList(value1, value2), value3);
-
- // Merging clears the old values and updates the result value.
- assertThat(value1.read(), Matchers.equalTo(0));
- assertThat(value2.read(), Matchers.equalTo(0));
- assertThat(value3.read(), Matchers.equalTo(21));
- }
-
- @Test
- public void testWatermarkEarliestState() throws Exception {
- WatermarkHoldState<BoundedWindow> value =
- underTest.state(NAMESPACE_1, WATERMARK_EARLIEST_ADDR);
-
- // State instances are cached, but depend on the namespace.
- assertEquals(value, underTest.state(NAMESPACE_1, WATERMARK_EARLIEST_ADDR));
- assertFalse(value.equals(underTest.state(NAMESPACE_2, WATERMARK_EARLIEST_ADDR)));
-
- assertThat(value.read(), Matchers.nullValue());
- value.add(new Instant(2000));
- assertThat(value.read(), Matchers.equalTo(new Instant(2000)));
-
- value.add(new Instant(3000));
- assertThat(value.read(), Matchers.equalTo(new Instant(2000)));
-
- value.add(new Instant(1000));
- assertThat(value.read(), Matchers.equalTo(new Instant(1000)));
-
- value.clear();
- assertThat(value.read(), Matchers.equalTo(null));
- assertEquals(underTest.state(NAMESPACE_1, WATERMARK_EARLIEST_ADDR), value);
- }
-
- @Test
- public void testWatermarkLatestState() throws Exception {
- WatermarkHoldState<BoundedWindow> value =
- underTest.state(NAMESPACE_1, WATERMARK_LATEST_ADDR);
-
- // State instances are cached, but depend on the namespace.
- assertEquals(value, underTest.state(NAMESPACE_1, WATERMARK_LATEST_ADDR));
- assertFalse(value.equals(underTest.state(NAMESPACE_2, WATERMARK_LATEST_ADDR)));
-
- assertThat(value.read(), Matchers.nullValue());
- value.add(new Instant(2000));
- assertThat(value.read(), Matchers.equalTo(new Instant(2000)));
-
- value.add(new Instant(3000));
- assertThat(value.read(), Matchers.equalTo(new Instant(3000)));
-
- value.add(new Instant(1000));
- assertThat(value.read(), Matchers.equalTo(new Instant(3000)));
-
- value.clear();
- assertThat(value.read(), Matchers.equalTo(null));
- assertEquals(underTest.state(NAMESPACE_1, WATERMARK_LATEST_ADDR), value);
- }
-
- @Test
- public void testWatermarkEndOfWindowState() throws Exception {
- WatermarkHoldState<BoundedWindow> value = underTest.state(NAMESPACE_1, WATERMARK_EOW_ADDR);
-
- // State instances are cached, but depend on the namespace.
- assertEquals(value, underTest.state(NAMESPACE_1, WATERMARK_EOW_ADDR));
- assertFalse(value.equals(underTest.state(NAMESPACE_2, WATERMARK_EOW_ADDR)));
-
- assertThat(value.read(), Matchers.nullValue());
- value.add(new Instant(2000));
- assertThat(value.read(), Matchers.equalTo(new Instant(2000)));
-
- value.clear();
- assertThat(value.read(), Matchers.equalTo(null));
- assertEquals(underTest.state(NAMESPACE_1, WATERMARK_EOW_ADDR), value);
- }
-
- @Test
- public void testWatermarkStateIsEmpty() throws Exception {
- WatermarkHoldState<BoundedWindow> value =
- underTest.state(NAMESPACE_1, WATERMARK_EARLIEST_ADDR);
-
- assertThat(value.isEmpty().read(), Matchers.is(true));
- ReadableState<Boolean> readFuture = value.isEmpty();
- value.add(new Instant(1000));
- assertThat(readFuture.read(), Matchers.is(false));
-
- value.clear();
- assertThat(readFuture.read(), Matchers.is(true));
- }
-
- @Test
- public void testMergeEarliestWatermarkIntoSource() throws Exception {
- WatermarkHoldState<BoundedWindow> value1 =
- underTest.state(NAMESPACE_1, WATERMARK_EARLIEST_ADDR);
- WatermarkHoldState<BoundedWindow> value2 =
- underTest.state(NAMESPACE_2, WATERMARK_EARLIEST_ADDR);
-
- value1.add(new Instant(3000));
- value2.add(new Instant(5000));
- value1.add(new Instant(4000));
- value2.add(new Instant(2000));
-
- // Merging clears the old values and updates the merged value.
- StateMerging.mergeWatermarks(Arrays.asList(value1, value2), value1, WINDOW_1);
-
- assertThat(value1.read(), Matchers.equalTo(new Instant(2000)));
- assertThat(value2.read(), Matchers.equalTo(null));
- }
-
- @Test
- public void testMergeLatestWatermarkIntoSource() throws Exception {
- WatermarkHoldState<BoundedWindow> value1 =
- underTest.state(NAMESPACE_1, WATERMARK_LATEST_ADDR);
- WatermarkHoldState<BoundedWindow> value2 =
- underTest.state(NAMESPACE_2, WATERMARK_LATEST_ADDR);
- WatermarkHoldState<BoundedWindow> value3 =
- underTest.state(NAMESPACE_3, WATERMARK_LATEST_ADDR);
-
- value1.add(new Instant(3000));
- value2.add(new Instant(5000));
- value1.add(new Instant(4000));
- value2.add(new Instant(2000));
-
- // Merging clears the old values and updates the result value.
- StateMerging.mergeWatermarks(Arrays.asList(value1, value2), value3, WINDOW_1);
-
- // Merging clears the old values and updates the result value.
- assertThat(value3.read(), Matchers.equalTo(new Instant(5000)));
- assertThat(value1.read(), Matchers.equalTo(null));
- assertThat(value2.read(), Matchers.equalTo(null));
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/GroupByNullKeyTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/GroupByNullKeyTest.java b/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/GroupByNullKeyTest.java
deleted file mode 100644
index 663b910..0000000
--- a/runners/flink/runner/src/test/java/org/apache/beam/runners/flink/streaming/GroupByNullKeyTest.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink.streaming;
-
-import com.google.common.base.Joiner;
-import java.io.Serializable;
-import java.util.Arrays;
-import org.apache.beam.runners.flink.FlinkTestPipeline;
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.io.TextIO;
-import org.apache.beam.sdk.transforms.Create;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.GroupByKey;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.transforms.windowing.AfterWatermark;
-import org.apache.beam.sdk.transforms.windowing.FixedWindows;
-import org.apache.beam.sdk.transforms.windowing.Window;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.flink.streaming.util.StreamingProgramTestBase;
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-
-/**
- * Test for GroupByNullKey.
- */
-public class GroupByNullKeyTest extends StreamingProgramTestBase implements Serializable {
-
-
- protected String resultPath;
-
- static final String[] EXPECTED_RESULT = new String[] {
- "k: null v: user1 user1 user1 user2 user2 user2 user2 user3"
- };
-
- public GroupByNullKeyTest(){
- }
-
- @Override
- protected void preSubmit() throws Exception {
- resultPath = getTempDirPath("result");
- }
-
- @Override
- protected void postSubmit() throws Exception {
- compareResultsByLinesInMemory(Joiner.on('\n').join(EXPECTED_RESULT), resultPath);
- }
-
- /**
- * DoFn extracting user and timestamp.
- */
- private static class ExtractUserAndTimestamp extends DoFn<KV<Integer, String>, String> {
- @ProcessElement
- public void processElement(ProcessContext c) {
- KV<Integer, String> record = c.element();
- int timestamp = record.getKey();
- String userName = record.getValue();
- if (userName != null) {
- // Sets the implicit timestamp field to be used in windowing.
- c.outputWithTimestamp(userName, new Instant(timestamp));
- }
- }
- }
-
- @Override
- protected void testProgram() throws Exception {
-
- Pipeline p = FlinkTestPipeline.createForStreaming();
-
- PCollection<String> output =
- p.apply(Create.of(Arrays.asList(
- KV.<Integer, String>of(0, "user1"),
- KV.<Integer, String>of(1, "user1"),
- KV.<Integer, String>of(2, "user1"),
- KV.<Integer, String>of(10, "user2"),
- KV.<Integer, String>of(1, "user2"),
- KV.<Integer, String>of(15000, "user2"),
- KV.<Integer, String>of(12000, "user2"),
- KV.<Integer, String>of(25000, "user3"))))
- .apply(ParDo.of(new ExtractUserAndTimestamp()))
- .apply(Window.<String>into(FixedWindows.of(Duration.standardHours(1)))
- .triggering(AfterWatermark.pastEndOfWindow())
- .withAllowedLateness(Duration.ZERO)
- .discardingFiredPanes())
-
- .apply(ParDo.of(new DoFn<String, KV<Void, String>>() {
- @ProcessElement
- public void processElement(ProcessContext c) throws Exception {
- String elem = c.element();
- c.output(KV.<Void, String>of(null, elem));
- }
- }))
- .apply(GroupByKey.<Void, String>create())
- .apply(ParDo.of(new DoFn<KV<Void, Iterable<String>>, String>() {
- @ProcessElement
- public void processElement(ProcessContext c) throws Exception {
- KV<Void, Iterable<String>> elem = c.element();
- StringBuilder str = new StringBuilder();
- str.append("k: " + elem.getKey() + " v:");
- for (String v : elem.getValue()) {
- str.append(" " + v);
- }
- c.output(str.toString());
- }
- }));
- output.apply(TextIO.Write.to(resultPath));
- p.run();
- }
-}
[41/50] [abbrv] beam git commit: This closes #2576
Posted by dh...@apache.org.
This closes #2576
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/83193698
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/83193698
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/83193698
Branch: refs/heads/DSL_SQL
Commit: 83193698d8ea3dc9cb2a3ed8fe6b4ee6b810237c
Parents: 8a00f22 cdd2544
Author: Ismaël Mejía <ie...@apache.org>
Authored: Wed Apr 19 15:07:54 2017 +0200
Committer: Ismaël Mejía <ie...@apache.org>
Committed: Wed Apr 19 15:07:54 2017 +0200
----------------------------------------------------------------------
...PostCommit_Java_ValidatesRunner_Flink.groovy | 2 +-
runners/flink/examples/pom.xml | 130 ---
.../beam/runners/flink/examples/TFIDF.java | 455 --------
.../beam/runners/flink/examples/WordCount.java | 129 ---
.../runners/flink/examples/package-info.java | 22 -
.../flink/examples/streaming/AutoComplete.java | 400 -------
.../flink/examples/streaming/JoinExamples.java | 154 ---
.../examples/streaming/WindowedWordCount.java | 141 ---
.../flink/examples/streaming/package-info.java | 22 -
runners/flink/pom.xml | 275 ++++-
runners/flink/runner/pom.xml | 330 ------
.../flink/DefaultParallelismFactory.java | 39 -
.../flink/FlinkBatchPipelineTranslator.java | 139 ---
.../flink/FlinkBatchTransformTranslators.java | 723 ------------
.../flink/FlinkBatchTranslationContext.java | 153 ---
.../flink/FlinkDetachedRunnerResult.java | 75 --
.../FlinkPipelineExecutionEnvironment.java | 241 ----
.../runners/flink/FlinkPipelineOptions.java | 101 --
.../runners/flink/FlinkPipelineTranslator.java | 53 -
.../apache/beam/runners/flink/FlinkRunner.java | 232 ----
.../runners/flink/FlinkRunnerRegistrar.java | 62 --
.../beam/runners/flink/FlinkRunnerResult.java | 98 --
.../flink/FlinkStreamingPipelineTranslator.java | 276 -----
.../FlinkStreamingTransformTranslators.java | 1044 -----------------
.../flink/FlinkStreamingTranslationContext.java | 130 ---
.../flink/FlinkStreamingViewOverrides.java | 372 -------
.../flink/PipelineTranslationOptimizer.java | 72 --
.../beam/runners/flink/TestFlinkRunner.java | 84 --
.../beam/runners/flink/TranslationMode.java | 31 -
.../apache/beam/runners/flink/package-info.java | 22 -
.../functions/FlinkAggregatorFactory.java | 53 -
.../functions/FlinkAssignContext.java | 63 --
.../functions/FlinkAssignWindows.java | 49 -
.../functions/FlinkDoFnFunction.java | 161 ---
.../FlinkMergingNonShuffleReduceFunction.java | 228 ----
.../FlinkMergingPartialReduceFunction.java | 201 ----
.../functions/FlinkMergingReduceFunction.java | 199 ----
.../FlinkMultiOutputPruningFunction.java | 50 -
.../functions/FlinkNoOpStepContext.java | 73 --
.../functions/FlinkPartialReduceFunction.java | 172 ---
.../functions/FlinkReduceFunction.java | 173 ---
.../functions/FlinkSideInputReader.java | 80 --
.../functions/FlinkStatefulDoFnFunction.java | 198 ----
.../functions/SideInputInitializer.java | 73 --
.../translation/functions/package-info.java | 22 -
.../runners/flink/translation/package-info.java | 22 -
.../translation/types/CoderTypeInformation.java | 120 --
.../translation/types/CoderTypeSerializer.java | 132 ---
.../types/EncodedValueComparator.java | 195 ----
.../types/EncodedValueSerializer.java | 113 --
.../types/EncodedValueTypeInformation.java | 98 --
.../types/InspectableByteArrayOutputStream.java | 34 -
.../flink/translation/types/KvKeySelector.java | 50 -
.../flink/translation/types/package-info.java | 22 -
.../utils/SerializedPipelineOptions.java | 67 --
.../flink/translation/utils/package-info.java | 22 -
.../wrappers/DataInputViewWrapper.java | 58 -
.../wrappers/DataOutputViewWrapper.java | 51 -
.../SerializableFnAggregatorWrapper.java | 98 --
.../translation/wrappers/SourceInputFormat.java | 150 ---
.../translation/wrappers/SourceInputSplit.java | 52 -
.../translation/wrappers/package-info.java | 22 -
.../wrappers/streaming/DoFnOperator.java | 774 -------------
.../streaming/KvToByteBufferKeySelector.java | 56 -
.../streaming/SingletonKeyedWorkItem.java | 56 -
.../streaming/SingletonKeyedWorkItemCoder.java | 126 ---
.../streaming/SplittableDoFnOperator.java | 150 ---
.../wrappers/streaming/WindowDoFnOperator.java | 117 --
.../wrappers/streaming/WorkItemKeySelector.java | 56 -
.../streaming/io/BoundedSourceWrapper.java | 218 ----
.../streaming/io/UnboundedSocketSource.java | 249 -----
.../streaming/io/UnboundedSourceWrapper.java | 476 --------
.../wrappers/streaming/io/package-info.java | 22 -
.../wrappers/streaming/package-info.java | 22 -
.../state/FlinkBroadcastStateInternals.java | 865 --------------
.../state/FlinkKeyGroupStateInternals.java | 487 --------
.../state/FlinkSplitStateInternals.java | 260 -----
.../streaming/state/FlinkStateInternals.java | 1053 ------------------
.../state/KeyGroupCheckpointedOperator.java | 35 -
.../state/KeyGroupRestoringOperator.java | 32 -
.../wrappers/streaming/state/package-info.java | 22 -
.../runner/src/main/resources/log4j.properties | 23 -
.../flink/EncodedValueComparatorTest.java | 70 --
.../runners/flink/FlinkRunnerRegistrarTest.java | 48 -
.../beam/runners/flink/FlinkTestPipeline.java | 72 --
.../beam/runners/flink/PipelineOptionsTest.java | 184 ---
.../beam/runners/flink/ReadSourceITCase.java | 85 --
.../flink/ReadSourceStreamingITCase.java | 74 --
.../beam/runners/flink/WriteSinkITCase.java | 192 ----
.../flink/streaming/DoFnOperatorTest.java | 600 ----------
.../FlinkBroadcastStateInternalsTest.java | 245 ----
.../FlinkKeyGroupStateInternalsTest.java | 262 -----
.../streaming/FlinkSplitStateInternalsTest.java | 101 --
.../streaming/FlinkStateInternalsTest.java | 395 -------
.../flink/streaming/GroupByNullKeyTest.java | 124 ---
.../flink/streaming/TestCountingSource.java | 254 -----
.../streaming/TopWikipediaSessionsITCase.java | 133 ---
.../streaming/UnboundedSourceWrapperTest.java | 464 --------
.../runners/flink/streaming/package-info.java | 22 -
.../src/test/resources/log4j-test.properties | 27 -
.../flink/DefaultParallelismFactory.java | 39 +
.../flink/FlinkBatchPipelineTranslator.java | 139 +++
.../flink/FlinkBatchTransformTranslators.java | 723 ++++++++++++
.../flink/FlinkBatchTranslationContext.java | 153 +++
.../flink/FlinkDetachedRunnerResult.java | 75 ++
.../FlinkPipelineExecutionEnvironment.java | 241 ++++
.../runners/flink/FlinkPipelineOptions.java | 101 ++
.../runners/flink/FlinkPipelineTranslator.java | 53 +
.../apache/beam/runners/flink/FlinkRunner.java | 232 ++++
.../runners/flink/FlinkRunnerRegistrar.java | 62 ++
.../beam/runners/flink/FlinkRunnerResult.java | 98 ++
.../flink/FlinkStreamingPipelineTranslator.java | 276 +++++
.../FlinkStreamingTransformTranslators.java | 1044 +++++++++++++++++
.../flink/FlinkStreamingTranslationContext.java | 130 +++
.../flink/FlinkStreamingViewOverrides.java | 372 +++++++
.../flink/PipelineTranslationOptimizer.java | 72 ++
.../beam/runners/flink/TestFlinkRunner.java | 84 ++
.../beam/runners/flink/TranslationMode.java | 31 +
.../apache/beam/runners/flink/package-info.java | 22 +
.../functions/FlinkAggregatorFactory.java | 53 +
.../functions/FlinkAssignContext.java | 63 ++
.../functions/FlinkAssignWindows.java | 49 +
.../functions/FlinkDoFnFunction.java | 161 +++
.../FlinkMergingNonShuffleReduceFunction.java | 228 ++++
.../FlinkMergingPartialReduceFunction.java | 201 ++++
.../functions/FlinkMergingReduceFunction.java | 199 ++++
.../FlinkMultiOutputPruningFunction.java | 50 +
.../functions/FlinkNoOpStepContext.java | 73 ++
.../functions/FlinkPartialReduceFunction.java | 172 +++
.../functions/FlinkReduceFunction.java | 173 +++
.../functions/FlinkSideInputReader.java | 80 ++
.../functions/FlinkStatefulDoFnFunction.java | 198 ++++
.../functions/SideInputInitializer.java | 73 ++
.../translation/functions/package-info.java | 22 +
.../runners/flink/translation/package-info.java | 22 +
.../translation/types/CoderTypeInformation.java | 120 ++
.../translation/types/CoderTypeSerializer.java | 132 +++
.../types/EncodedValueComparator.java | 195 ++++
.../types/EncodedValueSerializer.java | 113 ++
.../types/EncodedValueTypeInformation.java | 98 ++
.../types/InspectableByteArrayOutputStream.java | 34 +
.../flink/translation/types/KvKeySelector.java | 50 +
.../flink/translation/types/package-info.java | 22 +
.../utils/SerializedPipelineOptions.java | 67 ++
.../flink/translation/utils/package-info.java | 22 +
.../wrappers/DataInputViewWrapper.java | 58 +
.../wrappers/DataOutputViewWrapper.java | 51 +
.../SerializableFnAggregatorWrapper.java | 98 ++
.../translation/wrappers/SourceInputFormat.java | 150 +++
.../translation/wrappers/SourceInputSplit.java | 52 +
.../translation/wrappers/package-info.java | 22 +
.../wrappers/streaming/DoFnOperator.java | 774 +++++++++++++
.../streaming/KvToByteBufferKeySelector.java | 56 +
.../streaming/SingletonKeyedWorkItem.java | 56 +
.../streaming/SingletonKeyedWorkItemCoder.java | 126 +++
.../streaming/SplittableDoFnOperator.java | 150 +++
.../wrappers/streaming/WindowDoFnOperator.java | 117 ++
.../wrappers/streaming/WorkItemKeySelector.java | 56 +
.../streaming/io/BoundedSourceWrapper.java | 218 ++++
.../streaming/io/UnboundedSocketSource.java | 249 +++++
.../streaming/io/UnboundedSourceWrapper.java | 476 ++++++++
.../wrappers/streaming/io/package-info.java | 22 +
.../wrappers/streaming/package-info.java | 22 +
.../state/FlinkBroadcastStateInternals.java | 865 ++++++++++++++
.../state/FlinkKeyGroupStateInternals.java | 487 ++++++++
.../state/FlinkSplitStateInternals.java | 260 +++++
.../streaming/state/FlinkStateInternals.java | 1053 ++++++++++++++++++
.../state/KeyGroupCheckpointedOperator.java | 35 +
.../state/KeyGroupRestoringOperator.java | 32 +
.../wrappers/streaming/state/package-info.java | 22 +
.../flink/src/main/resources/log4j.properties | 23 +
.../flink/EncodedValueComparatorTest.java | 70 ++
.../runners/flink/FlinkRunnerRegistrarTest.java | 48 +
.../beam/runners/flink/FlinkTestPipeline.java | 72 ++
.../beam/runners/flink/PipelineOptionsTest.java | 184 +++
.../beam/runners/flink/ReadSourceITCase.java | 85 ++
.../flink/ReadSourceStreamingITCase.java | 74 ++
.../beam/runners/flink/WriteSinkITCase.java | 192 ++++
.../flink/streaming/DoFnOperatorTest.java | 600 ++++++++++
.../FlinkBroadcastStateInternalsTest.java | 245 ++++
.../FlinkKeyGroupStateInternalsTest.java | 262 +++++
.../streaming/FlinkSplitStateInternalsTest.java | 101 ++
.../streaming/FlinkStateInternalsTest.java | 395 +++++++
.../flink/streaming/GroupByNullKeyTest.java | 124 +++
.../flink/streaming/TestCountingSource.java | 254 +++++
.../streaming/TopWikipediaSessionsITCase.java | 133 +++
.../streaming/UnboundedSourceWrapperTest.java | 464 ++++++++
.../runners/flink/streaming/package-info.java | 22 +
.../src/test/resources/log4j-test.properties | 27 +
189 files changed, 15765 insertions(+), 17293 deletions(-)
----------------------------------------------------------------------
[42/50] [abbrv] beam git commit: [BEAM-1441] Remove deprecated
ChannelFactory
Posted by dh...@apache.org.
[BEAM-1441] Remove deprecated ChannelFactory
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/97c66784
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/97c66784
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/97c66784
Branch: refs/heads/DSL_SQL
Commit: 97c667846b566c312ceaadc66fb14fde1dfa7ebe
Parents: 8319369
Author: Sourabh Bajaj <so...@google.com>
Authored: Fri Apr 14 14:45:16 2017 -0700
Committer: chamikara@google.com <ch...@google.com>
Committed: Wed Apr 19 09:56:28 2017 -0700
----------------------------------------------------------------------
sdks/python/apache_beam/io/fileio.py | 90 -------------------------------
1 file changed, 90 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/97c66784/sdks/python/apache_beam/io/fileio.py
----------------------------------------------------------------------
diff --git a/sdks/python/apache_beam/io/fileio.py b/sdks/python/apache_beam/io/fileio.py
index 8ee5198..f61289e 100644
--- a/sdks/python/apache_beam/io/fileio.py
+++ b/sdks/python/apache_beam/io/fileio.py
@@ -27,7 +27,6 @@ import time
from apache_beam.internal import util
from apache_beam.io import iobase
from apache_beam.io.filesystem import BeamIOError
-from apache_beam.io.filesystem import CompressedFile as _CompressedFile
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystems_util import get_filesystem
from apache_beam.transforms.display import DisplayDataItem
@@ -38,95 +37,6 @@ from apache_beam.utils.value_provider import check_accessible
DEFAULT_SHARD_NAME_TEMPLATE = '-SSSSS-of-NNNNN'
-# TODO(sourabhbajaj): Remove this after BFS API is used everywhere
-class ChannelFactory(object):
- @staticmethod
- def mkdir(path):
- bfs = get_filesystem(path)
- return bfs.mkdirs(path)
-
- @staticmethod
- def open(path,
- mode,
- mime_type='application/octet-stream',
- compression_type=CompressionTypes.AUTO):
- bfs = get_filesystem(path)
- if mode == 'rb':
- return bfs.open(path, mime_type, compression_type)
- elif mode == 'wb':
- return bfs.create(path, mime_type, compression_type)
-
- @staticmethod
- def is_compressed(fileobj):
- return isinstance(fileobj, _CompressedFile)
-
- @staticmethod
- def rename(src, dest):
- bfs = get_filesystem(src)
- return bfs.rename([src], [dest])
-
- @staticmethod
- def rename_batch(src_dest_pairs):
- sources = [s for s, _ in src_dest_pairs]
- destinations = [d for _, d in src_dest_pairs]
- if not sources:
- return []
- bfs = get_filesystem(sources[0])
- try:
- bfs.rename(sources, destinations)
- return []
- except BeamIOError as exp:
- return [(s, d, e) for (s, d), e in exp.exception_details.iteritems()]
-
- @staticmethod
- def copytree(src, dest):
- bfs = get_filesystem(src)
- return bfs.copy([src], [dest])
-
- @staticmethod
- def exists(path):
- bfs = get_filesystem(path)
- return bfs.exists(path)
-
- @staticmethod
- def rmdir(path):
- bfs = get_filesystem(path)
- return bfs.delete([path])
-
- @staticmethod
- def rm(path):
- bfs = get_filesystem(path)
- return bfs.delete([path])
-
- @staticmethod
- def glob(path, limit=None):
- bfs = get_filesystem(path)
- match_result = bfs.match([path], [limit])[0]
- return [f.path for f in match_result.metadata_list]
-
- @staticmethod
- def size_in_bytes(path):
- bfs = get_filesystem(path)
- match_result = bfs.match([path])[0]
- return [f.size_in_bytes for f in match_result.metadata_list][0]
-
- @staticmethod
- def size_of_files_in_glob(path, file_names=None):
- bfs = get_filesystem(path)
- match_result = bfs.match([path])[0]
- part_files = {f.path:f.size_in_bytes for f in match_result.metadata_list}
-
- if file_names is not None:
- specific_files = {}
- match_results = bfs.match(file_names)
- for match_result in match_results:
- for metadata in match_result.metadata_list:
- specific_files[metadata.path] = metadata.size_in_bytes
-
- part_files.update(specific_files)
- return part_files
-
-
class FileSink(iobase.Sink):
"""A sink to a GCS or local files.
[39/50] [abbrv] beam git commit: [BEAM-1994] Remove Flink examples
package
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/pom.xml
----------------------------------------------------------------------
diff --git a/runners/flink/runner/pom.xml b/runners/flink/runner/pom.xml
deleted file mode 100644
index 18343ef..0000000
--- a/runners/flink/runner/pom.xml
+++ /dev/null
@@ -1,330 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.beam</groupId>
- <artifactId>beam-runners-flink-parent</artifactId>
- <version>0.7.0-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <artifactId>beam-runners-flink_2.10</artifactId>
-
- <name>Apache Beam :: Runners :: Flink :: Core</name>
-
- <packaging>jar</packaging>
-
- <profiles>
- <profile>
- <id>local-validates-runner-tests</id>
- <activation><activeByDefault>false</activeByDefault></activation>
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-surefire-plugin</artifactId>
- <executions>
-
- <!-- This configures the inherited validates-runner-tests
- execution to execute with a local Flink instance. -->
- <execution>
- <id>validates-runner-tests</id>
- <phase>integration-test</phase>
- <goals>
- <goal>test</goal>
- </goals>
- <configuration>
- <groups>org.apache.beam.sdk.testing.ValidatesRunner</groups>
- <excludedGroups>
- org.apache.beam.sdk.testing.FlattenWithHeterogeneousCoders,
- org.apache.beam.sdk.testing.UsesSplittableParDo,
- org.apache.beam.sdk.testing.UsesAttemptedMetrics,
- org.apache.beam.sdk.testing.UsesCommittedMetrics,
- org.apache.beam.sdk.testing.UsesTestStream
- </excludedGroups>
- <parallel>none</parallel>
- <failIfNoTests>true</failIfNoTests>
- <dependenciesToScan>
- <dependency>org.apache.beam:beam-sdks-java-core</dependency>
- </dependenciesToScan>
- <systemPropertyVariables>
- <beamTestPipelineOptions>
- [
- "--runner=TestFlinkRunner",
- "--streaming=false"
- ]
- </beamTestPipelineOptions>
- </systemPropertyVariables>
- </configuration>
- </execution>
-
- <!-- This second execution runs the tests in streaming mode -->
- <execution>
- <id>streaming-validates-runner-tests</id>
- <phase>integration-test</phase>
- <goals>
- <goal>test</goal>
- </goals>
- <configuration>
- <groups>org.apache.beam.sdk.testing.ValidatesRunner</groups>
- <excludedGroups>
- org.apache.beam.sdk.testing.FlattenWithHeterogeneousCoders,
- org.apache.beam.sdk.testing.UsesSetState,
- org.apache.beam.sdk.testing.UsesMapState,
- org.apache.beam.sdk.testing.UsesAttemptedMetrics,
- org.apache.beam.sdk.testing.UsesCommittedMetrics,
- org.apache.beam.sdk.testing.UsesTestStream,
- org.apache.beam.sdk.testing.UsesSplittableParDoWithWindowedSideInputs
- </excludedGroups>
- <parallel>none</parallel>
- <failIfNoTests>true</failIfNoTests>
- <dependenciesToScan>
- <dependency>org.apache.beam:beam-sdks-java-core</dependency>
- </dependenciesToScan>
- <systemPropertyVariables>
- <beamTestPipelineOptions>
- [
- "--runner=TestFlinkRunner",
- "--streaming=true"
- ]
- </beamTestPipelineOptions>
- </systemPropertyVariables>
- </configuration>
- </execution>
- </executions>
- </plugin>
- </plugins>
- </build>
- </profile>
- </profiles>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-compiler-plugin</artifactId>
- </plugin>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- </plugin>
-
- <!-- Integration Tests -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-failsafe-plugin</artifactId>
- </plugin>
-
- <!-- Unit Tests -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-surefire-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
- <dependencies>
- <!-- Flink dependencies -->
- <dependency>
- <groupId>org.apache.flink</groupId>
- <artifactId>flink-java</artifactId>
- <version>${flink.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.flink</groupId>
- <artifactId>flink-clients_2.10</artifactId>
- <version>${flink.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.flink</groupId>
- <artifactId>flink-runtime_2.10</artifactId>
- <version>${flink.version}</version>
- </dependency>
-
- <!-- For testing -->
- <dependency>
- <groupId>org.apache.flink</groupId>
- <artifactId>flink-core</artifactId>
- <version>${flink.version}</version>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.apache.flink</groupId>
- <artifactId>flink-runtime_2.10</artifactId>
- <version>${flink.version}</version>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
-
- <!-- Beam -->
- <dependency>
- <groupId>org.apache.beam</groupId>
- <artifactId>beam-sdks-java-core</artifactId>
- <exclusions>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-jdk14</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>org.apache.beam</groupId>
- <artifactId>beam-runners-core-java</artifactId>
- <exclusions>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-jdk14</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>org.apache.beam</groupId>
- <artifactId>beam-runners-core-construction-java</artifactId>
- <exclusions>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-jdk14</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-annotations</artifactId>
- </dependency>
-
- <dependency>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-databind</artifactId>
- </dependency>
-
- <dependency>
- <groupId>com.google.guava</groupId>
- <artifactId>guava</artifactId>
- </dependency>
-
- <dependency>
- <groupId>com.google.code.findbugs</groupId>
- <artifactId>jsr305</artifactId>
- </dependency>
-
- <!--
- Force an upgrade of the Apache Commons version that Flink pulls in, to support DEFLATE compression.
- -->
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-compress</artifactId>
- <scope>runtime</scope>
- </dependency>
-
- <!-- Test scoped -->
- <dependency>
- <groupId>com.google.apis</groupId>
- <artifactId>google-api-services-bigquery</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-lang3</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.hamcrest</groupId>
- <artifactId>hamcrest-all</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.mockito</groupId>
- <artifactId>mockito-all</artifactId>
- <scope>test</scope>
- </dependency>
-
- <!-- Depend on test jar to scan for ValidatesRunner tests -->
- <dependency>
- <groupId>org.apache.beam</groupId>
- <artifactId>beam-sdks-java-core</artifactId>
- <classifier>tests</classifier>
- <scope>test</scope>
- <exclusions>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-jdk14</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>org.apache.flink</groupId>
- <artifactId>flink-streaming-java_2.10</artifactId>
- <version>${flink.version}</version>
- <scope>test</scope>
- <type>test-jar</type>
- </dependency>
- <dependency>
- <groupId>org.apache.flink</groupId>
- <artifactId>flink-test-utils_2.10</artifactId>
- <version>${flink.version}</version>
- <scope>test</scope>
- <exclusions>
- <exclusion>
- <artifactId>apacheds-jdbm1</artifactId>
- <groupId>org.apache.directory.jdbm</groupId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <!-- Optional Pipeline Registration -->
- <dependency>
- <groupId>com.google.auto.service</groupId>
- <artifactId>auto-service</artifactId>
- <optional>true</optional>
- </dependency>
-
- <!-- transitive test dependencies from beam-sdks-java-core -->
- <dependency>
- <groupId>com.fasterxml.jackson.dataformat</groupId>
- <artifactId>jackson-dataformat-yaml</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.apache.beam</groupId>
- <artifactId>beam-sdks-common-fn-api</artifactId>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
- </dependencies>
-</project>
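
The <groups> and <excludedGroups> entries in the surefire executions above select tests by
their JUnit category annotations. As a rough sketch of what such a test looks like on the SDK
side (the pipeline body is purely illustrative, not an actual test from the suite):

import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.testing.ValidatesRunner;
import org.apache.beam.sdk.transforms.Create;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;

public class ExampleValidatesRunnerTest {

  @Rule public final TestPipeline pipeline = TestPipeline.create();

  // Runs only when the surefire execution includes the ValidatesRunner group,
  // e.g. with beamTestPipelineOptions pointing at TestFlinkRunner as configured above.
  @Test
  @Category(ValidatesRunner.class)
  public void testCreate() {
    PAssert.that(pipeline.apply(Create.of(1, 2, 3)))
        .containsInAnyOrder(1, 2, 3);
    pipeline.run();
  }
}
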
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/DefaultParallelismFactory.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/DefaultParallelismFactory.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/DefaultParallelismFactory.java
deleted file mode 100644
index b745f0b..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/DefaultParallelismFactory.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import org.apache.beam.sdk.options.DefaultValueFactory;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.flink.configuration.ConfigConstants;
-import org.apache.flink.configuration.GlobalConfiguration;
-
-/**
- * {@link DefaultValueFactory} for getting a default value for the parallelism option
- * on {@link FlinkPipelineOptions}.
- *
- * <p>This will return either the default value from {@link GlobalConfiguration} or {@code 1}.
- * A valid {@link GlobalConfiguration} is only available if the program is executed by the Flink
- * run scripts.
- */
-public class DefaultParallelismFactory implements DefaultValueFactory<Integer> {
- @Override
- public Integer create(PipelineOptions options) {
- return GlobalConfiguration.loadConfiguration()
- .getInteger(ConfigConstants.DEFAULT_PARALLELISM_KEY, 1);
- }
-}
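
A DefaultValueFactory like the one above is typically wired to a pipeline option through the
@Default.InstanceFactory annotation. A minimal sketch with a hypothetical options interface
(the option name and factory here are illustrative, not the actual FlinkPipelineOptions
definition):

import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.DefaultValueFactory;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;

public interface ExampleParallelismOptions extends PipelineOptions {

  // The factory is only consulted when the user does not set the option explicitly.
  @Description("Degree of parallelism to use when none is given on the command line.")
  @Default.InstanceFactory(FixedParallelismFactory.class)
  Integer getDefaultParallelism();
  void setDefaultParallelism(Integer value);

  /** Hypothetical factory returning a fixed fallback value. */
  class FixedParallelismFactory implements DefaultValueFactory<Integer> {
    @Override
    public Integer create(PipelineOptions options) {
      return 1;
    }
  }
}

PipelineOptionsFactory.as(ExampleParallelismOptions.class).getDefaultParallelism() would then
return 1 unless the option was set explicitly.
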
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkBatchPipelineTranslator.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkBatchPipelineTranslator.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkBatchPipelineTranslator.java
deleted file mode 100644
index 854b674..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkBatchPipelineTranslator.java
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.runners.TransformHierarchy;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.flink.api.java.DataSet;
-import org.apache.flink.api.java.ExecutionEnvironment;
-import org.apache.flink.api.java.io.DiscardingOutputFormat;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * {@link Pipeline.PipelineVisitor} for executing a {@link Pipeline} as a
- * Flink batch job.
- */
-class FlinkBatchPipelineTranslator extends FlinkPipelineTranslator {
-
- private static final Logger LOG = LoggerFactory.getLogger(FlinkBatchPipelineTranslator.class);
-
- /**
- * The necessary context in the case of a batch job.
- */
- private final FlinkBatchTranslationContext batchContext;
-
- private int depth = 0;
-
- public FlinkBatchPipelineTranslator(ExecutionEnvironment env, PipelineOptions options) {
- this.batchContext = new FlinkBatchTranslationContext(env, options);
- }
-
- @Override
- @SuppressWarnings("rawtypes, unchecked")
- public void translate(Pipeline pipeline) {
- super.translate(pipeline);
-
- // terminate dangling DataSets
- for (DataSet<?> dataSet: batchContext.getDanglingDataSets().values()) {
- dataSet.output(new DiscardingOutputFormat());
- }
- }
-
- // --------------------------------------------------------------------------------------------
- // Pipeline Visitor Methods
- // --------------------------------------------------------------------------------------------
-
- @Override
- public CompositeBehavior enterCompositeTransform(TransformHierarchy.Node node) {
- LOG.info("{} enterCompositeTransform- {}", genSpaces(this.depth), node.getFullName());
- this.depth++;
-
- BatchTransformTranslator<?> translator = getTranslator(node);
-
- if (translator != null) {
- applyBatchTransform(node.getTransform(), node, translator);
- LOG.info("{} translated- {}", genSpaces(this.depth), node.getFullName());
- return CompositeBehavior.DO_NOT_ENTER_TRANSFORM;
- } else {
- return CompositeBehavior.ENTER_TRANSFORM;
- }
- }
-
- @Override
- public void leaveCompositeTransform(TransformHierarchy.Node node) {
- this.depth--;
- LOG.info("{} leaveCompositeTransform- {}", genSpaces(this.depth), node.getFullName());
- }
-
- @Override
- public void visitPrimitiveTransform(TransformHierarchy.Node node) {
- LOG.info("{} visitPrimitiveTransform- {}", genSpaces(this.depth), node.getFullName());
-
- // get the transformation corresponding to the node we are
- // currently visiting and translate it into its Flink alternative.
- PTransform<?, ?> transform = node.getTransform();
- BatchTransformTranslator<?> translator =
- FlinkBatchTransformTranslators.getTranslator(transform);
- if (translator == null) {
- LOG.info(node.getTransform().getClass().toString());
- throw new UnsupportedOperationException("The transform " + transform
- + " is currently not supported.");
- }
- applyBatchTransform(transform, node, translator);
- }
-
- private <T extends PTransform<?, ?>> void applyBatchTransform(
- PTransform<?, ?> transform,
- TransformHierarchy.Node node,
- BatchTransformTranslator<?> translator) {
-
- @SuppressWarnings("unchecked")
- T typedTransform = (T) transform;
-
- @SuppressWarnings("unchecked")
- BatchTransformTranslator<T> typedTranslator = (BatchTransformTranslator<T>) translator;
-
- // create the applied PTransform on the batchContext
- batchContext.setCurrentTransform(node.toAppliedPTransform());
- typedTranslator.translateNode(typedTransform, batchContext);
- }
-
- /**
- * A translator of a {@link PTransform}.
- */
- public interface BatchTransformTranslator<TransformT extends PTransform> {
- void translateNode(TransformT transform, FlinkBatchTranslationContext context);
- }
-
- /**
- * Returns a translator for the given node if one exists, otherwise null.
- */
- private static BatchTransformTranslator<?> getTranslator(TransformHierarchy.Node node) {
- PTransform<?, ?> transform = node.getTransform();
-
- // Root of the graph is null
- if (transform == null) {
- return null;
- }
-
- return FlinkBatchTransformTranslators.getTranslator(transform);
- }
-}
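
The translator above follows a simple registry pattern: a map from PTransform class to a
per-transform translator, consulted while walking the transform hierarchy. A stripped-down,
self-contained sketch of that pattern, independent of the Flink-specific context classes
(all names here are illustrative):

import java.util.HashMap;
import java.util.Map;

public class TranslatorRegistrySketch {

  /** Per-transform translation callback, analogous to BatchTransformTranslator above. */
  interface Translator<T> {
    void translate(T transform);
  }

  private static final Map<Class<?>, Translator<?>> TRANSLATORS = new HashMap<>();

  static <T> void register(Class<T> transformClass, Translator<T> translator) {
    TRANSLATORS.put(transformClass, translator);
  }

  /** Returns the registered translator or null, mirroring getTranslator(node). */
  @SuppressWarnings("unchecked")
  static <T> Translator<T> lookup(T transform) {
    return transform == null ? null : (Translator<T>) TRANSLATORS.get(transform.getClass());
  }

  public static void main(String[] args) {
    register(String.class, new Translator<String>() {
      @Override
      public void translate(String transform) {
        System.out.println("translating: " + transform);
      }
    });

    // A composite node with a registered translator is translated and not entered;
    // otherwise the visitor descends into its children, as in the code above.
    Translator<String> translator = lookup("SomeCompositeTransform");
    if (translator != null) {
      translator.translate("SomeCompositeTransform");
    }
  }
}
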
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkBatchTransformTranslators.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkBatchTransformTranslators.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkBatchTransformTranslators.java
deleted file mode 100644
index ff9521c..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkBatchTransformTranslators.java
+++ /dev/null
@@ -1,723 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkState;
-
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import org.apache.beam.runners.flink.translation.functions.FlinkAssignWindows;
-import org.apache.beam.runners.flink.translation.functions.FlinkDoFnFunction;
-import org.apache.beam.runners.flink.translation.functions.FlinkMergingNonShuffleReduceFunction;
-import org.apache.beam.runners.flink.translation.functions.FlinkMergingPartialReduceFunction;
-import org.apache.beam.runners.flink.translation.functions.FlinkMergingReduceFunction;
-import org.apache.beam.runners.flink.translation.functions.FlinkMultiOutputPruningFunction;
-import org.apache.beam.runners.flink.translation.functions.FlinkPartialReduceFunction;
-import org.apache.beam.runners.flink.translation.functions.FlinkReduceFunction;
-import org.apache.beam.runners.flink.translation.functions.FlinkStatefulDoFnFunction;
-import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
-import org.apache.beam.runners.flink.translation.types.KvKeySelector;
-import org.apache.beam.runners.flink.translation.wrappers.SourceInputFormat;
-import org.apache.beam.sdk.coders.CannotProvideCoderException;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.coders.CoderRegistry;
-import org.apache.beam.sdk.coders.KvCoder;
-import org.apache.beam.sdk.coders.ListCoder;
-import org.apache.beam.sdk.coders.VoidCoder;
-import org.apache.beam.sdk.io.BoundedSource;
-import org.apache.beam.sdk.io.Read;
-import org.apache.beam.sdk.transforms.Combine;
-import org.apache.beam.sdk.transforms.CombineFnBase;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.Flatten;
-import org.apache.beam.sdk.transforms.GroupByKey;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.transforms.View;
-import org.apache.beam.sdk.transforms.join.RawUnionValue;
-import org.apache.beam.sdk.transforms.join.UnionCoder;
-import org.apache.beam.sdk.transforms.reflect.DoFnSignature;
-import org.apache.beam.sdk.transforms.reflect.DoFnSignatures;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
-import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
-import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
-import org.apache.beam.sdk.transforms.windowing.Window;
-import org.apache.beam.sdk.transforms.windowing.WindowFn;
-import org.apache.beam.sdk.util.Reshuffle;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.util.WindowingStrategy;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.beam.sdk.values.PValue;
-import org.apache.beam.sdk.values.TupleTag;
-import org.apache.flink.api.common.functions.FilterFunction;
-import org.apache.flink.api.common.functions.FlatMapFunction;
-import org.apache.flink.api.common.typeinfo.TypeInformation;
-import org.apache.flink.api.java.DataSet;
-import org.apache.flink.api.java.operators.DataSource;
-import org.apache.flink.api.java.operators.FlatMapOperator;
-import org.apache.flink.api.java.operators.GroupCombineOperator;
-import org.apache.flink.api.java.operators.GroupReduceOperator;
-import org.apache.flink.api.java.operators.Grouping;
-import org.apache.flink.api.java.operators.MapPartitionOperator;
-import org.apache.flink.api.java.operators.SingleInputUdfOperator;
-import org.apache.flink.util.Collector;
-
-/**
- * Translators for transforming {@link PTransform PTransforms} to
- * Flink {@link DataSet DataSets}.
- */
-class FlinkBatchTransformTranslators {
-
- // --------------------------------------------------------------------------------------------
- // Transform Translator Registry
- // --------------------------------------------------------------------------------------------
-
- @SuppressWarnings("rawtypes")
- private static final Map<
- Class<? extends PTransform>,
- FlinkBatchPipelineTranslator.BatchTransformTranslator> TRANSLATORS = new HashMap<>();
-
- static {
- TRANSLATORS.put(View.CreatePCollectionView.class, new CreatePCollectionViewTranslatorBatch());
-
- TRANSLATORS.put(Combine.PerKey.class, new CombinePerKeyTranslatorBatch());
- TRANSLATORS.put(GroupByKey.class, new GroupByKeyTranslatorBatch());
- TRANSLATORS.put(Reshuffle.class, new ReshuffleTranslatorBatch());
-
- TRANSLATORS.put(Flatten.PCollections.class, new FlattenPCollectionTranslatorBatch());
-
- TRANSLATORS.put(Window.Assign.class, new WindowAssignTranslatorBatch());
-
- TRANSLATORS.put(ParDo.MultiOutput.class, new ParDoTranslatorBatch());
-
- TRANSLATORS.put(Read.Bounded.class, new ReadSourceTranslatorBatch());
- }
-
-
- static FlinkBatchPipelineTranslator.BatchTransformTranslator<?> getTranslator(
- PTransform<?, ?> transform) {
- return TRANSLATORS.get(transform.getClass());
- }
-
- private static class ReadSourceTranslatorBatch<T>
- implements FlinkBatchPipelineTranslator.BatchTransformTranslator<Read.Bounded<T>> {
-
- @Override
- public void translateNode(Read.Bounded<T> transform, FlinkBatchTranslationContext context) {
- String name = transform.getName();
- BoundedSource<T> source = transform.getSource();
- PCollection<T> output = context.getOutput(transform);
-
- TypeInformation<WindowedValue<T>> typeInformation = context.getTypeInfo(output);
-
- DataSource<WindowedValue<T>> dataSource = new DataSource<>(
- context.getExecutionEnvironment(),
- new SourceInputFormat<>(source, context.getPipelineOptions()),
- typeInformation,
- name);
-
- context.setOutputDataSet(output, dataSource);
- }
- }
-
- private static class WindowAssignTranslatorBatch<T>
- implements FlinkBatchPipelineTranslator.BatchTransformTranslator<Window.Assign<T>> {
-
- @Override
- public void translateNode(Window.Assign<T> transform, FlinkBatchTranslationContext context) {
- PValue input = context.getInput(transform);
-
- TypeInformation<WindowedValue<T>> resultTypeInfo =
- context.getTypeInfo(context.getOutput(transform));
-
- DataSet<WindowedValue<T>> inputDataSet = context.getInputDataSet(input);
-
- @SuppressWarnings("unchecked")
- final WindowingStrategy<T, ? extends BoundedWindow> windowingStrategy =
- (WindowingStrategy<T, ? extends BoundedWindow>)
- context.getOutput(transform).getWindowingStrategy();
-
- WindowFn<T, ? extends BoundedWindow> windowFn = windowingStrategy.getWindowFn();
-
- FlinkAssignWindows<T, ? extends BoundedWindow> assignWindowsFunction =
- new FlinkAssignWindows<>(windowFn);
-
- DataSet<WindowedValue<T>> resultDataSet = inputDataSet
- .flatMap(assignWindowsFunction)
- .name(context.getOutput(transform).getName())
- .returns(resultTypeInfo);
-
- context.setOutputDataSet(context.getOutput(transform), resultDataSet);
- }
- }
-
- private static class GroupByKeyTranslatorBatch<K, InputT>
- implements FlinkBatchPipelineTranslator.BatchTransformTranslator<GroupByKey<K, InputT>> {
-
- @Override
- public void translateNode(
- GroupByKey<K, InputT> transform,
- FlinkBatchTranslationContext context) {
-
- // for now, this is copied from the Combine.PerKey translator. Once we have the new runner API
- // we can replace GroupByKey with a Combine.PerKey using the Concatenate CombineFn
-
- DataSet<WindowedValue<KV<K, InputT>>> inputDataSet =
- context.getInputDataSet(context.getInput(transform));
-
- Combine.KeyedCombineFn<K, InputT, List<InputT>, List<InputT>> combineFn =
- new Concatenate<InputT>().asKeyedFn();
-
- KvCoder<K, InputT> inputCoder =
- (KvCoder<K, InputT>) context.getInput(transform).getCoder();
-
- Coder<List<InputT>> accumulatorCoder;
-
- try {
- accumulatorCoder =
- combineFn.getAccumulatorCoder(
- context.getInput(transform).getPipeline().getCoderRegistry(),
- inputCoder.getKeyCoder(),
- inputCoder.getValueCoder());
- } catch (CannotProvideCoderException e) {
- throw new RuntimeException(e);
- }
-
- WindowingStrategy<?, ?> windowingStrategy =
- context.getInput(transform).getWindowingStrategy();
-
- TypeInformation<WindowedValue<KV<K, List<InputT>>>> partialReduceTypeInfo =
- new CoderTypeInformation<>(
- WindowedValue.getFullCoder(
- KvCoder.of(inputCoder.getKeyCoder(), accumulatorCoder),
- windowingStrategy.getWindowFn().windowCoder()));
-
-
- Grouping<WindowedValue<KV<K, InputT>>> inputGrouping =
- inputDataSet.groupBy(new KvKeySelector<InputT, K>(inputCoder.getKeyCoder()));
-
- FlinkPartialReduceFunction<K, InputT, List<InputT>, ?> partialReduceFunction;
- FlinkReduceFunction<K, List<InputT>, List<InputT>, ?> reduceFunction;
-
- if (windowingStrategy.getWindowFn().isNonMerging()) {
- @SuppressWarnings("unchecked")
- WindowingStrategy<?, BoundedWindow> boundedStrategy =
- (WindowingStrategy<?, BoundedWindow>) windowingStrategy;
-
- partialReduceFunction = new FlinkPartialReduceFunction<>(
- combineFn,
- boundedStrategy,
- Collections.<PCollectionView<?>, WindowingStrategy<?, ?>>emptyMap(),
- context.getPipelineOptions());
-
- reduceFunction = new FlinkReduceFunction<>(
- combineFn,
- boundedStrategy,
- Collections.<PCollectionView<?>, WindowingStrategy<?, ?>>emptyMap(),
- context.getPipelineOptions());
-
- } else {
- if (!windowingStrategy.getWindowFn().windowCoder().equals(IntervalWindow.getCoder())) {
- throw new UnsupportedOperationException(
- "Merging WindowFn with windows other than IntervalWindow are not supported.");
- }
-
- @SuppressWarnings("unchecked")
- WindowingStrategy<?, IntervalWindow> intervalStrategy =
- (WindowingStrategy<?, IntervalWindow>) windowingStrategy;
-
- partialReduceFunction = new FlinkMergingPartialReduceFunction<>(
- combineFn,
- intervalStrategy,
- Collections.<PCollectionView<?>, WindowingStrategy<?, ?>>emptyMap(),
- context.getPipelineOptions());
-
- reduceFunction = new FlinkMergingReduceFunction<>(
- combineFn,
- intervalStrategy,
- Collections.<PCollectionView<?>, WindowingStrategy<?, ?>>emptyMap(),
- context.getPipelineOptions());
- }
-
- // Partially GroupReduce the values into the intermediate format AccumT (combine)
- GroupCombineOperator<
- WindowedValue<KV<K, InputT>>,
- WindowedValue<KV<K, List<InputT>>>> groupCombine =
- new GroupCombineOperator<>(
- inputGrouping,
- partialReduceTypeInfo,
- partialReduceFunction,
- "GroupCombine: " + transform.getName());
-
- Grouping<WindowedValue<KV<K, List<InputT>>>> intermediateGrouping =
- groupCombine.groupBy(new KvKeySelector<List<InputT>, K>(inputCoder.getKeyCoder()));
-
- // Fully reduce the values and create output format VO
- GroupReduceOperator<
- WindowedValue<KV<K, List<InputT>>>, WindowedValue<KV<K, List<InputT>>>> outputDataSet =
- new GroupReduceOperator<>(
- intermediateGrouping, partialReduceTypeInfo, reduceFunction, transform.getName());
-
- context.setOutputDataSet(context.getOutput(transform), outputDataSet);
-
- }
-
- }
-
- private static class ReshuffleTranslatorBatch<K, InputT>
- implements FlinkBatchPipelineTranslator.BatchTransformTranslator<Reshuffle<K, InputT>> {
-
- @Override
- public void translateNode(
- Reshuffle<K, InputT> transform,
- FlinkBatchTranslationContext context) {
-
- DataSet<WindowedValue<KV<K, InputT>>> inputDataSet =
- context.getInputDataSet(context.getInput(transform));
-
- context.setOutputDataSet(context.getOutput(transform), inputDataSet.rebalance());
-
- }
-
- }
-
- /**
- * Combiner that combines {@code T}s into a single {@code List<T>} containing all inputs.
- *
- * <p>For internal use to translate {@link GroupByKey}. For a large {@link PCollection} this
- * is expected to crash!
- *
- * <p>This is copied from the dataflow runner code.
- *
- * @param <T> the type of elements to concatenate.
- */
- private static class Concatenate<T> extends Combine.CombineFn<T, List<T>, List<T>> {
- @Override
- public List<T> createAccumulator() {
- return new ArrayList<>();
- }
-
- @Override
- public List<T> addInput(List<T> accumulator, T input) {
- accumulator.add(input);
- return accumulator;
- }
-
- @Override
- public List<T> mergeAccumulators(Iterable<List<T>> accumulators) {
- List<T> result = createAccumulator();
- for (List<T> accumulator : accumulators) {
- result.addAll(accumulator);
- }
- return result;
- }
-
- @Override
- public List<T> extractOutput(List<T> accumulator) {
- return accumulator;
- }
-
- @Override
- public Coder<List<T>> getAccumulatorCoder(CoderRegistry registry, Coder<T> inputCoder) {
- return ListCoder.of(inputCoder);
- }
-
- @Override
- public Coder<List<T>> getDefaultOutputCoder(CoderRegistry registry, Coder<T> inputCoder) {
- return ListCoder.of(inputCoder);
- }
- }
-
-
- private static class CombinePerKeyTranslatorBatch<K, InputT, AccumT, OutputT>
- implements FlinkBatchPipelineTranslator.BatchTransformTranslator<
- Combine.PerKey<K, InputT, OutputT>> {
-
- @Override
- @SuppressWarnings("unchecked")
- public void translateNode(
- Combine.PerKey<K, InputT, OutputT> transform,
- FlinkBatchTranslationContext context) {
- DataSet<WindowedValue<KV<K, InputT>>> inputDataSet =
- context.getInputDataSet(context.getInput(transform));
-
- CombineFnBase.PerKeyCombineFn<K, InputT, AccumT, OutputT> combineFn =
- (CombineFnBase.PerKeyCombineFn<K, InputT, AccumT, OutputT>) transform.getFn();
-
- KvCoder<K, InputT> inputCoder =
- (KvCoder<K, InputT>) context.getInput(transform).getCoder();
-
- Coder<AccumT> accumulatorCoder;
-
- try {
- accumulatorCoder =
- combineFn.getAccumulatorCoder(
- context.getInput(transform).getPipeline().getCoderRegistry(),
- inputCoder.getKeyCoder(),
- inputCoder.getValueCoder());
- } catch (CannotProvideCoderException e) {
- throw new RuntimeException(e);
- }
-
- WindowingStrategy<?, ?> windowingStrategy =
- context.getInput(transform).getWindowingStrategy();
-
- TypeInformation<WindowedValue<KV<K, AccumT>>> partialReduceTypeInfo =
- context.getTypeInfo(
- KvCoder.of(inputCoder.getKeyCoder(), accumulatorCoder),
- windowingStrategy);
-
- Grouping<WindowedValue<KV<K, InputT>>> inputGrouping =
- inputDataSet.groupBy(new KvKeySelector<InputT, K>(inputCoder.getKeyCoder()));
-
- // construct a map from side input to WindowingStrategy so that
- // the DoFn runner can map main-input windows to side input windows
- Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
- for (PCollectionView<?> sideInput: transform.getSideInputs()) {
- sideInputStrategies.put(sideInput, sideInput.getWindowingStrategyInternal());
- }
-
- if (windowingStrategy.getWindowFn().isNonMerging()) {
- WindowingStrategy<?, BoundedWindow> boundedStrategy =
- (WindowingStrategy<?, BoundedWindow>) windowingStrategy;
-
- FlinkPartialReduceFunction<K, InputT, AccumT, ?> partialReduceFunction =
- new FlinkPartialReduceFunction<>(
- combineFn,
- boundedStrategy,
- sideInputStrategies,
- context.getPipelineOptions());
-
- FlinkReduceFunction<K, AccumT, OutputT, ?> reduceFunction =
- new FlinkReduceFunction<>(
- combineFn,
- boundedStrategy,
- sideInputStrategies,
- context.getPipelineOptions());
-
- // Partially GroupReduce the values into the intermediate format AccumT (combine)
- GroupCombineOperator<
- WindowedValue<KV<K, InputT>>,
- WindowedValue<KV<K, AccumT>>> groupCombine =
- new GroupCombineOperator<>(
- inputGrouping,
- partialReduceTypeInfo,
- partialReduceFunction,
- "GroupCombine: " + transform.getName());
-
- transformSideInputs(transform.getSideInputs(), groupCombine, context);
-
- TypeInformation<WindowedValue<KV<K, OutputT>>> reduceTypeInfo =
- context.getTypeInfo(context.getOutput(transform));
-
- Grouping<WindowedValue<KV<K, AccumT>>> intermediateGrouping =
- groupCombine.groupBy(new KvKeySelector<AccumT, K>(inputCoder.getKeyCoder()));
-
- // Fully reduce the values and create output format OutputT
- GroupReduceOperator<
- WindowedValue<KV<K, AccumT>>, WindowedValue<KV<K, OutputT>>> outputDataSet =
- new GroupReduceOperator<>(
- intermediateGrouping, reduceTypeInfo, reduceFunction, transform.getName());
-
- transformSideInputs(transform.getSideInputs(), outputDataSet, context);
-
- context.setOutputDataSet(context.getOutput(transform), outputDataSet);
-
- } else {
- if (!windowingStrategy.getWindowFn().windowCoder().equals(IntervalWindow.getCoder())) {
- throw new UnsupportedOperationException(
- "Merging WindowFn with windows other than IntervalWindow are not supported.");
- }
-
- // for merging windows we can't do a pre-shuffle combine step since
- // elements would not be in their correct windows for side-input access
-
- WindowingStrategy<?, IntervalWindow> intervalStrategy =
- (WindowingStrategy<?, IntervalWindow>) windowingStrategy;
-
- FlinkMergingNonShuffleReduceFunction<K, InputT, AccumT, OutputT, ?> reduceFunction =
- new FlinkMergingNonShuffleReduceFunction<>(
- combineFn,
- intervalStrategy,
- sideInputStrategies,
- context.getPipelineOptions());
-
- TypeInformation<WindowedValue<KV<K, OutputT>>> reduceTypeInfo =
- context.getTypeInfo(context.getOutput(transform));
-
- Grouping<WindowedValue<KV<K, InputT>>> grouping =
- inputDataSet.groupBy(new KvKeySelector<InputT, K>(inputCoder.getKeyCoder()));
-
- // Fully reduce the values and create output format OutputT
- GroupReduceOperator<
- WindowedValue<KV<K, InputT>>, WindowedValue<KV<K, OutputT>>> outputDataSet =
- new GroupReduceOperator<>(
- grouping, reduceTypeInfo, reduceFunction, transform.getName());
-
- transformSideInputs(transform.getSideInputs(), outputDataSet, context);
-
- context.setOutputDataSet(context.getOutput(transform), outputDataSet);
- }
-
-
- }
- }
-
- private static void rejectSplittable(DoFn<?, ?> doFn) {
- DoFnSignature signature = DoFnSignatures.getSignature(doFn.getClass());
- if (signature.processElement().isSplittable()) {
- throw new UnsupportedOperationException(
- String.format(
- "%s does not currently support splittable DoFn: %s",
- FlinkRunner.class.getSimpleName(), doFn));
- }
- }
-
- private static class ParDoTranslatorBatch<InputT, OutputT>
- implements FlinkBatchPipelineTranslator.BatchTransformTranslator<
- ParDo.MultiOutput<InputT, OutputT>> {
-
- @Override
- @SuppressWarnings("unchecked")
- public void translateNode(
- ParDo.MultiOutput<InputT, OutputT> transform,
- FlinkBatchTranslationContext context) {
- DoFn<InputT, OutputT> doFn = transform.getFn();
- rejectSplittable(doFn);
- DataSet<WindowedValue<InputT>> inputDataSet =
- context.getInputDataSet(context.getInput(transform));
-
- Map<TupleTag<?>, PValue> outputs = context.getOutputs(transform);
-
- Map<TupleTag<?>, Integer> outputMap = Maps.newHashMap();
- // put the main output at index 0, FlinkMultiOutputDoFnFunction expects this
- outputMap.put(transform.getMainOutputTag(), 0);
- int count = 1;
- for (TupleTag<?> tag : outputs.keySet()) {
- if (!outputMap.containsKey(tag)) {
- outputMap.put(tag, count++);
- }
- }
-
- // assume that the windowing strategy is the same for all outputs
- WindowingStrategy<?, ?> windowingStrategy = null;
-
- // collect all output Coders and create a UnionCoder for our tagged outputs
- List<Coder<?>> outputCoders = Lists.newArrayList();
- for (PValue taggedValue : outputs.values()) {
- checkState(
- taggedValue instanceof PCollection,
- "Within ParDo, got a non-PCollection output %s of type %s",
- taggedValue,
- taggedValue.getClass().getSimpleName());
- PCollection<?> coll = (PCollection<?>) taggedValue;
- outputCoders.add(coll.getCoder());
- windowingStrategy = coll.getWindowingStrategy();
- }
-
- if (windowingStrategy == null) {
- throw new IllegalStateException("No outputs defined.");
- }
-
- UnionCoder unionCoder = UnionCoder.of(outputCoders);
-
- TypeInformation<WindowedValue<RawUnionValue>> typeInformation =
- new CoderTypeInformation<>(
- WindowedValue.getFullCoder(
- unionCoder,
- windowingStrategy.getWindowFn().windowCoder()));
-
- List<PCollectionView<?>> sideInputs = transform.getSideInputs();
-
- // construct a map from side input to WindowingStrategy so that
- // the DoFn runner can map main-input windows to side input windows
- Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
- for (PCollectionView<?> sideInput: sideInputs) {
- sideInputStrategies.put(sideInput, sideInput.getWindowingStrategyInternal());
- }
-
- SingleInputUdfOperator<WindowedValue<InputT>, WindowedValue<RawUnionValue>, ?> outputDataSet;
- DoFnSignature signature = DoFnSignatures.getSignature(transform.getFn().getClass());
- if (signature.stateDeclarations().size() > 0
- || signature.timerDeclarations().size() > 0) {
-
- // Based on the fact that the signature is stateful, DoFnSignatures ensures
- // that it is also keyed
- KvCoder<?, InputT> inputCoder =
- (KvCoder<?, InputT>) context.getInput(transform).getCoder();
-
- FlinkStatefulDoFnFunction<?, ?, OutputT> doFnWrapper = new FlinkStatefulDoFnFunction<>(
- (DoFn) doFn, windowingStrategy, sideInputStrategies, context.getPipelineOptions(),
- outputMap, transform.getMainOutputTag()
- );
-
- Grouping<WindowedValue<InputT>> grouping =
- inputDataSet.groupBy(new KvKeySelector(inputCoder.getKeyCoder()));
-
- outputDataSet =
- new GroupReduceOperator(grouping, typeInformation, doFnWrapper, transform.getName());
-
- } else {
- FlinkDoFnFunction<InputT, RawUnionValue> doFnWrapper =
- new FlinkDoFnFunction(
- doFn,
- windowingStrategy,
- sideInputStrategies,
- context.getPipelineOptions(),
- outputMap,
- transform.getMainOutputTag());
-
- outputDataSet = new MapPartitionOperator<>(
- inputDataSet, typeInformation,
- doFnWrapper, transform.getName());
-
- }
-
- transformSideInputs(sideInputs, outputDataSet, context);
-
- for (Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
- pruneOutput(
- outputDataSet,
- context,
- outputMap.get(output.getKey()),
- (PCollection) output.getValue());
- }
-
- }
-
- private <T> void pruneOutput(
- DataSet<WindowedValue<RawUnionValue>> taggedDataSet,
- FlinkBatchTranslationContext context,
- int integerTag,
- PCollection<T> collection) {
- TypeInformation<WindowedValue<T>> outputType = context.getTypeInfo(collection);
-
- FlinkMultiOutputPruningFunction<T> pruningFunction =
- new FlinkMultiOutputPruningFunction<>(integerTag);
-
- FlatMapOperator<WindowedValue<RawUnionValue>, WindowedValue<T>> pruningOperator =
- new FlatMapOperator<>(
- taggedDataSet,
- outputType,
- pruningFunction,
- collection.getName());
-
- context.setOutputDataSet(collection, pruningOperator);
- }
- }
-
- private static class FlattenPCollectionTranslatorBatch<T>
- implements FlinkBatchPipelineTranslator.BatchTransformTranslator<
- Flatten.PCollections<T>> {
-
- @Override
- @SuppressWarnings("unchecked")
- public void translateNode(
- Flatten.PCollections<T> transform,
- FlinkBatchTranslationContext context) {
-
- Map<TupleTag<?>, PValue> allInputs = context.getInputs(transform);
- DataSet<WindowedValue<T>> result = null;
-
- if (allInputs.isEmpty()) {
-
- // create an empty dummy source to satisfy downstream operations
- // we cannot create an empty source in Flink, therefore we have to
- // add the flatMap that simply never forwards the single element
- DataSource<String> dummySource =
- context.getExecutionEnvironment().fromElements("dummy");
- result = dummySource.flatMap(new FlatMapFunction<String, WindowedValue<T>>() {
- @Override
- public void flatMap(String s, Collector<WindowedValue<T>> collector) throws Exception {
- // never return anything
- }
- }).returns(
- new CoderTypeInformation<>(
- WindowedValue.getFullCoder(
- (Coder<T>) VoidCoder.of(),
- GlobalWindow.Coder.INSTANCE)));
- } else {
- for (PValue taggedPc : allInputs.values()) {
- checkArgument(
- taggedPc instanceof PCollection,
- "Got non-PCollection input to flatten: %s of type %s",
- taggedPc,
- taggedPc.getClass().getSimpleName());
- PCollection<T> collection = (PCollection<T>) taggedPc;
- DataSet<WindowedValue<T>> current = context.getInputDataSet(collection);
- if (result == null) {
- result = current;
- } else {
- result = result.union(current);
- }
- }
- }
-
- // insert a dummy filter, there seems to be a bug in Flink
- // that produces duplicate elements after the union in some cases
- // if we don't
- result = result.filter(new FilterFunction<WindowedValue<T>>() {
- @Override
- public boolean filter(WindowedValue<T> tWindowedValue) throws Exception {
- return true;
- }
- }).name("UnionFixFilter");
- context.setOutputDataSet(context.getOutput(transform), result);
- }
- }
-
- private static class CreatePCollectionViewTranslatorBatch<ElemT, ViewT>
- implements FlinkBatchPipelineTranslator.BatchTransformTranslator<
- View.CreatePCollectionView<ElemT, ViewT>> {
-
- @Override
- public void translateNode(
- View.CreatePCollectionView<ElemT, ViewT> transform,
- FlinkBatchTranslationContext context) {
- DataSet<WindowedValue<ElemT>> inputDataSet =
- context.getInputDataSet(context.getInput(transform));
-
- PCollectionView<ViewT> input = transform.getView();
-
- context.setSideInputDataSet(input, inputDataSet);
- }
- }
-
- private static void transformSideInputs(
- List<PCollectionView<?>> sideInputs,
- SingleInputUdfOperator<?, ?, ?> outputDataSet,
- FlinkBatchTranslationContext context) {
- // get corresponding Flink broadcast DataSets
- for (PCollectionView<?> input : sideInputs) {
- DataSet<?> broadcastSet = context.getSideInputDataSet(input);
- outputDataSet.withBroadcastSet(broadcastSet, input.getTagInternal().getId());
- }
- }
-
- private FlinkBatchTransformTranslators() {}
-
-}
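
The Concatenate combiner defined above is the standard "collect everything into a list"
CombineFn, and the GroupByKey translation applies it as a keyed combine. A small
self-contained sketch of the same kind of CombineFn used directly in a pipeline, against the
same CombineFn API shown in this file (class and pipeline names are illustrative; this is not
how the translator itself invokes it):

import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.CoderRegistry;
import org.apache.beam.sdk.coders.ListCoder;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class ConcatenateSketch {

  /** Same shape as the private Concatenate combiner above: every input ends up in one list. */
  static class ToList<T> extends Combine.CombineFn<T, List<T>, List<T>> {
    @Override public List<T> createAccumulator() { return new ArrayList<>(); }
    @Override public List<T> addInput(List<T> accumulator, T input) {
      accumulator.add(input);
      return accumulator;
    }
    @Override public List<T> mergeAccumulators(Iterable<List<T>> accumulators) {
      List<T> result = createAccumulator();
      for (List<T> accumulator : accumulators) {
        result.addAll(accumulator);
      }
      return result;
    }
    @Override public List<T> extractOutput(List<T> accumulator) { return accumulator; }
    @Override public Coder<List<T>> getAccumulatorCoder(CoderRegistry registry, Coder<T> inputCoder) {
      return ListCoder.of(inputCoder);
    }
    @Override public Coder<List<T>> getDefaultOutputCoder(CoderRegistry registry, Coder<T> inputCoder) {
      return ListCoder.of(inputCoder);
    }
  }

  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    PCollection<KV<String, Integer>> input =
        p.apply(Create.of(KV.of("a", 1), KV.of("a", 2), KV.of("b", 3)));

    // Per-key combine: all values for a key are gathered into a single List, which is
    // essentially what the GroupByKey translation above does for the whole PCollection.
    input.apply(Combine.<String, Integer, List<Integer>>perKey(new ToList<Integer>()));

    p.run();
  }
}
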
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkBatchTranslationContext.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkBatchTranslationContext.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkBatchTranslationContext.java
deleted file mode 100644
index 98dd0fb..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkBatchTranslationContext.java
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import com.google.common.collect.Iterables;
-import java.util.HashMap;
-import java.util.Map;
-import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.transforms.AppliedPTransform;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.util.WindowedValue;
-import org.apache.beam.sdk.util.WindowingStrategy;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.beam.sdk.values.PCollectionView;
-import org.apache.beam.sdk.values.PValue;
-import org.apache.beam.sdk.values.TupleTag;
-import org.apache.flink.api.common.typeinfo.TypeInformation;
-import org.apache.flink.api.java.DataSet;
-import org.apache.flink.api.java.ExecutionEnvironment;
-
-/**
- * Helper for {@link FlinkBatchPipelineTranslator} and translators in
- * {@link FlinkBatchTransformTranslators}.
- */
-class FlinkBatchTranslationContext {
-
- private final Map<PValue, DataSet<?>> dataSets;
- private final Map<PCollectionView<?>, DataSet<?>> broadcastDataSets;
-
- /**
- * For keeping track of which DataSets don't have a successor. We
- * need to terminate these with a discarding sink because the Beam
- * model allows dangling operations.
- */
- private final Map<PValue, DataSet<?>> danglingDataSets;
-
- private final ExecutionEnvironment env;
- private final PipelineOptions options;
-
- private AppliedPTransform<?, ?, ?> currentTransform;
-
- // ------------------------------------------------------------------------
-
- public FlinkBatchTranslationContext(ExecutionEnvironment env, PipelineOptions options) {
- this.env = env;
- this.options = options;
- this.dataSets = new HashMap<>();
- this.broadcastDataSets = new HashMap<>();
-
- this.danglingDataSets = new HashMap<>();
- }
-
- // ------------------------------------------------------------------------
-
- public Map<PValue, DataSet<?>> getDanglingDataSets() {
- return danglingDataSets;
- }
-
- public ExecutionEnvironment getExecutionEnvironment() {
- return env;
- }
-
- public PipelineOptions getPipelineOptions() {
- return options;
- }
-
- @SuppressWarnings("unchecked")
- public <T> DataSet<WindowedValue<T>> getInputDataSet(PValue value) {
- // assume that the DataSet is used as an input if retrieved here
- danglingDataSets.remove(value);
- return (DataSet<WindowedValue<T>>) dataSets.get(value);
- }
-
- public <T> void setOutputDataSet(PValue value, DataSet<WindowedValue<T>> set) {
- if (!dataSets.containsKey(value)) {
- dataSets.put(value, set);
- danglingDataSets.put(value, set);
- }
- }
-
- /**
- * Sets the AppliedPTransform which carries input/output.
- * @param currentTransform the applied transform currently being translated
- */
- public void setCurrentTransform(AppliedPTransform<?, ?, ?> currentTransform) {
- this.currentTransform = currentTransform;
- }
-
- @SuppressWarnings("unchecked")
- public <T> DataSet<T> getSideInputDataSet(PCollectionView<?> value) {
- return (DataSet<T>) broadcastDataSets.get(value);
- }
-
- public <ViewT, ElemT> void setSideInputDataSet(
- PCollectionView<ViewT> value,
- DataSet<WindowedValue<ElemT>> set) {
- if (!broadcastDataSets.containsKey(value)) {
- broadcastDataSets.put(value, set);
- }
- }
-
- @SuppressWarnings("unchecked")
- public <T> TypeInformation<WindowedValue<T>> getTypeInfo(PCollection<T> collection) {
- return getTypeInfo(collection.getCoder(), collection.getWindowingStrategy());
- }
-
- @SuppressWarnings("unchecked")
- public <T> TypeInformation<WindowedValue<T>> getTypeInfo(
- Coder<T> coder,
- WindowingStrategy<?, ?> windowingStrategy) {
- WindowedValue.FullWindowedValueCoder<T> windowedValueCoder =
- WindowedValue.getFullCoder(
- coder,
- windowingStrategy.getWindowFn().windowCoder());
-
- return new CoderTypeInformation<>(windowedValueCoder);
- }
-
- Map<TupleTag<?>, PValue> getInputs(PTransform<?, ?> transform) {
- return currentTransform.getInputs();
- }
-
- @SuppressWarnings("unchecked")
- <T extends PValue> T getInput(PTransform<T, ?> transform) {
- return (T) Iterables.getOnlyElement(currentTransform.getInputs().values());
- }
-
- Map<TupleTag<?>, PValue> getOutputs(PTransform<?, ?> transform) {
- return currentTransform.getOutputs();
- }
-
- @SuppressWarnings("unchecked")
- <T extends PValue> T getOutput(PTransform<?, T> transform) {
- return (T) Iterables.getOnlyElement(currentTransform.getOutputs().values());
- }
-}
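
The context above tracks "dangling" DataSets: every output is treated as dangling until some
later transform reads it as an input, and whatever remains at the end is terminated with a
discarding sink. That bookkeeping reduces to two collections; a generic sketch of the idea,
without the Flink or Beam types (all names are illustrative):

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class DanglingOutputTrackerSketch {

  private final Map<String, Object> outputs = new HashMap<>();
  private final Set<String> dangling = new HashSet<>();

  /** Record an output; until someone consumes it, it counts as dangling. */
  void setOutput(String id, Object dataSet) {
    if (!outputs.containsKey(id)) {
      outputs.put(id, dataSet);
      dangling.add(id);
    }
  }

  /** Reading an output as an input means it has a successor, so it is no longer dangling. */
  Object getInput(String id) {
    dangling.remove(id);
    return outputs.get(id);
  }

  /** Whatever is still dangling at the end would get a discarding sink attached. */
  Set<String> remainingDangling() {
    return dangling;
  }

  public static void main(String[] args) {
    DanglingOutputTrackerSketch context = new DanglingOutputTrackerSketch();
    context.setOutput("read", new Object());
    context.setOutput("pardo", new Object());
    context.getInput("read");                        // "read" now has a successor
    System.out.println(context.remainingDangling()); // prints [pardo]
  }
}
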
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkDetachedRunnerResult.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkDetachedRunnerResult.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkDetachedRunnerResult.java
deleted file mode 100644
index bf4395f..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkDetachedRunnerResult.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import java.io.IOException;
-
-import org.apache.beam.sdk.AggregatorRetrievalException;
-import org.apache.beam.sdk.AggregatorValues;
-import org.apache.beam.sdk.PipelineResult;
-import org.apache.beam.sdk.metrics.MetricResults;
-import org.apache.beam.sdk.transforms.Aggregator;
-import org.joda.time.Duration;
-
-
-/**
- * Result of a detached execution of a {@link org.apache.beam.sdk.Pipeline} with Flink.
- * In detached execution, results and job execution are currently unavailable.
- */
-public class FlinkDetachedRunnerResult implements PipelineResult {
-
- FlinkDetachedRunnerResult() {}
-
- @Override
- public State getState() {
- return State.UNKNOWN;
- }
-
- @Override
- public <T> AggregatorValues<T> getAggregatorValues(final Aggregator<?, T> aggregator)
- throws AggregatorRetrievalException {
- throw new AggregatorRetrievalException(
- "Accumulators can't be retrieved for detached Job executions.",
- new UnsupportedOperationException());
- }
-
- @Override
- public MetricResults metrics() {
- throw new UnsupportedOperationException("The FlinkRunner does not currently support metrics.");
- }
-
- @Override
- public State cancel() throws IOException {
- throw new UnsupportedOperationException("Cancelling is not yet supported.");
- }
-
- @Override
- public State waitUntilFinish() {
- return State.UNKNOWN;
- }
-
- @Override
- public State waitUntilFinish(Duration duration) {
- return State.UNKNOWN;
- }
-
- @Override
- public String toString() {
- return "FlinkDetachedRunnerResult{}";
- }
-}
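
From a caller's point of view the detached result above mostly means there is nothing to wait
on: the state stays UNKNOWN, metrics are unsupported, and waitUntilFinish() returns
immediately. A brief sketch of guarded caller code under that assumption (purely
illustrative):

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class DetachedResultHandlingSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    // ... transforms would be applied here ...

    PipelineResult result = pipeline.run();

    // With a detached Flink submission the state is UNKNOWN, so only attempt to block
    // when the runner reports a real state.
    if (result.getState() != PipelineResult.State.UNKNOWN) {
      result.waitUntilFinish();
    } else {
      System.out.println("Job submitted in detached mode; no result to wait for.");
    }
  }
}
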
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironment.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironment.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironment.java
deleted file mode 100644
index ba00036..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironment.java
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import java.util.List;
-import org.apache.beam.sdk.Pipeline;
-import org.apache.flink.api.common.JobExecutionResult;
-import org.apache.flink.api.java.CollectionEnvironment;
-import org.apache.flink.api.java.ExecutionEnvironment;
-import org.apache.flink.runtime.state.AbstractStateBackend;
-import org.apache.flink.streaming.api.TimeCharacteristic;
-import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * The class that instantiates and manages the execution of a given job.
- * Depending on if the job is a Streaming or Batch processing one, it creates
- * the adequate execution environment ({@link ExecutionEnvironment}
- * or {@link StreamExecutionEnvironment}), the necessary {@link FlinkPipelineTranslator}
- * ({@link FlinkBatchPipelineTranslator} or {@link FlinkStreamingPipelineTranslator}) to
- * transform the Beam job into a Flink one, and executes the (translated) job.
- */
-class FlinkPipelineExecutionEnvironment {
-
- private static final Logger LOG =
- LoggerFactory.getLogger(FlinkPipelineExecutionEnvironment.class);
-
- private final FlinkPipelineOptions options;
-
- /**
- * The Flink Batch execution environment. This is instantiated to either a
- * {@link org.apache.flink.api.java.CollectionEnvironment},
- * a {@link org.apache.flink.api.java.LocalEnvironment} or
- * a {@link org.apache.flink.api.java.RemoteEnvironment}, depending on the configuration
- * options.
- */
- private ExecutionEnvironment flinkBatchEnv;
-
- /**
- * The Flink Streaming execution environment. This is instantiated to either a
- * {@link org.apache.flink.streaming.api.environment.LocalStreamEnvironment} or
- * a {@link org.apache.flink.streaming.api.environment.RemoteStreamEnvironment}, depending
- * on the configuration options, and more specifically, the url of the master.
- */
- private StreamExecutionEnvironment flinkStreamEnv;
-
- /**
- * Creates a {@link FlinkPipelineExecutionEnvironment} with the user-specified parameters in the
- * provided {@link FlinkPipelineOptions}.
- *
- * @param options the user-defined pipeline options.
- * */
- FlinkPipelineExecutionEnvironment(FlinkPipelineOptions options) {
- this.options = checkNotNull(options);
- }
-
- /**
- * Depending on whether the job is a Streaming or a Batch one, this method creates
- * the necessary execution environment and pipeline translator, and translates
- * the {@link org.apache.beam.sdk.values.PCollection} program into
- * a {@link org.apache.flink.api.java.DataSet}
- * or {@link org.apache.flink.streaming.api.datastream.DataStream} one.
- * */
- public void translate(FlinkRunner flinkRunner, Pipeline pipeline) {
- this.flinkBatchEnv = null;
- this.flinkStreamEnv = null;
-
- PipelineTranslationOptimizer optimizer =
- new PipelineTranslationOptimizer(TranslationMode.BATCH, options);
-
- optimizer.translate(pipeline);
- TranslationMode translationMode = optimizer.getTranslationMode();
-
- FlinkPipelineTranslator translator;
- if (translationMode == TranslationMode.STREAMING) {
- this.flinkStreamEnv = createStreamExecutionEnvironment();
- translator = new FlinkStreamingPipelineTranslator(flinkRunner, flinkStreamEnv, options);
- } else {
- this.flinkBatchEnv = createBatchExecutionEnvironment();
- translator = new FlinkBatchPipelineTranslator(flinkBatchEnv, options);
- }
-
- translator.translate(pipeline);
- }
-
- /**
- * Launches the program execution.
- * */
- public JobExecutionResult executePipeline() throws Exception {
- final String jobName = options.getJobName();
-
- if (flinkBatchEnv != null) {
- return flinkBatchEnv.execute(jobName);
- } else if (flinkStreamEnv != null) {
- return flinkStreamEnv.execute(jobName);
- } else {
- throw new IllegalStateException("The Pipeline has not yet been translated.");
- }
- }
-
- /**
- * If the submitted job is a batch processing job, this method creates the adequate
- * Flink {@link org.apache.flink.api.java.ExecutionEnvironment} depending
- * on the user-specified options.
- */
- private ExecutionEnvironment createBatchExecutionEnvironment() {
-
- LOG.info("Creating the required Batch Execution Environment.");
-
- String masterUrl = options.getFlinkMaster();
- ExecutionEnvironment flinkBatchEnv;
-
- // depending on the master, create the right environment.
- if (masterUrl.equals("[local]")) {
- flinkBatchEnv = ExecutionEnvironment.createLocalEnvironment();
- } else if (masterUrl.equals("[collection]")) {
- flinkBatchEnv = new CollectionEnvironment();
- } else if (masterUrl.equals("[auto]")) {
- flinkBatchEnv = ExecutionEnvironment.getExecutionEnvironment();
- } else if (masterUrl.matches(".*:\\d*")) {
- String[] parts = masterUrl.split(":");
- List<String> stagingFiles = options.getFilesToStage();
- flinkBatchEnv = ExecutionEnvironment.createRemoteEnvironment(parts[0],
- Integer.parseInt(parts[1]),
- stagingFiles.toArray(new String[stagingFiles.size()]));
- } else {
- LOG.warn("Unrecognized Flink Master URL {}. Defaulting to [auto].", masterUrl);
- flinkBatchEnv = ExecutionEnvironment.getExecutionEnvironment();
- }
-
- // set the correct parallelism.
- if (options.getParallelism() != -1 && !(flinkBatchEnv instanceof CollectionEnvironment)) {
- flinkBatchEnv.setParallelism(options.getParallelism());
- }
-
- // set parallelism in the options (required by some execution code)
- options.setParallelism(flinkBatchEnv.getParallelism());
-
- if (options.getObjectReuse()) {
- flinkBatchEnv.getConfig().enableObjectReuse();
- } else {
- flinkBatchEnv.getConfig().disableObjectReuse();
- }
-
- return flinkBatchEnv;
- }
-
- /**
- * If the submitted job is a stream processing job, this method creates the adequate
- * Flink {@link org.apache.flink.streaming.api.environment.StreamExecutionEnvironment} depending
- * on the user-specified options.
- */
- private StreamExecutionEnvironment createStreamExecutionEnvironment() {
-
- LOG.info("Creating the required Streaming Environment.");
-
- String masterUrl = options.getFlinkMaster();
- StreamExecutionEnvironment flinkStreamEnv = null;
-
- // depending on the master, create the right environment.
- if (masterUrl.equals("[local]")) {
- flinkStreamEnv = StreamExecutionEnvironment.createLocalEnvironment();
- } else if (masterUrl.equals("[auto]")) {
- flinkStreamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
- } else if (masterUrl.matches(".*:\\d*")) {
- String[] parts = masterUrl.split(":");
- List<String> stagingFiles = options.getFilesToStage();
- flinkStreamEnv = StreamExecutionEnvironment.createRemoteEnvironment(parts[0],
- Integer.parseInt(parts[1]), stagingFiles.toArray(new String[stagingFiles.size()]));
- } else {
- LOG.warn("Unrecognized Flink Master URL {}. Defaulting to [auto].", masterUrl);
- flinkStreamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
- }
-
- // set the correct parallelism.
- if (options.getParallelism() != -1) {
- flinkStreamEnv.setParallelism(options.getParallelism());
- }
-
- // set parallelism in the options (required by some execution code)
- options.setParallelism(flinkStreamEnv.getParallelism());
-
- if (options.getObjectReuse()) {
- flinkStreamEnv.getConfig().enableObjectReuse();
- } else {
- flinkStreamEnv.getConfig().disableObjectReuse();
- }
-
- // default to event time
- flinkStreamEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
-
- // for the following 2 parameters, a value of -1 means that Flink will use
- // the default values as specified in the configuration.
- int numRetries = options.getNumberOfExecutionRetries();
- if (numRetries != -1) {
- flinkStreamEnv.setNumberOfExecutionRetries(numRetries);
- }
- long retryDelay = options.getExecutionRetryDelay();
- if (retryDelay != -1) {
- flinkStreamEnv.getConfig().setExecutionRetryDelay(retryDelay);
- }
-
- // A value of -1 corresponds to disabled checkpointing (see CheckpointConfig in Flink).
- // If the value is not -1, then the validity checks are applied.
- // By default, checkpointing is disabled.
- long checkpointInterval = options.getCheckpointingInterval();
- if (checkpointInterval != -1) {
- if (checkpointInterval < 1) {
- throw new IllegalArgumentException("The checkpoint interval must be positive");
- }
- flinkStreamEnv.enableCheckpointing(checkpointInterval);
- }
-
- // State backend
- final AbstractStateBackend stateBackend = options.getStateBackend();
- if (stateBackend != null) {
- flinkStreamEnv.setStateBackend(stateBackend);
- }
-
- return flinkStreamEnv;
- }
-
-}
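For reference, the "host:port" branch removed above reduces to the following standalone sketch; the master URL and staging jar below are placeholder values, and the Flink calls are the same ones used in createBatchExecutionEnvironment:

import java.util.Arrays;
import java.util.List;
import org.apache.flink.api.java.ExecutionEnvironment;

// Minimal sketch of the remote-master handling shown above (placeholder values, not from the commit).
public class RemoteBatchEnvSketch {
  public static void main(String[] args) {
    String masterUrl = "jobmanager.example.com:6123";            // assumed cluster address
    List<String> stagingFiles = Arrays.asList("/path/to/pipeline-bundled.jar");
    ExecutionEnvironment env;
    if (masterUrl.matches(".*:\\d*")) {
      String[] parts = masterUrl.split(":");
      env = ExecutionEnvironment.createRemoteEnvironment(
          parts[0], Integer.parseInt(parts[1]),
          stagingFiles.toArray(new String[stagingFiles.size()]));
    } else {
      env = ExecutionEnvironment.getExecutionEnvironment();      // the "[auto]" fallback above
    }
    env.setParallelism(4);  // mirrors the options.getParallelism() handling above
  }
}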
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java
deleted file mode 100644
index ef9afea..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-
-import com.fasterxml.jackson.annotation.JsonIgnore;
-import java.util.List;
-import org.apache.beam.sdk.options.ApplicationNameOptions;
-import org.apache.beam.sdk.options.Default;
-import org.apache.beam.sdk.options.Description;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.options.StreamingOptions;
-import org.apache.flink.runtime.state.AbstractStateBackend;
-
-/**
- * Options which can be used to configure a Flink PipelineRunner.
- */
-public interface FlinkPipelineOptions
- extends PipelineOptions, ApplicationNameOptions, StreamingOptions {
-
- /**
- * List of local files to make available to workers.
- *
- * <p>Jars are placed on the worker's classpath.
- *
- * <p>The default value is the list of jars from the main program's classpath.
- */
- @Description("Jar-Files to send to all workers and put on the classpath. "
- + "The default value is all files from the classpath.")
- @JsonIgnore
- List<String> getFilesToStage();
- void setFilesToStage(List<String> value);
-
- /**
- * The url of the Flink JobManager on which to execute pipelines. This can either be
- * the address of a cluster JobManager, in the form "host:port" or one of the special
- * Strings "[local]", "[collection]" or "[auto]". "[local]" will start a local Flink
- * Cluster in the JVM, "[collection]" will execute the pipeline on Java Collections while
- * "[auto]" will let the system decide where to execute the pipeline based on the environment.
- */
- @Description("Address of the Flink Master where the Pipeline should be executed. Can"
- + " either be of the form \"host:port\" or one of the special values [local], "
- + "[collection] or [auto].")
- String getFlinkMaster();
- void setFlinkMaster(String value);
-
- @Description("The degree of parallelism to be used when distributing operations onto workers.")
- @Default.InstanceFactory(DefaultParallelismFactory.class)
- Integer getParallelism();
- void setParallelism(Integer value);
-
- @Description("The interval between consecutive checkpoints (i.e. snapshots of the current"
- + "pipeline state used for fault tolerance).")
- @Default.Long(-1L)
- Long getCheckpointingInterval();
- void setCheckpointingInterval(Long interval);
-
- @Description("Sets the number of times that failed tasks are re-executed. "
- + "A value of zero effectively disables fault tolerance. A value of -1 indicates "
- + "that the system default value (as defined in the configuration) should be used.")
- @Default.Integer(-1)
- Integer getNumberOfExecutionRetries();
- void setNumberOfExecutionRetries(Integer retries);
-
- @Description("Sets the delay between executions. A value of {@code -1} "
- + "indicates that the default value should be used.")
- @Default.Long(-1L)
- Long getExecutionRetryDelay();
- void setExecutionRetryDelay(Long delay);
-
- @Description("Sets the behavior of reusing objects.")
- @Default.Boolean(false)
- Boolean getObjectReuse();
- void setObjectReuse(Boolean reuse);
-
- /**
- * State backend to store Beam's state during computation.
- * Note: Only applicable when executing in streaming mode.
- */
- @Description("Sets the state backend to use in streaming mode. "
- + "Otherwise the default is read from the Flink config.")
- @JsonIgnore
- AbstractStateBackend getStateBackend();
- void setStateBackend(AbstractStateBackend stateBackend);
-
-}
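A minimal configuration sketch for the interface above, assuming the usual PipelineOptionsFactory entry point; the values shown are placeholders, not recommendations:

import org.apache.beam.runners.flink.FlinkPipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

// Hypothetical configuration sketch; option names follow the interface defined above.
public class FlinkOptionsSketch {
  public static void main(String[] args) {
    FlinkPipelineOptions options =
        PipelineOptionsFactory.fromArgs(args).as(FlinkPipelineOptions.class);
    options.setFlinkMaster("[auto]");          // or "host:port", "[local]", "[collection]"
    options.setParallelism(4);                 // -1 keeps the environment default
    options.setCheckpointingInterval(60000L);  // -1L (the default) disables checkpointing
    options.setNumberOfExecutionRetries(3);    // -1 uses the Flink configuration default
    options.setObjectReuse(false);
  }
}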
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkPipelineTranslator.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkPipelineTranslator.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkPipelineTranslator.java
deleted file mode 100644
index 65f416d..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkPipelineTranslator.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import org.apache.beam.sdk.Pipeline;
-
-/**
- * The role of this class is to translate the Beam operators to
- * their Flink counterparts. If we have a streaming job, this is instantiated as a
- * {@link FlinkStreamingPipelineTranslator}. In other case, i.e. for a batch job,
- * a {@link FlinkBatchPipelineTranslator} is created. Correspondingly, the
- * {@link org.apache.beam.sdk.values.PCollection}-based user-provided job is translated into
- * a {@link org.apache.flink.streaming.api.datastream.DataStream} (for streaming) or a
- * {@link org.apache.flink.api.java.DataSet} (for batch) one.
- */
-abstract class FlinkPipelineTranslator extends Pipeline.PipelineVisitor.Defaults {
-
- /**
- * Translates the pipeline by passing this class as a visitor.
- * @param pipeline The pipeline to be translated
- */
- public void translate(Pipeline pipeline) {
- pipeline.traverseTopologically(this);
- }
-
- /**
- * Utility formatting method.
- * @param n number of spaces to generate
- * @return String with "|" followed by n spaces
- */
- protected static String genSpaces(int n) {
- StringBuilder builder = new StringBuilder();
- for (int i = 0; i < n; i++) {
- builder.append("| ");
- }
- return builder.toString();
- }
-}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkRunner.java
----------------------------------------------------------------------
diff --git a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkRunner.java b/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkRunner.java
deleted file mode 100644
index 096f030..0000000
--- a/runners/flink/runner/src/main/java/org/apache/beam/runners/flink/FlinkRunner.java
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.runners.flink;
-
-import com.google.common.base.Joiner;
-import java.io.File;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.net.URLClassLoader;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.SortedSet;
-import java.util.TreeSet;
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.PipelineResult;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.options.PipelineOptionsValidator;
-import org.apache.beam.sdk.runners.PipelineRunner;
-import org.apache.beam.sdk.runners.TransformHierarchy;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.transforms.View;
-import org.apache.beam.sdk.values.PValue;
-import org.apache.flink.api.common.JobExecutionResult;
-import org.apache.flink.client.program.DetachedEnvironment;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * A {@link PipelineRunner} that executes the operations in the
- * pipeline by first translating them to a Flink Plan and then executing them either locally
- * or on a Flink cluster, depending on the configuration.
- */
-public class FlinkRunner extends PipelineRunner<PipelineResult> {
-
- private static final Logger LOG = LoggerFactory.getLogger(FlinkRunner.class);
-
- /**
- * Provided options.
- */
- private final FlinkPipelineOptions options;
-
- /**
- * Construct a runner from the provided options.
- *
- * @param options Properties which configure the runner.
- * @return The newly created runner.
- */
- public static FlinkRunner fromOptions(PipelineOptions options) {
- FlinkPipelineOptions flinkOptions =
- PipelineOptionsValidator.validate(FlinkPipelineOptions.class, options);
- ArrayList<String> missing = new ArrayList<>();
-
- if (flinkOptions.getAppName() == null) {
- missing.add("appName");
- }
- if (missing.size() > 0) {
- throw new IllegalArgumentException(
- "Missing required values: " + Joiner.on(',').join(missing));
- }
-
- if (flinkOptions.getFilesToStage() == null) {
- flinkOptions.setFilesToStage(detectClassPathResourcesToStage(
- FlinkRunner.class.getClassLoader()));
- LOG.info("PipelineOptions.filesToStage was not specified. "
- + "Defaulting to files from the classpath: will stage {} files. "
- + "Enable logging at DEBUG level to see which files will be staged.",
- flinkOptions.getFilesToStage().size());
- LOG.debug("Classpath elements: {}", flinkOptions.getFilesToStage());
- }
-
- // Set Flink Master to [auto] if no option was specified.
- if (flinkOptions.getFlinkMaster() == null) {
- flinkOptions.setFlinkMaster("[auto]");
- }
-
- return new FlinkRunner(flinkOptions);
- }
-
- private FlinkRunner(FlinkPipelineOptions options) {
- this.options = options;
- this.ptransformViewsWithNonDeterministicKeyCoders = new HashSet<>();
- }
-
- @Override
- public PipelineResult run(Pipeline pipeline) {
- logWarningIfPCollectionViewHasNonDeterministicKeyCoder(pipeline);
-
- LOG.info("Executing pipeline using FlinkRunner.");
-
- FlinkPipelineExecutionEnvironment env = new FlinkPipelineExecutionEnvironment(options);
-
- LOG.info("Translating pipeline to Flink program.");
- env.translate(this, pipeline);
-
- JobExecutionResult result;
- try {
- LOG.info("Starting execution of Flink program.");
- result = env.executePipeline();
- } catch (Exception e) {
- LOG.error("Pipeline execution failed", e);
- throw new RuntimeException("Pipeline execution failed", e);
- }
-
- if (result instanceof DetachedEnvironment.DetachedJobExecutionResult) {
- LOG.info("Pipeline submitted in Detached mode");
- return new FlinkDetachedRunnerResult();
- } else {
- LOG.info("Execution finished in {} msecs", result.getNetRuntime());
- Map<String, Object> accumulators = result.getAllAccumulatorResults();
- if (accumulators != null && !accumulators.isEmpty()) {
- LOG.info("Final aggregator values:");
-
- for (Map.Entry<String, Object> entry : result.getAllAccumulatorResults().entrySet()) {
- LOG.info("{} : {}", entry.getKey(), entry.getValue());
- }
- }
-
- return new FlinkRunnerResult(accumulators, result.getNetRuntime());
- }
- }
-
- /**
- * For testing.
- */
- public FlinkPipelineOptions getPipelineOptions() {
- return options;
- }
-
- @Override
- public String toString() {
- return "FlinkRunner#" + hashCode();
- }
-
- /**
- * Attempts to detect all the resources the class loader has access to. This does not recurse
- * to class loader parents stopping it from pulling in resources from the system class loader.
- *
- * @param classLoader The URLClassLoader to use to detect resources to stage.
- * @return A list of absolute paths to the resources the class loader uses.
- * @throws IllegalArgumentException If either the class loader is not a URLClassLoader or one
- * of the resources the class loader exposes is not a file resource.
- */
- protected static List<String> detectClassPathResourcesToStage(
- ClassLoader classLoader) {
- if (!(classLoader instanceof URLClassLoader)) {
- String message = String.format("Unable to use ClassLoader to detect classpath elements. "
- + "Current ClassLoader is %s, only URLClassLoaders are supported.", classLoader);
- LOG.error(message);
- throw new IllegalArgumentException(message);
- }
-
- List<String> files = new ArrayList<>();
- for (URL url : ((URLClassLoader) classLoader).getURLs()) {
- try {
- files.add(new File(url.toURI()).getAbsolutePath());
- } catch (IllegalArgumentException | URISyntaxException e) {
- String message = String.format("Unable to convert url (%s) to file.", url);
- LOG.error(message);
- throw new IllegalArgumentException(message, e);
- }
- }
- return files;
- }
-
- /** A set of {@link View}s with non-deterministic key coders. */
- Set<PTransform<?, ?>> ptransformViewsWithNonDeterministicKeyCoders;
-
- /**
- * Records that the {@link PTransform} requires a deterministic key coder.
- */
- void recordViewUsesNonDeterministicKeyCoder(PTransform<?, ?> ptransform) {
- ptransformViewsWithNonDeterministicKeyCoders.add(ptransform);
- }
-
- /** Outputs a warning about PCollection views without deterministic key coders. */
- private void logWarningIfPCollectionViewHasNonDeterministicKeyCoder(Pipeline pipeline) {
- // We need to wait till this point to determine the names of the transforms since only
- // at this time do we know the hierarchy of the transforms otherwise we could
- // have just recorded the full names during apply time.
- if (!ptransformViewsWithNonDeterministicKeyCoders.isEmpty()) {
- final SortedSet<String> ptransformViewNamesWithNonDeterministicKeyCoders = new TreeSet<>();
- pipeline.traverseTopologically(new Pipeline.PipelineVisitor() {
- @Override
- public void visitValue(PValue value, TransformHierarchy.Node producer) {
- }
-
- @Override
- public void visitPrimitiveTransform(TransformHierarchy.Node node) {
- if (ptransformViewsWithNonDeterministicKeyCoders.contains(node.getTransform())) {
- ptransformViewNamesWithNonDeterministicKeyCoders.add(node.getFullName());
- }
- }
-
- @Override
- public CompositeBehavior enterCompositeTransform(TransformHierarchy.Node node) {
- if (ptransformViewsWithNonDeterministicKeyCoders.contains(node.getTransform())) {
- ptransformViewNamesWithNonDeterministicKeyCoders.add(node.getFullName());
- }
- return CompositeBehavior.ENTER_TRANSFORM;
- }
-
- @Override
- public void leaveCompositeTransform(TransformHierarchy.Node node) {
- }
- });
-
- LOG.warn("Unable to use indexed implementation for View.AsMap and View.AsMultimap for {} "
- + "because the key coder is not deterministic. Falling back to singleton implementation "
- + "which may cause memory and/or performance problems. Future major versions of "
- + "the Flink runner will require deterministic key coders.",
- ptransformViewNamesWithNonDeterministicKeyCoders);
- }
- }
-}
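As a hedged usage sketch (not part of the commit), the runner above is selected like any other Beam runner and resolved through fromOptions(); transform wiring is elided:

import org.apache.beam.runners.flink.FlinkPipelineOptions;
import org.apache.beam.runners.flink.FlinkRunner;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

// Sketch only: pipeline construction is omitted.
public class RunOnFlinkSketch {
  public static void main(String[] args) {
    FlinkPipelineOptions options =
        PipelineOptionsFactory.fromArgs(args).as(FlinkPipelineOptions.class);
    options.setRunner(FlinkRunner.class);  // resolved via FlinkRunner.fromOptions(options)
    Pipeline p = Pipeline.create(options);
    // p.apply(...);                       // transforms would be applied here
    PipelineResult result = p.run();       // FlinkRunnerResult, or a detached result
  }
}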
[48/50] [abbrv] beam git commit: [BEAM-2015] Remove shared profile in
runners/pom.xml and fix Dataflow ValidatesRunner PostCommit
Posted by dh...@apache.org.
[BEAM-2015] Remove shared profile in runners/pom.xml and fix Dataflow ValidatesRunner PostCommit
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/546aa61f
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/546aa61f
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/546aa61f
Branch: refs/heads/DSL_SQL
Commit: 546aa61f217dc59f95727970a8dbc7c4b2f76e54
Parents: 391fb77
Author: Luke Cwik <lc...@google.com>
Authored: Wed Apr 19 09:20:38 2017 -0700
Committer: Dan Halperin <dh...@google.com>
Committed: Wed Apr 19 12:07:33 2017 -0700
----------------------------------------------------------------------
runners/apex/pom.xml | 1 +
runners/direct-java/pom.xml | 1 +
runners/flink/pom.xml | 2 ++
runners/google-cloud-dataflow-java/pom.xml | 43 +++++++++++++++++++++++++
runners/pom.xml | 40 -----------------------
runners/spark/pom.xml | 1 +
6 files changed, 48 insertions(+), 40 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/546aa61f/runners/apex/pom.xml
----------------------------------------------------------------------
diff --git a/runners/apex/pom.xml b/runners/apex/pom.xml
index 40fc93c..f441e3d 100644
--- a/runners/apex/pom.xml
+++ b/runners/apex/pom.xml
@@ -229,6 +229,7 @@
</beamTestPipelineOptions>
</systemPropertyVariables>
<skipTests>${skipIntegrationTests}</skipTests>
+ <threadCount>4</threadCount>
</configuration>
</execution>
</executions>
http://git-wip-us.apache.org/repos/asf/beam/blob/546aa61f/runners/direct-java/pom.xml
----------------------------------------------------------------------
diff --git a/runners/direct-java/pom.xml b/runners/direct-java/pom.xml
index 03ed791..fc28fd6 100644
--- a/runners/direct-java/pom.xml
+++ b/runners/direct-java/pom.xml
@@ -81,6 +81,7 @@
]
</beamTestPipelineOptions>
</systemPropertyVariables>
+ <threadCount>4</threadCount>
</configuration>
</execution>
</executions>
http://git-wip-us.apache.org/repos/asf/beam/blob/546aa61f/runners/flink/pom.xml
----------------------------------------------------------------------
diff --git a/runners/flink/pom.xml b/runners/flink/pom.xml
index 351035e..808219b 100644
--- a/runners/flink/pom.xml
+++ b/runners/flink/pom.xml
@@ -75,6 +75,7 @@
]
</beamTestPipelineOptions>
</systemPropertyVariables>
+ <threadCount>4</threadCount>
</configuration>
</execution>
@@ -108,6 +109,7 @@
]
</beamTestPipelineOptions>
</systemPropertyVariables>
+ <threadCount>4</threadCount>
</configuration>
</execution>
</executions>
http://git-wip-us.apache.org/repos/asf/beam/blob/546aa61f/runners/google-cloud-dataflow-java/pom.xml
----------------------------------------------------------------------
diff --git a/runners/google-cloud-dataflow-java/pom.xml b/runners/google-cloud-dataflow-java/pom.xml
index e8aadb8..4cde923 100644
--- a/runners/google-cloud-dataflow-java/pom.xml
+++ b/runners/google-cloud-dataflow-java/pom.xml
@@ -38,6 +38,49 @@
<dataflow.legacy_environment_major_version>6</dataflow.legacy_environment_major_version>
</properties>
+ <profiles>
+ <!-- A profile that adds an integration test phase if and only if
+ the validatesRunnerPipelineOptions maven property has been set.
+ It should be set to a valid PipelineOptions JSON string. -->
+ <profile>
+ <id>validates-runner-tests</id>
+ <activation>
+ <property><name>validatesRunnerPipelineOptions</name></property>
+ </activation>
+ <build>
+ <pluginManagement>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>validates-runner-tests</id>
+ <phase>integration-test</phase>
+ <goals>
+ <goal>test</goal>
+ </goals>
+ <configuration>
+ <skip>false</skip>
+ <groups>org.apache.beam.sdk.testing.ValidatesRunner</groups>
+ <parallel>all</parallel>
+ <threadCount>4</threadCount>
+ <dependenciesToScan>
+ <dependency>org.apache.beam:beam-sdks-java-core</dependency>
+ </dependenciesToScan>
+ <systemPropertyVariables>
+ <beamTestPipelineOptions>${validatesRunnerPipelineOptions}</beamTestPipelineOptions>
+ </systemPropertyVariables>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </pluginManagement>
+ </build>
+ </profile>
+ </profiles>
+
<build>
<resources>
<resource>
http://git-wip-us.apache.org/repos/asf/beam/blob/546aa61f/runners/pom.xml
----------------------------------------------------------------------
diff --git a/runners/pom.xml b/runners/pom.xml
index 150e987..8f3cabd 100644
--- a/runners/pom.xml
+++ b/runners/pom.xml
@@ -54,46 +54,6 @@
</plugins>
</build>
</profile>
-
- <!-- A profile that adds an integration test phase if and only if
- the validatesRunnerPipelineOptions maven property has been set.
- It should be set to a valid PipelineOptions JSON string. -->
- <profile>
- <id>validates-runner-tests</id>
- <activation>
- <property><name>validatesRunnerPipelineOptions</name></property>
- </activation>
- <build>
- <pluginManagement>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-surefire-plugin</artifactId>
- <executions>
- <execution>
- <id>validates-runner-tests</id>
- <phase>integration-test</phase>
- <goals>
- <goal>test</goal>
- </goals>
- <configuration>
- <groups>org.apache.beam.sdk.testing.ValidatesRunner</groups>
- <parallel>all</parallel>
- <threadCount>4</threadCount>
- <dependenciesToScan>
- <dependency>org.apache.beam:beam-sdks-java-core</dependency>
- </dependenciesToScan>
- <systemPropertyVariables>
- <beamTestPipelineOptions>${validatesRunnerPipelineOptions}</beamTestPipelineOptions>
- </systemPropertyVariables>
- </configuration>
- </execution>
- </executions>
- </plugin>
- </plugins>
- </pluginManagement>
- </build>
- </profile>
</profiles>
<build>
http://git-wip-us.apache.org/repos/asf/beam/blob/546aa61f/runners/spark/pom.xml
----------------------------------------------------------------------
diff --git a/runners/spark/pom.xml b/runners/spark/pom.xml
index 7493485..55788e6 100644
--- a/runners/spark/pom.xml
+++ b/runners/spark/pom.xml
@@ -102,6 +102,7 @@
<spark.ui.enabled>false</spark.ui.enabled>
<spark.ui.showConsoleProgress>false</spark.ui.showConsoleProgress>
</systemPropertyVariables>
+ <threadCount>4</threadCount>
</configuration>
</execution>
</executions>
[20/50] [abbrv] beam git commit: [BEAM-1914] XmlIO now complies with PTransform style guide
Posted by dh...@apache.org.
[BEAM-1914] XmlIO now complies with PTransform style guide
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/d0c0a60c
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/d0c0a60c
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/d0c0a60c
Branch: refs/heads/DSL_SQL
Commit: d0c0a60c83a9d2a6caa29f91f89d8c0ee3b0eb93
Parents: 57929fb
Author: Eugene Kirpichov <ki...@google.com>
Authored: Mon Apr 17 16:25:42 2017 -0700
Committer: Jean-Baptiste Onofré <jb...@apache.org>
Committed: Wed Apr 19 10:34:46 2017 +0200
----------------------------------------------------------------------
.../apache/beam/sdk/io/CompressedSource.java | 4 +-
.../main/java/org/apache/beam/sdk/io/XmlIO.java | 477 +++++++++++++++++++
.../java/org/apache/beam/sdk/io/XmlSink.java | 226 ++-------
.../java/org/apache/beam/sdk/io/XmlSource.java | 191 +-------
.../sdk/transforms/display/DisplayData.java | 6 +
.../org/apache/beam/sdk/io/XmlSinkTest.java | 89 ++--
.../org/apache/beam/sdk/io/XmlSourceTest.java | 248 ++++++----
.../sdk/transforms/display/DisplayDataTest.java | 17 +
8 files changed, 740 insertions(+), 518 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/d0c0a60c/sdks/java/core/src/main/java/org/apache/beam/sdk/io/CompressedSource.java
----------------------------------------------------------------------
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/CompressedSource.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/CompressedSource.java
index ecd0fd9..1d940cb 100644
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/CompressedSource.java
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/CompressedSource.java
@@ -46,10 +46,10 @@ import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
* A Source that reads from compressed files. A {@code CompressedSources} wraps a delegate
* {@link FileBasedSource} that is able to read the decompressed file format.
*
- * <p>For example, use the following to read from a gzip-compressed XML file:
+ * <p>For example, use the following to read from a gzip-compressed file-based source:
*
* <pre> {@code
- * XmlSource mySource = XmlSource.from(...);
+ * FileBasedSource<T> mySource = ...;
* PCollection<T> collection = p.apply(Read.from(CompressedSource
* .from(mySource)
* .withDecompression(CompressedSource.CompressionMode.GZIP)));
http://git-wip-us.apache.org/repos/asf/beam/blob/d0c0a60c/sdks/java/core/src/main/java/org/apache/beam/sdk/io/XmlIO.java
----------------------------------------------------------------------
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/XmlIO.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/XmlIO.java
new file mode 100644
index 0000000..a53fb86
--- /dev/null
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/XmlIO.java
@@ -0,0 +1,477 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.io;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import com.google.auto.value.AutoValue;
+import com.google.common.annotations.VisibleForTesting;
+import javax.annotation.Nullable;
+import javax.xml.bind.JAXBContext;
+import javax.xml.bind.JAXBException;
+import org.apache.beam.sdk.runners.PipelineRunner;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.display.DisplayData;
+import org.apache.beam.sdk.values.PBegin;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PDone;
+
+/** Transforms for reading and writing XML files using JAXB mappers. */
+public class XmlIO {
+ // CHECKSTYLE.OFF: JavadocStyle
+ /**
+ * Reads XML files. This source reads one or more XML files and
+ * creates a {@link PCollection} of a given type. Please note the example given below.
+ *
+ * <p>The XML file must be of the following form, where {@code root} and {@code record} are XML
+ * element names that are defined by the user:
+ *
+ * <pre>{@code
+ * <root>
+ * <record> ... </record>
+ * <record> ... </record>
+ * <record> ... </record>
+ * ...
+ * <record> ... </record>
+ * </root>
+ * }</pre>
+ *
+ * <p>Basically, the XML document should contain a single root element with an inner list
+ * consisting entirely of record elements. The records may contain arbitrary XML content; however,
+ * that content <b>must not</b> contain the start {@code <record>} or end {@code </record>} tags.
+ * This restriction enables reading from large XML files in parallel from different offsets in the
+ * file.
+ *
+ * <p>Root and/or record elements may additionally contain an arbitrary number of XML attributes.
+ * Additionally, users must provide a class of a JAXB-annotated Java type that can be used to convert
+ * records into Java objects and vice versa using JAXB marshalling/unmarshalling mechanisms.
+ * Reading the source will generate a {@code PCollection} of the given JAXB annotated Java type.
+ * Optionally users may provide a minimum size of a bundle that should be created for the source.
+ *
+ * <p>The following example shows how to use this method in a Beam pipeline:
+ *
+ * <pre>{@code
+ * PCollection<Record> output = p.apply(XmlIO.<Record>read()
+ * .from(file.toPath().toString())
+ * .withRootElement("root")
+ * .withRecordElement("record")
+ * .withRecordClass(Record.class));
+ * }</pre>
+ *
+ * <p>Currently, only XML files that use single-byte characters are supported. Using a file that
+ * contains multi-byte characters may result in data loss or duplication.
+ *
+ * <p>To use this method:
+ *
+ * <ol>
+ * <li>Explicitly declare a dependency on org.codehaus.woodstox:stax2-api
+ * <li>Include a compatible implementation on the classpath at run-time, such as
+ * org.codehaus.woodstox:woodstox-core-asl
+ * </ol>
+ *
+ * <p>These dependencies have been declared as optional in the sdks/java/core/pom.xml file of
+ * Apache Beam.
+ *
+ * <h3>Permissions</h3>
+ * Permission requirements depend on the {@link org.apache.beam.sdk.runners.PipelineRunner
+ * PipelineRunner} that is used to execute the Beam pipeline. Please refer to the documentation of
+ * corresponding {@link PipelineRunner PipelineRunners} for more details.
+ *
+ * @param <T> Type of the objects that represent the records of the XML file. The {@code
+ * PCollection} generated by this source will be of this type.
+ */
+ // CHECKSTYLE.ON: JavadocStyle
+ public static <T> Read<T> read() {
+ return new AutoValue_XmlIO_Read.Builder<T>()
+ .setMinBundleSize(Read.DEFAULT_MIN_BUNDLE_SIZE)
+ .setCompressionType(Read.CompressionType.AUTO)
+ .build();
+ }
+
+ // CHECKSTYLE.OFF: JavadocStyle
+ /**
+ * A {@link Sink} that outputs records as XML-formatted elements. Writes a {@link PCollection} of
+ * records from JAXB-annotated classes to a single file location.
+ *
+ * <p>Given a PCollection containing records of type T that can be marshalled to XML elements,
+ * this Sink will produce a single file consisting of a single root element that contains all of
+ * the elements in the PCollection.
+ *
+ * <p>XML Sinks are created with a base filename to write to, a root element name that will be
+ * used for the root element of the output files, and a class to bind to an XML element. This
+ * class will be used in the marshalling of records in an input PCollection to their XML
+ * representation and must be able to be bound using JAXB annotations (checked at pipeline
+ * construction time).
+ *
+ * <p>XML Sinks can be written to using the {@link Write} transform:
+ *
+ * <pre>{@code
+ * p.apply(XmlIO.<Type>write()
+ * .withRecordClass(Type.class)
+ * .withRootElement(root_element)
+ * .toFilenamePrefix(output_filename));
+ * }</pre>
+ *
+ * <p>For example, consider the following class with JAXB annotations:
+ *
+ * <pre>
+ * {@literal @}XmlRootElement(name = "word_count_result")
+ * {@literal @}XmlType(propOrder = {"word", "frequency"})
+ * public class WordFrequency {
+ * private String word;
+ * private long frequency;
+ *
+ * public WordFrequency() { }
+ *
+ * public WordFrequency(String word, long frequency) {
+ * this.word = word;
+ * this.frequency = frequency;
+ * }
+ *
+ * public void setWord(String word) {
+ * this.word = word;
+ * }
+ *
+ * public void setFrequency(long frequency) {
+ * this.frequency = frequency;
+ * }
+ *
+ * public long getFrequency() {
+ * return frequency;
+ * }
+ *
+ * public String getWord() {
+ * return word;
+ * }
+ * }
+ * </pre>
+ *
+ * <p>The following will produce XML output with a root element named "words" from a PCollection
+ * of WordFrequency objects:
+ *
+ * <pre>{@code
+ * p.apply(XmlIO.<WordFrequency>write()
+ * .withRecordClass(WordFrequency.class)
+ * .withRootElement("words")
+ * .toFilenamePrefix(output_file));
+ * }</pre>
+ *
+ * <p>The output of which will look like:
+ *
+ * <pre>{@code
+ * <words>
+ *
+ * <word_count_result>
+ * <word>decreased</word>
+ * <frequency>1</frequency>
+ * </word_count_result>
+ *
+ * <word_count_result>
+ * <word>War</word>
+ * <frequency>4</frequency>
+ * </word_count_result>
+ *
+ * <word_count_result>
+ * <word>empress'</word>
+ * <frequency>14</frequency>
+ * </word_count_result>
+ *
+ * <word_count_result>
+ * <word>stoops</word>
+ * <frequency>6</frequency>
+ * </word_count_result>
+ *
+ * ...
+ * </words>
+ * }</pre>
+ */
+ // CHECKSTYLE.ON: JavadocStyle
+ public static <T> Write<T> write() {
+ return new AutoValue_XmlIO_Write.Builder<T>().build();
+ }
+
+ /** Implementation of {@link #read}. */
+ @AutoValue
+ public abstract static class Read<T> extends PTransform<PBegin, PCollection<T>> {
+ private static final int DEFAULT_MIN_BUNDLE_SIZE = 8 * 1024;
+
+ @Nullable
+ abstract String getFileOrPatternSpec();
+
+ @Nullable
+ abstract String getRootElement();
+
+ @Nullable
+ abstract String getRecordElement();
+
+ @Nullable
+ abstract Class<T> getRecordClass();
+
+ abstract CompressionType getCompressionType();
+
+ abstract long getMinBundleSize();
+
+ abstract Builder<T> toBuilder();
+
+ @AutoValue.Builder
+ abstract static class Builder<T> {
+ abstract Builder<T> setFileOrPatternSpec(String fileOrPatternSpec);
+
+ abstract Builder<T> setRootElement(String rootElement);
+
+ abstract Builder<T> setRecordElement(String recordElement);
+
+ abstract Builder<T> setRecordClass(Class<T> recordClass);
+
+ abstract Builder<T> setMinBundleSize(long minBundleSize);
+
+ abstract Builder<T> setCompressionType(CompressionType compressionType);
+
+ abstract Read<T> build();
+ }
+
+ /** Strategy for determining the compression type of XML files being read. */
+ public enum CompressionType {
+ /** Automatically determine the compression type based on filename extension. */
+ AUTO(""),
+ /** Uncompressed (i.e., may be split). */
+ UNCOMPRESSED(""),
+ /** GZipped. */
+ GZIP(".gz"),
+ /** BZipped. */
+ BZIP2(".bz2"),
+ /** Zipped. */
+ ZIP(".zip"),
+ /** Deflate compressed. */
+ DEFLATE(".deflate");
+
+ private String filenameSuffix;
+
+ CompressionType(String suffix) {
+ this.filenameSuffix = suffix;
+ }
+
+ /**
+ * Determine if a given filename matches a compression type based on its extension.
+ * @param filename the filename to match
+ * @return true iff the filename ends with the compression type's known extension.
+ */
+ public boolean matches(String filename) {
+ return filename.toLowerCase().endsWith(filenameSuffix.toLowerCase());
+ }
+ }
+
+ /**
+ * Reads a single XML file or a set of XML files defined by a Java "glob"
+ * file pattern. Each XML file should be of the form defined in {@link #read}.
+ */
+ public Read<T> from(String fileOrPatternSpec) {
+ return toBuilder().setFileOrPatternSpec(fileOrPatternSpec).build();
+ }
+
+ /**
+ * Sets name of the root element of the XML document. This will be used to create a valid
+ * starting root element when initiating a bundle of records created from an XML document. This
+ * is a required parameter.
+ */
+ public Read<T> withRootElement(String rootElement) {
+ return toBuilder().setRootElement(rootElement).build();
+ }
+
+ /**
+ * Sets name of the record element of the XML document. This will be used to determine offset of
+ * the first record of a bundle created from the XML document. This is a required parameter.
+ */
+ public Read<T> withRecordElement(String recordElement) {
+ return toBuilder().setRecordElement(recordElement).build();
+ }
+
+ /**
+ * Sets a JAXB annotated class that can be populated using a record of the provided XML file.
+ * This will be used when unmarshalling record objects from the XML file. This is a required
+ * parameter.
+ */
+ public Read<T> withRecordClass(Class<T> recordClass) {
+ return toBuilder().setRecordClass(recordClass).build();
+ }
+
+ /**
+ * Sets a parameter {@code minBundleSize} for the minimum bundle size of the source. Please
+ * refer to {@link OffsetBasedSource} for the definition of minBundleSize. This is an optional
+ * parameter.
+ */
+ public Read<T> withMinBundleSize(long minBundleSize) {
+ return toBuilder().setMinBundleSize(minBundleSize).build();
+ }
+
+ /**
+ * Decompresses all input files using the specified compression type.
+ *
+ * <p>If no compression type is specified, the default is {@link CompressionType#AUTO}.
+ * In this mode, the compression type of the file is determined by its extension.
+ * Supports .gz, .bz2, .zip and .deflate compression.
+ */
+ public Read<T> withCompressionType(CompressionType compressionType) {
+ return toBuilder().setCompressionType(compressionType).build();
+ }
+
+ @Override
+ public void validate(PBegin input) {
+ checkNotNull(
+ getRootElement(),
+ "rootElement is null. Use builder method withRootElement() to set this.");
+ checkNotNull(
+ getRecordElement(),
+ "recordElement is null. Use builder method withRecordElement() to set this.");
+ checkNotNull(
+ getRecordClass(),
+ "recordClass is null. Use builder method withRecordClass() to set this.");
+ }
+
+ @Override
+ public void populateDisplayData(DisplayData.Builder builder) {
+ builder
+ .addIfNotDefault(
+ DisplayData.item("minBundleSize", getMinBundleSize())
+ .withLabel("Minimum Bundle Size"),
+ 1L)
+ .add(DisplayData.item("filePattern", getFileOrPatternSpec()).withLabel("File Pattern"))
+ .addIfNotNull(
+ DisplayData.item("rootElement", getRootElement()).withLabel("XML Root Element"))
+ .addIfNotNull(
+ DisplayData.item("recordElement", getRecordElement()).withLabel("XML Record Element"))
+ .addIfNotNull(
+ DisplayData.item("recordClass", getRecordClass()).withLabel("XML Record Class"));
+ }
+
+ @VisibleForTesting
+ BoundedSource<T> createSource() {
+ XmlSource<T> source = new XmlSource<>(this);
+ switch (getCompressionType()) {
+ case UNCOMPRESSED:
+ return source;
+ case AUTO:
+ return CompressedSource.from(source);
+ case BZIP2:
+ return CompressedSource.from(source)
+ .withDecompression(CompressedSource.CompressionMode.BZIP2);
+ case GZIP:
+ return CompressedSource.from(source)
+ .withDecompression(CompressedSource.CompressionMode.GZIP);
+ case ZIP:
+ return CompressedSource.from(source)
+ .withDecompression(CompressedSource.CompressionMode.ZIP);
+ case DEFLATE:
+ return CompressedSource.from(source)
+ .withDecompression(CompressedSource.CompressionMode.DEFLATE);
+ default:
+ throw new IllegalArgumentException("Unknown compression type: " + getCompressionType());
+ }
+ }
+
+ @Override
+ public PCollection<T> expand(PBegin input) {
+ return input.apply(org.apache.beam.sdk.io.Read.from(createSource()));
+ }
+ }
+
+ /** Implementation of {@link #write}. */
+ @AutoValue
+ public abstract static class Write<T> extends PTransform<PCollection<T>, PDone> {
+ @Nullable
+ abstract String getFilenamePrefix();
+
+ @Nullable
+ abstract Class<T> getRecordClass();
+
+ @Nullable
+ abstract String getRootElement();
+
+ abstract Builder<T> toBuilder();
+
+ @AutoValue.Builder
+ abstract static class Builder<T> {
+ abstract Builder<T> setFilenamePrefix(String baseOutputFilename);
+
+ abstract Builder<T> setRecordClass(Class<T> recordClass);
+
+ abstract Builder<T> setRootElement(String rootElement);
+
+ abstract Write<T> build();
+ }
+
+
+ /**
+ * Writes to files with the given path prefix.
+ *
+ * <p>Output files will have the name {@literal {filenamePrefix}-0000i-of-0000n.xml} where n is
+ * the number of output bundles.
+ */
+ public Write<T> toFilenamePrefix(String filenamePrefix) {
+ return toBuilder().setFilenamePrefix(filenamePrefix).build();
+ }
+
+ /**
+ * Writes objects of the given class mapped to XML elements using JAXB.
+ *
+ * <p>The specified class must be able to be used to create a JAXB context.
+ */
+ public Write<T> withRecordClass(Class<T> recordClass) {
+ return toBuilder().setRecordClass(recordClass).build();
+ }
+
+ /**
+ * Sets the enclosing root element for the generated XML files.
+ */
+ public Write<T> withRootElement(String rootElement) {
+ return toBuilder().setRootElement(rootElement).build();
+ }
+
+ @Override
+ public void validate(PCollection<T> input) {
+ checkNotNull(getRecordClass(), "Missing a class to bind to a JAXB context.");
+ checkNotNull(getRootElement(), "Missing a root element name.");
+ checkNotNull(getFilenamePrefix(), "Missing a filename to write to.");
+ try {
+ JAXBContext.newInstance(getRecordClass());
+ } catch (JAXBException e) {
+ throw new RuntimeException("Error binding classes to a JAXB Context.", e);
+ }
+ }
+
+ @Override
+ public PDone expand(PCollection<T> input) {
+ return input.apply(org.apache.beam.sdk.io.Write.to(createSink()));
+ }
+
+ @VisibleForTesting
+ XmlSink<T> createSink() {
+ return new XmlSink<>(this);
+ }
+
+ @Override
+ public void populateDisplayData(DisplayData.Builder builder) {
+ createSink().populateFileBasedDisplayData(builder);
+ builder
+ .addIfNotNull(DisplayData.item("rootElement", getRootElement())
+ .withLabel("XML Root Element"))
+ .addIfNotNull(DisplayData.item("recordClass", getRecordClass())
+ .withLabel("XML Record Class"));
+ }
+ }
+}
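Putting the read and write Javadoc examples above together, a round-trip pipeline might look as follows; "Record" and the file paths are hypothetical placeholders, not part of the commit:

import javax.xml.bind.annotation.XmlRootElement;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.XmlIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.values.PCollection;

// Sketch assembled from the Javadoc above; Record and the paths are assumptions for illustration.
public class XmlRoundTripSketch {

  @XmlRootElement(name = "record")
  public static class Record {
    public String value;
  }

  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    PCollection<Record> records = p.apply(XmlIO.<Record>read()
        .from("/path/to/input-*.xml")
        .withRootElement("root")
        .withRecordElement("record")
        .withRecordClass(Record.class));
    records.apply(XmlIO.<Record>write()
        .withRecordClass(Record.class)
        .withRootElement("root")
        .toFilenamePrefix("/path/to/output/records"));
    p.run();
  }
}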
http://git-wip-us.apache.org/repos/asf/beam/blob/d0c0a60c/sdks/java/core/src/main/java/org/apache/beam/sdk/io/XmlSink.java
----------------------------------------------------------------------
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/XmlSink.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/XmlSink.java
index 2159c8f..7700329 100644
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/XmlSink.java
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/XmlSink.java
@@ -17,226 +17,58 @@
*/
package org.apache.beam.sdk.io;
-import static com.google.common.base.Preconditions.checkNotNull;
-
import java.io.OutputStream;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import javax.xml.bind.JAXBContext;
-import javax.xml.bind.JAXBException;
import javax.xml.bind.Marshaller;
import org.apache.beam.sdk.coders.StringUtf8Coder;
-import org.apache.beam.sdk.io.FileBasedSink.FileBasedWriteOperation;
-import org.apache.beam.sdk.io.FileBasedSink.FileBasedWriter;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.util.CoderUtils;
-import org.apache.beam.sdk.values.PCollection;
-// CHECKSTYLE.OFF: JavadocStyle
-/**
- * A {@link Sink} that outputs records as XML-formatted elements. Writes a {@link PCollection} of
- * records from JAXB-annotated classes to a single file location.
- *
- * <p>Given a PCollection containing records of type T that can be marshalled to XML elements, this
- * Sink will produce a single file consisting of a single root element that contains all of the
- * elements in the PCollection.
- *
- * <p>XML Sinks are created with a base filename to write to, a root element name that will be used
- * for the root element of the output files, and a class to bind to an XML element. This class
- * will be used in the marshalling of records in an input PCollection to their XML representation
- * and must be able to be bound using JAXB annotations (checked at pipeline construction time).
- *
- * <p>XML Sinks can be written to using the {@link Write} transform:
- *
- * <pre>
- * p.apply(Write.to(
- * XmlSink.ofRecordClass(Type.class)
- * .withRootElementName(root_element)
- * .toFilenamePrefix(output_filename)));
- * </pre>
- *
- * <p>For example, consider the following class with JAXB annotations:
- *
- * <pre>
- * {@literal @}XmlRootElement(name = "word_count_result")
- * {@literal @}XmlType(propOrder = {"word", "frequency"})
- * public class WordFrequency {
- * private String word;
- * private long frequency;
- *
- * public WordFrequency() { }
- *
- * public WordFrequency(String word, long frequency) {
- * this.word = word;
- * this.frequency = frequency;
- * }
- *
- * public void setWord(String word) {
- * this.word = word;
- * }
- *
- * public void setFrequency(long frequency) {
- * this.frequency = frequency;
- * }
- *
- * public long getFrequency() {
- * return frequency;
- * }
- *
- * public String getWord() {
- * return word;
- * }
- * }
- * </pre>
- *
- * <p>The following will produce XML output with a root element named "words" from a PCollection of
- * WordFrequency objects:
- * <pre>
- * p.apply(Write.to(
- * XmlSink.ofRecordClass(WordFrequency.class)
- * .withRootElement("words")
- * .toFilenamePrefix(output_file)));
- * </pre>
- *
- * <p>The output of which will look like:
- * <pre>
- * {@code
- * <words>
- *
- * <word_count_result>
- * <word>decreased</word>
- * <frequency>1</frequency>
- * </word_count_result>
- *
- * <word_count_result>
- * <word>War</word>
- * <frequency>4</frequency>
- * </word_count_result>
- *
- * <word_count_result>
- * <word>empress'</word>
- * <frequency>14</frequency>
- * </word_count_result>
- *
- * <word_count_result>
- * <word>stoops</word>
- * <frequency>6</frequency>
- * </word_count_result>
- *
- * ...
- * </words>
- * }</pre>
- */
-// CHECKSTYLE.ON: JavadocStyle
-@SuppressWarnings("checkstyle:javadocstyle")
-public class XmlSink {
+/** Implementation of {@link XmlIO#write}. */
+class XmlSink<T> extends FileBasedSink<T> {
protected static final String XML_EXTENSION = "xml";
- /**
- * Returns a builder for an XmlSink. You'll need to configure the class to bind, the root
- * element name, and the output file prefix with {@link Bound#ofRecordClass}, {@link
- * Bound#withRootElement}, and {@link Bound#toFilenamePrefix}, respectively.
- */
- public static Bound<?> write() {
- return new Bound<>(null, null, null);
+ private final XmlIO.Write<T> spec;
+
+ XmlSink(XmlIO.Write<T> spec) {
+ super(spec.getFilenamePrefix(), XML_EXTENSION);
+ this.spec = spec;
}
/**
- * Returns an XmlSink that writes objects as XML entities.
- *
- * <p>Output files will have the name {@literal {baseOutputFilename}-0000i-of-0000n.xml} where n
- * is the number of output bundles.
- *
- * @param klass the class of the elements to write.
- * @param rootElementName the enclosing root element.
- * @param baseOutputFilename the output filename prefix.
+ * Validates that the root element, class to bind to a JAXB context, and filenamePrefix have
+ * been set and that the class can be bound in a JAXB context.
*/
- public static <T> Bound<T> writeOf(
- Class<T> klass, String rootElementName, String baseOutputFilename) {
- return new Bound<>(klass, rootElementName, baseOutputFilename);
+ @Override
+ public void validate(PipelineOptions options) {
+ spec.validate(null);
}
/**
- * A {@link FileBasedSink} that writes objects as XML elements.
+ * Creates an {@link XmlWriteOperation}.
*/
- public static class Bound<T> extends FileBasedSink<T> {
- final Class<T> classToBind;
- final String rootElementName;
-
- private Bound(Class<T> classToBind, String rootElementName, String baseOutputFilename) {
- super(baseOutputFilename, XML_EXTENSION);
- this.classToBind = classToBind;
- this.rootElementName = rootElementName;
- }
-
- /**
- * Returns an XmlSink that writes objects of the class specified as XML elements.
- *
- * <p>The specified class must be able to be used to create a JAXB context.
- */
- public <T> Bound<T> ofRecordClass(Class<T> classToBind) {
- return new Bound<>(classToBind, rootElementName, getBaseOutputFilenameProvider().get());
- }
-
- /**
- * Returns an XmlSink that writes to files with the given prefix.
- *
- * <p>Output files will have the name {@literal {filenamePrefix}-0000i-of-0000n.xml} where n is
- * the number of output bundles.
- */
- public Bound<T> toFilenamePrefix(String baseOutputFilename) {
- return new Bound<>(classToBind, rootElementName, baseOutputFilename);
- }
-
- /**
- * Returns an XmlSink that writes XML files with an enclosing root element of the
- * supplied name.
- */
- public Bound<T> withRootElement(String rootElementName) {
- return new Bound<>(classToBind, rootElementName, getBaseOutputFilenameProvider().get());
- }
-
- /**
- * Validates that the root element, class to bind to a JAXB context, and filenamePrefix have
- * been set and that the class can be bound in a JAXB context.
- */
- @Override
- public void validate(PipelineOptions options) {
- checkNotNull(classToBind, "Missing a class to bind to a JAXB context.");
- checkNotNull(rootElementName, "Missing a root element name.");
- checkNotNull(getBaseOutputFilenameProvider().get(), "Missing a filename to write to.");
- try {
- JAXBContext.newInstance(classToBind);
- } catch (JAXBException e) {
- throw new RuntimeException("Error binding classes to a JAXB Context.", e);
- }
- }
+ @Override
+ public XmlWriteOperation<T> createWriteOperation(PipelineOptions options) {
+ return new XmlWriteOperation<>(this);
+ }
- /**
- * Creates an {@link XmlWriteOperation}.
- */
- @Override
- public XmlWriteOperation<T> createWriteOperation(PipelineOptions options) {
- return new XmlWriteOperation<>(this);
- }
+ @Override
+ public void populateDisplayData(DisplayData.Builder builder) {
+ spec.populateDisplayData(builder);
+ }
- @Override
- public void populateDisplayData(DisplayData.Builder builder) {
- super.populateDisplayData(builder);
- builder
- .addIfNotNull(DisplayData.item("rootElement", rootElementName)
- .withLabel("XML Root Element"))
- .addIfNotNull(DisplayData.item("recordClass", classToBind)
- .withLabel("XML Record Class"));
- }
+ void populateFileBasedDisplayData(DisplayData.Builder builder) {
+ super.populateDisplayData(builder);
}
/**
* {@link Sink.WriteOperation} for XML {@link Sink}s.
*/
protected static final class XmlWriteOperation<T> extends FileBasedWriteOperation<T> {
- public XmlWriteOperation(XmlSink.Bound<T> sink) {
+ public XmlWriteOperation(XmlSink<T> sink) {
super(sink);
}
@@ -247,7 +79,7 @@ public class XmlSink {
public XmlWriter<T> createWriter(PipelineOptions options) throws Exception {
JAXBContext context;
Marshaller marshaller;
- context = JAXBContext.newInstance(getSink().classToBind);
+ context = JAXBContext.newInstance(getSink().spec.getRecordClass());
marshaller = context.createMarshaller();
marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, Boolean.TRUE);
marshaller.setProperty(Marshaller.JAXB_FRAGMENT, Boolean.TRUE);
@@ -259,8 +91,8 @@ public class XmlSink {
* Return the XmlSink.Bound for this write operation.
*/
@Override
- public XmlSink.Bound<T> getSink() {
- return (XmlSink.Bound<T>) super.getSink();
+ public XmlSink<T> getSink() {
+ return (XmlSink<T>) super.getSink();
}
}
@@ -289,7 +121,7 @@ public class XmlSink {
*/
@Override
protected void writeHeader() throws Exception {
- String rootElementName = getWriteOperation().getSink().rootElementName;
+ String rootElementName = getWriteOperation().getSink().spec.getRootElement();
os.write(CoderUtils.encodeToByteArray(StringUtf8Coder.of(), "<" + rootElementName + ">\n"));
}
@@ -298,7 +130,7 @@ public class XmlSink {
*/
@Override
protected void writeFooter() throws Exception {
- String rootElementName = getWriteOperation().getSink().rootElementName;
+ String rootElementName = getWriteOperation().getSink().spec.getRootElement();
os.write(CoderUtils.encodeToByteArray(StringUtf8Coder.of(), "\n</" + rootElementName + ">"));
}
http://git-wip-us.apache.org/repos/asf/beam/blob/d0c0a60c/sdks/java/core/src/main/java/org/apache/beam/sdk/io/XmlSource.java
----------------------------------------------------------------------
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/XmlSource.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/XmlSource.java
index 6bf2015..7416c85 100644
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/XmlSource.java
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/XmlSource.java
@@ -17,8 +17,6 @@
*/
package org.apache.beam.sdk.io;
-import static com.google.common.base.Preconditions.checkNotNull;
-
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
@@ -45,154 +43,29 @@ import javax.xml.stream.XMLStreamReader;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.JAXBCoder;
import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.runners.PipelineRunner;
import org.apache.beam.sdk.transforms.display.DisplayData;
-import org.apache.beam.sdk.values.PCollection;
import org.codehaus.stax2.XMLInputFactory2;
-// CHECKSTYLE.OFF: JavadocStyle
-/**
- * A source that can be used to read XML files. This source reads one or more
- * XML files and creates a {@link PCollection} of a given type. A {@link Read} transform can be
- * created by passing an {@link XmlSource} object to {@link Read#from}. Please note the
- * example given below.
- *
- * <p>The XML file must be of the following form, where {@code root} and {@code record} are XML
- * element names that are defined by the user:
- *
- * <pre>
- * {@code
- * <root>
- * <record> ... </record>
- * <record> ... </record>
- * <record> ... </record>
- * ...
- * <record> ... </record>
- * </root>
- * }
- * </pre>
- *
- * <p>Basically, the XML document should contain a single root element with an inner list consisting
- * entirely of record elements. The records may contain arbitrary XML content; however, that content
- * <b>must not</b> contain the start {@code <record>} or end {@code </record>} tags. This
- * restriction enables reading from large XML files in parallel from different offsets in the file.
- *
- * <p>Root and/or record elements may additionally contain an arbitrary number of XML attributes.
- * Additionally, users must provide a class of a JAXB-annotated Java type that can be used to convert
- * records into Java objects and vice versa using JAXB marshalling/unmarshalling mechanisms. Reading
- * the source will generate a {@code PCollection} of the given JAXB annotated Java type.
- * Optionally users may provide a minimum size of a bundle that should be created for the source.
- *
- * <p>The following example shows how to read from {@link XmlSource} in a Beam pipeline:
- *
- * <pre>
- * {@code
- * XmlSource<String> source = XmlSource.<String>from(file.toPath().toString())
- * .withRootElement("root")
- * .withRecordElement("record")
- * .withRecordClass(Record.class);
- * PCollection<String> output = p.apply(Read.from(source));
- * }
- * </pre>
- *
- * <p>Currently, only XML files that use single-byte characters are supported. Using a file that
- * contains multi-byte characters may result in data loss or duplication.
- *
- * <p>To use {@link XmlSource}:
- * <ol>
- * <li>Explicitly declare a dependency on org.codehaus.woodstox:stax2-api</li>
- * <li>Include a compatible implementation on the classpath at run-time,
- * such as org.codehaus.woodstox:woodstox-core-asl</li>
- * </ol>
- *
- * <p>These dependencies have been declared as optional in the sdks/java/core/pom.xml file of
- * Apache Beam.
- *
- * <h3>Permissions</h3>
- * Permission requirements depend on the
- * {@link org.apache.beam.sdk.runners.PipelineRunner PipelineRunner} that is
- * used to execute the Beam pipeline. Please refer to the documentation of corresponding
- * {@link PipelineRunner PipelineRunners} for more details.
- *
- * @param <T> Type of the objects that represent the records of the XML file. The
- * {@code PCollection} generated by this source will be of this type.
- */
-// CHECKSTYLE.ON: JavadocStyle
+/** Implementation of {@link XmlIO#read}. */
public class XmlSource<T> extends FileBasedSource<T> {
private static final String XML_VERSION = "1.1";
- private static final int DEFAULT_MIN_BUNDLE_SIZE = 8 * 1024;
- private final String rootElement;
- private final String recordElement;
- private final Class<T> recordClass;
-
- /**
- * Creates an XmlSource for a single XML file or a set of XML files defined by a Java "glob" file
- * pattern. Each XML file should be of the form defined in {@link XmlSource}.
- */
- public static <T> XmlSource<T> from(String fileOrPatternSpec) {
- return new XmlSource<>(fileOrPatternSpec, DEFAULT_MIN_BUNDLE_SIZE, null, null, null);
- }
-
- /**
- * Sets name of the root element of the XML document. This will be used to create a valid starting
- * root element when initiating a bundle of records created from an XML document. This is a
- * required parameter.
- */
- public XmlSource<T> withRootElement(String rootElement) {
- return new XmlSource<>(
- getFileOrPatternSpec(), getMinBundleSize(), rootElement, recordElement, recordClass);
- }
- /**
- * Sets name of the record element of the XML document. This will be used to determine offset of
- * the first record of a bundle created from the XML document. This is a required parameter.
- */
- public XmlSource<T> withRecordElement(String recordElement) {
- return new XmlSource<>(
- getFileOrPatternSpec(), getMinBundleSize(), rootElement, recordElement, recordClass);
- }
+ private final XmlIO.Read<T> spec;
- /**
- * Sets a JAXB annotated class that can be populated using a record of the provided XML file. This
- * will be used when unmarshalling record objects from the XML file. This is a required
- * parameter.
- */
- public XmlSource<T> withRecordClass(Class<T> recordClass) {
- return new XmlSource<>(
- getFileOrPatternSpec(), getMinBundleSize(), rootElement, recordElement, recordClass);
+ XmlSource(XmlIO.Read<T> spec) {
+ super(spec.getFileOrPatternSpec(), spec.getMinBundleSize());
+ this.spec = spec;
}
- /**
- * Sets a parameter {@code minBundleSize} for the minimum bundle size of the source. Please refer
- * to {@link OffsetBasedSource} for the definition of minBundleSize. This is an optional
- * parameter.
- */
- public XmlSource<T> withMinBundleSize(long minBundleSize) {
- return new XmlSource<>(
- getFileOrPatternSpec(), minBundleSize, rootElement, recordElement, recordClass);
- }
-
- private XmlSource(String fileOrPattern, long minBundleSize, String rootElement,
- String recordElement, Class<T> recordClass) {
- super(fileOrPattern, minBundleSize);
- this.rootElement = rootElement;
- this.recordElement = recordElement;
- this.recordClass = recordClass;
- }
-
- private XmlSource(String fileOrPattern, long minBundleSize, long startOffset, long endOffset,
- String rootElement, String recordElement, Class<T> recordClass) {
- super(fileOrPattern, minBundleSize, startOffset, endOffset);
- this.rootElement = rootElement;
- this.recordElement = recordElement;
- this.recordClass = recordClass;
+ private XmlSource(XmlIO.Read<T> spec, long startOffset, long endOffset) {
+ super(spec.getFileOrPatternSpec(), spec.getMinBundleSize(), startOffset, endOffset);
+ this.spec = spec;
}
@Override
protected FileBasedSource<T> createForSubrangeOfFile(String fileName, long start, long end) {
- return new XmlSource<T>(
- fileName, getMinBundleSize(), start, end, rootElement, recordElement, recordClass);
+ return new XmlSource<T>(spec.from(fileName), start, end);
}
@Override
@@ -203,42 +76,17 @@ public class XmlSource<T> extends FileBasedSource<T> {
@Override
public void validate() {
super.validate();
- checkNotNull(
- rootElement, "rootElement is null. Use builder method withRootElement() to set this.");
- checkNotNull(
- recordElement,
- "recordElement is null. Use builder method withRecordElement() to set this.");
- checkNotNull(
- recordClass, "recordClass is null. Use builder method withRecordClass() to set this.");
+ spec.validate(null);
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
- super.populateDisplayData(builder);
- builder
- .addIfNotNull(DisplayData.item("rootElement", rootElement)
- .withLabel("XML Root Element"))
- .addIfNotNull(DisplayData.item("recordElement", recordElement)
- .withLabel("XML Record Element"))
- .addIfNotNull(DisplayData.item("recordClass", recordClass)
- .withLabel("XML Record Class"));
+ spec.populateDisplayData(builder);
}
@Override
public Coder<T> getDefaultOutputCoder() {
- return JAXBCoder.of(recordClass);
- }
-
- public String getRootElement() {
- return rootElement;
- }
-
- public String getRecordElement() {
- return recordElement;
- }
-
- public Class<T> getRecordClass() {
- return recordClass;
+ return JAXBCoder.of(spec.getRecordClass());
}
/**
@@ -289,7 +137,7 @@ public class XmlSource<T> extends FileBasedSource<T> {
// Set up a JAXB Unmarshaller that can be used to unmarshall record objects.
try {
- JAXBContext jaxbContext = JAXBContext.newInstance(getCurrentSource().recordClass);
+ JAXBContext jaxbContext = JAXBContext.newInstance(getCurrentSource().spec.getRecordClass());
jaxbUnmarshaller = jaxbContext.createUnmarshaller();
// Throw errors if validation fails. JAXB by default ignores validation errors.
@@ -334,8 +182,10 @@ public class XmlSource<T> extends FileBasedSource<T> {
// this XML parsing may fail or may produce incorrect results.
byte[] dummyStartDocumentBytes =
- ("<?xml version=\"" + XML_VERSION + "\" encoding=\"UTF-8\" ?>"
- + "<" + getCurrentSource().rootElement + ">").getBytes(StandardCharsets.UTF_8);
+ (String.format(
+ "<?xml version=\"%s\" encoding=\"UTF-8\" ?><%s>",
+ XML_VERSION, getCurrentSource().spec.getRootElement()))
+ .getBytes(StandardCharsets.UTF_8);
preambleByteBuffer.write(dummyStartDocumentBytes);
// Gets the byte offset (in the input file) of the first record in ReadableByteChannel. This
// method returns the offset and stores any bytes that should be used when creating the XML
@@ -383,7 +233,7 @@ public class XmlSource<T> extends FileBasedSource<T> {
ByteBuffer buf = ByteBuffer.allocate(BUF_SIZE);
byte[] recordStartBytes =
- ("<" + getCurrentSource().recordElement).getBytes(StandardCharsets.UTF_8);
+ ("<" + getCurrentSource().spec.getRecordElement()).getBytes(StandardCharsets.UTF_8);
outer: while (channel.read(buf) > 0) {
buf.flip();
@@ -494,7 +344,7 @@ public class XmlSource<T> extends FileBasedSource<T> {
int event = parser.next();
if (event == XMLStreamConstants.START_ELEMENT) {
String localName = parser.getLocalName();
- if (localName.equals(getCurrentSource().recordElement)) {
+ if (localName.equals(getCurrentSource().spec.getRecordElement())) {
break;
}
}
@@ -521,7 +371,8 @@ public class XmlSource<T> extends FileBasedSource<T> {
return false;
}
}
- JAXBElement<T> jb = jaxbUnmarshaller.unmarshal(parser, getCurrentSource().recordClass);
+ JAXBElement<T> jb =
+ jaxbUnmarshaller.unmarshal(parser, getCurrentSource().spec.getRecordClass());
currentRecord = jb.getValue();
return true;
} catch (JAXBException | XMLStreamException e) {
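Editor's note on the refactoring above: the removed javadoc explains that records must be JAXB-annotated so the source can unmarshal them, and the diff moves all configuration onto XmlIO.Read, which XmlSource now merely wraps. The following is a minimal sketch, not taken from the commit, of a JAXB record type and the equivalent XmlIO-based read; the WordRecord class, its field, and the file pattern are illustrative assumptions.
import javax.xml.bind.annotation.XmlAttribute;
import javax.xml.bind.annotation.XmlRootElement;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.XmlIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.values.PCollection;
/** Hypothetical JAXB-annotated record type; the name and field are illustrative only. */
@XmlRootElement(name = "record")
class WordRecord {
  @XmlAttribute String name;
}
class XmlReadSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
    // The builder mirrors the XmlIO.Read spec that XmlSource now wraps internally.
    PCollection<WordRecord> records =
        p.apply(
            XmlIO.<WordRecord>read()
                .from("/path/to/input*.xml")            // assumed file pattern
                .withRootElement("root")
                .withRecordElement("record")
                .withRecordClass(WordRecord.class));
    p.run();
  }
}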
http://git-wip-us.apache.org/repos/asf/beam/blob/d0c0a60c/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/display/DisplayData.java
----------------------------------------------------------------------
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/display/DisplayData.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/display/DisplayData.java
index 669dc6d..3c4337b 100644
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/display/DisplayData.java
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/display/DisplayData.java
@@ -778,6 +778,12 @@ public class DisplayData implements Serializable {
visitedComponents.add(subComponent);
visitedPathMap.put(path, subComponent);
Class<?> namespace = subComponent.getClass();
+ // Common case: AutoValue classes such as AutoValue_FooIO_Read. It's more useful
+ // to show the user the FooIO.Read class, which is the direct superclass of the AutoValue
+ // generated class.
+ if (namespace.getSimpleName().startsWith("AutoValue_")) {
+ namespace = namespace.getSuperclass();
+ }
Path prevPath = latestPath;
Class<?> prevNs = latestNs;
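A minimal sketch, not part of the commit, of what the namespace remapping added above does at runtime, assuming an @AutoValue abstract class Foo whose generated subclass is AutoValue_Foo (mirroring the test added further below):
// Hypothetical classes: @AutoValue abstract class Foo { ... } generates AutoValue_Foo.
Class<?> namespace = AutoValue_Foo.class;               // runtime class of the component
if (namespace.getSimpleName().startsWith("AutoValue_")) {
  namespace = namespace.getSuperclass();                // reported namespace becomes Foo
}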
http://git-wip-us.apache.org/repos/asf/beam/blob/d0c0a60c/sdks/java/core/src/test/java/org/apache/beam/sdk/io/XmlSinkTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/XmlSinkTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/XmlSinkTest.java
index 63b5d11..7f559d1 100644
--- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/XmlSinkTest.java
+++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/XmlSinkTest.java
@@ -59,7 +59,6 @@ public class XmlSinkTest {
@Rule
public ExpectedException thrown = ExpectedException.none();
- private Class<Bird> testClass = Bird.class;
private String testRootElement = "testElement";
private String testFilePrefix = "/path/to/testPrefix";
@@ -70,7 +69,12 @@ public class XmlSinkTest {
public void testXmlWriter() throws Exception {
PipelineOptions options = PipelineOptionsFactory.create();
XmlWriteOperation<Bird> writeOp =
- XmlSink.writeOf(Bird.class, "birds", testFilePrefix).createWriteOperation(options);
+ XmlIO.<Bird>write()
+ .toFilenamePrefix(testFilePrefix)
+ .withRecordClass(Bird.class)
+ .withRootElement("birds")
+ .createSink()
+ .createWriteOperation(options);
XmlWriter<Bird> writer = writeOp.createWriter(options);
List<Bird> bundle =
@@ -85,51 +89,37 @@ public class XmlSinkTest {
* Builder methods correctly initialize an XML Sink.
*/
@Test
- public void testBuildXmlSink() {
- XmlSink.Bound<Bird> sink =
- XmlSink.write()
+ public void testBuildXmlWriteTransform() {
+ XmlIO.Write<Bird> write =
+ XmlIO.<Bird>write()
.toFilenamePrefix(testFilePrefix)
- .ofRecordClass(testClass)
+ .withRecordClass(Bird.class)
.withRootElement(testRootElement);
- assertEquals(testClass, sink.classToBind);
- assertEquals(testRootElement, sink.rootElementName);
- assertEquals(testFilePrefix, sink.getBaseOutputFilenameProvider().get());
+ assertEquals(Bird.class, write.getRecordClass());
+ assertEquals(testRootElement, write.getRootElement());
+ assertEquals(testFilePrefix, write.getFilenamePrefix());
}
- /**
- * Alternate builder method correctly initializes an XML Sink.
- */
+ /** Validation ensures no fields are missing. */
@Test
- public void testBuildXmlSinkDirect() {
- XmlSink.Bound<Bird> sink =
- XmlSink.writeOf(Bird.class, testRootElement, testFilePrefix);
- assertEquals(testClass, sink.classToBind);
- assertEquals(testRootElement, sink.rootElementName);
- assertEquals(testFilePrefix, sink.getBaseOutputFilenameProvider().get());
+ public void testValidateXmlSinkMissingRecordClass() {
+ thrown.expect(NullPointerException.class);
+ XmlIO.<Bird>write()
+ .withRootElement(testRootElement)
+ .toFilenamePrefix(testFilePrefix)
+ .validate(null);
}
- /**
- * Validation ensures no fields are missing.
- */
@Test
- public void testValidateXmlSinkMissingFields() {
- XmlSink.Bound<Bird> sink;
- sink = XmlSink.writeOf(null, testRootElement, testFilePrefix);
- validateAndFailIfSucceeds(sink, NullPointerException.class);
- sink = XmlSink.writeOf(testClass, null, testFilePrefix);
- validateAndFailIfSucceeds(sink, NullPointerException.class);
- sink = XmlSink.writeOf(testClass, testRootElement, null);
- validateAndFailIfSucceeds(sink, NullPointerException.class);
+ public void testValidateXmlSinkMissingRootElement() {
+ thrown.expect(NullPointerException.class);
+ XmlIO.<Bird>write().withRecordClass(Bird.class).toFilenamePrefix(testFilePrefix).validate(null);
}
- /**
- * Call validate and fail if validation does not throw the expected exception.
- */
- private <T> void validateAndFailIfSucceeds(
- XmlSink.Bound<T> sink, Class<? extends Exception> expected) {
- thrown.expect(expected);
- PipelineOptions options = PipelineOptionsFactory.create();
- sink.validate(options);
+ @Test
+ public void testValidateXmlSinkMissingFilePrefix() {
+ thrown.expect(NullPointerException.class);
+ XmlIO.<Bird>write().withRecordClass(Bird.class).withRootElement(testRootElement).validate(null);
}
/**
@@ -138,13 +128,13 @@ public class XmlSinkTest {
@Test
public void testCreateWriteOperations() {
PipelineOptions options = PipelineOptionsFactory.create();
- XmlSink.Bound<Bird> sink =
- XmlSink.writeOf(testClass, testRootElement, testFilePrefix);
+ XmlSink<Bird> sink =
+ XmlIO.<Bird>write()
+ .withRecordClass(Bird.class)
+ .withRootElement(testRootElement)
+ .toFilenamePrefix(testFilePrefix)
+ .createSink();
XmlWriteOperation<Bird> writeOp = sink.createWriteOperation(options);
- assertEquals(testClass, writeOp.getSink().classToBind);
- assertEquals(testFilePrefix, writeOp.getSink().getBaseOutputFilenameProvider().get());
- assertEquals(testRootElement, writeOp.getSink().rootElementName);
- // assertEquals(XmlSink.XML_EXTENSION, writeOp.getSink().getFilenamePolicy().extension);
Path outputPath = new File(testFilePrefix).toPath();
Path tempPath = new File(writeOp.tempDirectory.get()).toPath();
assertEquals(outputPath.getParent(), tempPath.getParent());
@@ -159,7 +149,11 @@ public class XmlSinkTest {
public void testCreateWriter() throws Exception {
PipelineOptions options = PipelineOptionsFactory.create();
XmlWriteOperation<Bird> writeOp =
- XmlSink.writeOf(testClass, testRootElement, testFilePrefix)
+ XmlIO.<Bird>write()
+ .withRecordClass(Bird.class)
+ .withRootElement(testRootElement)
+ .toFilenamePrefix(testFilePrefix)
+ .createSink()
.createWriteOperation(options);
XmlWriter<Bird> writer = writeOp.createWriter(options);
Path outputPath = new File(testFilePrefix).toPath();
@@ -167,18 +161,17 @@ public class XmlSinkTest {
assertEquals(outputPath.getParent(), tempPath.getParent());
assertThat(
tempPath.getFileName().toString(), containsString("temp-beam-" + outputPath.getFileName()));
- assertEquals(testRootElement, writer.getWriteOperation().getSink().rootElementName);
assertNotNull(writer.marshaller);
}
@Test
public void testDisplayData() {
- XmlSink.Bound<Integer> sink = XmlSink.write()
+ XmlIO.Write<Integer> write = XmlIO.<Integer>write()
.toFilenamePrefix("foobar")
.withRootElement("bird")
- .ofRecordClass(Integer.class);
+ .withRecordClass(Integer.class);
- DisplayData displayData = DisplayData.from(sink);
+ DisplayData displayData = DisplayData.from(write);
assertThat(displayData, hasDisplayItem("fileNamePattern", "foobar-SSSSS-of-NNNNN.xml"));
assertThat(displayData, hasDisplayItem("rootElement", "bird"));
http://git-wip-us.apache.org/repos/asf/beam/blob/d0c0a60c/sdks/java/core/src/test/java/org/apache/beam/sdk/io/XmlSourceTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/XmlSourceTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/XmlSourceTest.java
index 5f71f30..0120b8b 100644
--- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/XmlSourceTest.java
+++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/XmlSourceTest.java
@@ -285,12 +285,14 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLTiny");
Files.write(file.toPath(), tinyXML.getBytes(StandardCharsets.UTF_8));
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("trains")
.withRecordElement("train")
.withRecordClass(Train.class)
- .withMinBundleSize(1024);
+ .withMinBundleSize(1024)
+ .createSource();
List<Train> expectedResults = ImmutableList.of(
new Train("Thomas", Train.TRAIN_NUMBER_UNDEFINED, null, null),
@@ -308,12 +310,14 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLTiny");
Files.write(file.toPath(), xmlWithMultiByteChars.getBytes(StandardCharsets.UTF_8));
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("trains")
.withRecordElement("train")
.withRecordClass(Train.class)
- .withMinBundleSize(1024);
+ .withMinBundleSize(1024)
+ .createSource();
List<Train> expectedResults = ImmutableList.of(
new Train("Thomas�", Train.TRAIN_NUMBER_UNDEFINED, null, null),
@@ -334,12 +338,14 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLTiny");
Files.write(file.toPath(), xmlWithMultiByteElementName.getBytes(StandardCharsets.UTF_8));
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("\u0daf\u0dd4\u0db8\u0dca\u0dbb\u0dd2\u0dba\u0db1\u0dca")
.withRecordElement("\u0daf\u0dd4\u0db8\u0dca\u0dbb\u0dd2\u0dba")
.withRecordClass(Train.class)
- .withMinBundleSize(1024);
+ .withMinBundleSize(1024)
+ .createSource();
List<Train> expectedResults = ImmutableList.of(
new Train("Thomas", Train.TRAIN_NUMBER_UNDEFINED, null, null),
@@ -357,18 +363,20 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLTiny");
Files.write(file.toPath(), tinyXML.getBytes(StandardCharsets.UTF_8));
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("trains")
.withRecordElement("train")
.withRecordClass(Train.class)
- .withMinBundleSize(10);
- List<? extends FileBasedSource<Train>> splits = source.split(50, null);
+ .withMinBundleSize(10)
+ .createSource();
+ List<? extends BoundedSource<Train>> splits = source.split(50, null);
assertTrue(splits.size() > 2);
List<Train> results = new ArrayList<>();
- for (FileBasedSource<Train> split : splits) {
+ for (BoundedSource<Train> split : splits) {
results.addAll(readEverythingFromReader(split.createReader(null)));
}
@@ -394,12 +402,14 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLSmall");
Files.write(file.toPath(), trainXML.getBytes(StandardCharsets.UTF_8));
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("trains")
.withRecordElement("train")
.withRecordClass(Train.class)
- .withMinBundleSize(1024);
+ .withMinBundleSize(1024)
+ .createSource();
List<Train> expectedResults =
ImmutableList.of(new Train("Thomas", 1, "blue", null), new Train("Henry", 3, "green", null),
@@ -417,10 +427,12 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLSmall");
Files.write(file.toPath(), trainXML.getBytes(StandardCharsets.UTF_8));
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRecordElement("train")
- .withRecordClass(Train.class);
+ .withRecordClass(Train.class)
+ .createSource();
exception.expect(NullPointerException.class);
exception.expectMessage(
@@ -433,10 +445,12 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLSmall");
Files.write(file.toPath(), trainXML.getBytes(StandardCharsets.UTF_8));
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("trains")
- .withRecordClass(Train.class);
+ .withRecordClass(Train.class)
+ .createSource();
exception.expect(NullPointerException.class);
exception.expectMessage(
@@ -449,10 +463,12 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLSmall");
Files.write(file.toPath(), trainXML.getBytes(StandardCharsets.UTF_8));
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("trains")
- .withRecordElement("train");
+ .withRecordElement("train")
+ .createSource();
exception.expect(NullPointerException.class);
exception.expectMessage(
@@ -465,11 +481,13 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLSmall");
Files.write(file.toPath(), trainXML.getBytes(StandardCharsets.UTF_8));
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("something")
.withRecordElement("train")
- .withRecordClass(Train.class);
+ .withRecordClass(Train.class)
+ .createSource();
exception.expectMessage("Unexpected close tag </trains>; expected </something>.");
readEverythingFromReader(source.createReader(null));
@@ -480,11 +498,13 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLSmall");
Files.write(file.toPath(), trainXML.getBytes(StandardCharsets.UTF_8));
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("trains")
.withRecordElement("something")
- .withRecordClass(Train.class);
+ .withRecordClass(Train.class)
+ .createSource();
assertEquals(readEverythingFromReader(source.createReader(null)), new ArrayList<Train>());
}
@@ -500,11 +520,13 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLSmall");
Files.write(file.toPath(), trainXML.getBytes(StandardCharsets.UTF_8));
- XmlSource<WrongTrainType> source =
- XmlSource.<WrongTrainType>from(file.toPath().toString())
+ BoundedSource<WrongTrainType> source =
+ XmlIO.<WrongTrainType>read()
+ .from(file.toPath().toString())
.withRootElement("trains")
.withRecordElement("train")
- .withRecordClass(WrongTrainType.class);
+ .withRecordClass(WrongTrainType.class)
+ .createSource();
exception.expect(RuntimeException.class);
@@ -525,11 +547,13 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLSmall");
Files.write(file.toPath(), trainXML.getBytes(StandardCharsets.UTF_8));
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("trains")
.withRecordElement("train")
- .withRecordClass(Train.class);
+ .withRecordClass(Train.class)
+ .createSource();
List<Train> expectedResults =
ImmutableList.of(new Train("Thomas", 1, "blue", null), new Train("Henry", 3, "green", null),
@@ -548,12 +572,14 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLSmall");
Files.write(file.toPath(), trainXMLWithEmptyTags.getBytes(StandardCharsets.UTF_8));
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("trains")
.withRecordElement("train")
.withRecordClass(Train.class)
- .withMinBundleSize(1024);
+ .withMinBundleSize(1024)
+ .createSource();
List<Train> expectedResults = ImmutableList.of(new Train("Thomas", 1, "blue", null),
new Train("Henry", 3, "green", null), new Train("Toby", 7, "brown", null),
@@ -572,14 +598,15 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLSmall");
Files.write(file.toPath(), trainXML.getBytes(StandardCharsets.UTF_8));
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
- .withRootElement("trains")
- .withRecordElement("train")
- .withRecordClass(Train.class)
- .withMinBundleSize(1024);
-
- PCollection<Train> output = p.apply("ReadFileData", Read.from(source));
+ PCollection<Train> output =
+ p.apply(
+ "ReadFileData",
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
+ .withRootElement("trains")
+ .withRecordElement("train")
+ .withRecordClass(Train.class)
+ .withMinBundleSize(1024));
List<Train> expectedResults =
ImmutableList.of(new Train("Thomas", 1, "blue", null), new Train("Henry", 3, "green", null),
@@ -595,12 +622,14 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLSmall");
Files.write(file.toPath(), trainXMLWithAttributes.getBytes(StandardCharsets.UTF_8));
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("trains")
.withRecordElement("train")
.withRecordClass(Train.class)
- .withMinBundleSize(1024);
+ .withMinBundleSize(1024)
+ .createSource();
List<Train> expectedResults = ImmutableList.of(new Train("Thomas", 1, "blue", "small"),
new Train("Henry", 3, "green", "big"), new Train("Toby", 7, "brown", "small"),
@@ -618,12 +647,14 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLSmall");
Files.write(file.toPath(), trainXMLWithSpaces.getBytes(StandardCharsets.UTF_8));
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("trains")
.withRecordElement("train")
.withRecordClass(Train.class)
- .withMinBundleSize(1024);
+ .withMinBundleSize(1024)
+ .createSource();
List<Train> expectedResults = ImmutableList.of(new Train("Thomas ", 1, "blue", null),
new Train("Henry", 3, "green", null), new Train("Toby", 7, " brown ", null),
@@ -642,12 +673,14 @@ public class XmlSourceTest {
List<Train> trains = generateRandomTrainList(100);
File file = createRandomTrainXML(fileName, trains);
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("trains")
.withRecordElement("train")
.withRecordClass(Train.class)
- .withMinBundleSize(1024);
+ .withMinBundleSize(1024)
+ .createSource();
assertThat(
trainsToStrings(trains),
@@ -662,13 +695,15 @@ public class XmlSourceTest {
List<Train> trains = generateRandomTrainList(100);
File file = createRandomTrainXML(fileName, trains);
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
- .withRootElement("trains")
- .withRecordElement("train")
- .withRecordClass(Train.class)
- .withMinBundleSize(1024);
- PCollection<Train> output = p.apply("ReadFileData", Read.from(source));
+ PCollection<Train> output =
+ p.apply(
+ "ReadFileData",
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
+ .withRootElement("trains")
+ .withRecordElement("train")
+ .withRecordClass(Train.class)
+ .withMinBundleSize(1024));
PAssert.that(output).containsInAnyOrder(trains);
p.run();
@@ -680,18 +715,20 @@ public class XmlSourceTest {
List<Train> trains = generateRandomTrainList(10);
File file = createRandomTrainXML(fileName, trains);
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("trains")
.withRecordElement("train")
.withRecordClass(Train.class)
- .withMinBundleSize(10);
- List<? extends FileBasedSource<Train>> splits = source.split(100, null);
+ .withMinBundleSize(10)
+ .createSource();
+ List<? extends BoundedSource<Train>> splits = source.split(100, null);
assertTrue(splits.size() > 2);
List<Train> results = new ArrayList<>();
- for (FileBasedSource<Train> split : splits) {
+ for (BoundedSource<Train> split : splits) {
results.addAll(readEverythingFromReader(split.createReader(null)));
}
@@ -704,19 +741,21 @@ public class XmlSourceTest {
List<Train> trains = generateRandomTrainList(100);
File file = createRandomTrainXML(fileName, trains);
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("trains")
.withRecordElement("train")
.withRecordClass(Train.class)
- .withMinBundleSize(10);
- List<? extends FileBasedSource<Train>> splits = source.split(256, null);
+ .withMinBundleSize(10)
+ .createSource();
+ List<? extends BoundedSource<Train>> splits = source.split(256, null);
// Not a trivial split
assertTrue(splits.size() > 2);
List<Train> results = new ArrayList<>();
- for (FileBasedSource<Train> split : splits) {
+ for (BoundedSource<Train> split : splits) {
results.addAll(readEverythingFromReader(split.createReader(null)));
}
assertThat(trainsToStrings(trains), containsInAnyOrder(trainsToStrings(results).toArray()));
@@ -729,14 +768,16 @@ public class XmlSourceTest {
List<Train> trains = generateRandomTrainList(100);
File file = createRandomTrainXML(fileName, trains);
- XmlSource<Train> fileSource =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> fileSource =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("trains")
.withRecordElement("train")
.withRecordClass(Train.class)
- .withMinBundleSize(10);
+ .withMinBundleSize(10)
+ .createSource();
- List<? extends FileBasedSource<Train>> splits =
+ List<? extends BoundedSource<Train>> splits =
fileSource.split(file.length() / 3, null);
for (BoundedSource<Train> splitSource : splits) {
int numItems = readEverythingFromReader(splitSource.createReader(null)).size();
@@ -771,11 +812,13 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLSmall");
Files.write(file.toPath(), trainXMLWithAllFeaturesSingleByte.getBytes(StandardCharsets.UTF_8));
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("trains")
.withRecordElement("train")
- .withRecordClass(Train.class);
+ .withRecordClass(Train.class)
+ .createSource();
assertSplitAtFractionExhaustive(source, options);
}
@@ -788,11 +831,13 @@ public class XmlSourceTest {
File file = tempFolder.newFile("trainXMLSmall");
Files.write(file.toPath(), trainXMLWithAllFeaturesMultiByte.getBytes(StandardCharsets.UTF_8));
- XmlSource<Train> source =
- XmlSource.<Train>from(file.toPath().toString())
+ BoundedSource<Train> source =
+ XmlIO.<Train>read()
+ .from(file.toPath().toString())
.withRootElement("\u0daf\u0dd4\u0db8\u0dca\u0dbb\u0dd2\u0dba\u0db1\u0dca")
.withRecordElement("\u0daf\u0dd4\u0db8\u0dca\u0dbb\u0dd2\u0dba")
- .withRecordClass(Train.class);
+ .withRecordClass(Train.class)
+ .createSource();
assertSplitAtFractionExhaustive(source, options);
}
@@ -808,13 +853,15 @@ public class XmlSourceTest {
generateRandomTrainList(8);
createRandomTrainXML("otherfile.xml", trains1);
- XmlSource<Train> source =
- XmlSource.<Train>from(file.getParent() + "/" + "temp*.xml")
- .withRootElement("trains")
- .withRecordElement("train")
- .withRecordClass(Train.class)
- .withMinBundleSize(1024);
- PCollection<Train> output = p.apply("ReadFileData", Read.from(source));
+ PCollection<Train> output =
+ p.apply(
+ "ReadFileData",
+ XmlIO.<Train>read()
+ .from(file.getParent() + "/" + "temp*.xml")
+ .withRootElement("trains")
+ .withRecordElement("train")
+ .withRecordClass(Train.class)
+ .withMinBundleSize(1024));
List<Train> expectedResults = new ArrayList<>();
expectedResults.addAll(trains1);
@@ -827,15 +874,14 @@ public class XmlSourceTest {
@Test
public void testDisplayData() {
-
-
- XmlSource<?> source = XmlSource
- .<Integer>from("foo.xml")
- .withRootElement("bird")
- .withRecordElement("cat")
- .withMinBundleSize(1234)
- .withRecordClass(Integer.class);
- DisplayData displayData = DisplayData.from(source);
+ DisplayData displayData =
+ DisplayData.from(
+ XmlIO.<Integer>read()
+ .from("foo.xml")
+ .withRootElement("bird")
+ .withRecordElement("cat")
+ .withMinBundleSize(1234)
+ .withRecordClass(Integer.class));
assertThat(displayData, hasDisplayItem("filePattern", "foo.xml"));
assertThat(displayData, hasDisplayItem("rootElement", "bird"));
http://git-wip-us.apache.org/repos/asf/beam/blob/d0c0a60c/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/display/DisplayDataTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/display/DisplayDataTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/display/DisplayDataTest.java
index c617f06..9b24b69 100644
--- a/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/display/DisplayDataTest.java
+++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/display/DisplayDataTest.java
@@ -44,8 +44,10 @@ import static org.junit.Assert.fail;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.auto.value.AutoValue;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMultimap;
+import com.google.common.collect.Iterables;
import com.google.common.collect.Multimap;
import com.google.common.testing.EqualsTester;
import java.io.IOException;
@@ -1299,6 +1301,21 @@ public class DisplayDataTest implements Serializable {
DisplayData.from(component);
}
+ @AutoValue
+ abstract static class Foo implements HasDisplayData {
+ @Override
+ public void populateDisplayData(Builder builder) {
+ builder.add(DisplayData.item("someKey", "someValue"));
+ }
+ }
+
+ @Test
+ public void testAutoValue() {
+ DisplayData data = DisplayData.from(new AutoValue_DisplayDataTest_Foo());
+ Item item = Iterables.getOnlyElement(data.asMap().values());
+ assertEquals(Foo.class, item.getNamespace());
+ }
+
private String quoted(Object obj) {
return String.format("\"%s\"", obj);
}
[19/50] [abbrv] beam git commit: This closes #2415
Posted by dh...@apache.org.
This closes #2415
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/57929fb8
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/57929fb8
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/57929fb8
Branch: refs/heads/DSL_SQL
Commit: 57929fb802d0cb6a6b6c3f14819d473dc2ace113
Parents: e0df7d8 7d13061
Author: Eugene Kirpichov <ki...@google.com>
Authored: Tue Apr 18 21:13:05 2017 -0700
Committer: Eugene Kirpichov <ki...@google.com>
Committed: Tue Apr 18 21:13:05 2017 -0700
----------------------------------------------------------------------
.../apache/beam/sdk/util/IOChannelUtils.java | 9 +
.../sdk/io/gcp/bigquery/BatchLoadBigQuery.java | 180 ---
.../beam/sdk/io/gcp/bigquery/BatchLoads.java | 225 +++
.../sdk/io/gcp/bigquery/BigQueryHelpers.java | 13 +
.../beam/sdk/io/gcp/bigquery/BigQueryIO.java | 113 +-
.../io/gcp/bigquery/BigQueryTableSource.java | 4 +-
.../beam/sdk/io/gcp/bigquery/CreateTables.java | 127 ++
.../io/gcp/bigquery/GenerateShardedTable.java | 47 +
.../beam/sdk/io/gcp/bigquery/PrepareWrite.java | 81 +
.../beam/sdk/io/gcp/bigquery/ShardedKey.java | 25 +-
.../sdk/io/gcp/bigquery/StreamWithDeDup.java | 90 --
.../sdk/io/gcp/bigquery/StreamingInserts.java | 79 +
.../sdk/io/gcp/bigquery/StreamingWriteFn.java | 81 +-
.../io/gcp/bigquery/StreamingWriteTables.java | 86 ++
.../sdk/io/gcp/bigquery/TableDestination.java | 76 +
.../io/gcp/bigquery/TableDestinationCoder.java | 60 +
.../sdk/io/gcp/bigquery/TableRowWriter.java | 19 +-
.../sdk/io/gcp/bigquery/TagWithUniqueIds.java | 62 +
.../gcp/bigquery/TagWithUniqueIdsAndTable.java | 135 --
.../beam/sdk/io/gcp/bigquery/WriteBundles.java | 82 --
.../io/gcp/bigquery/WriteBundlesToFiles.java | 157 ++
.../sdk/io/gcp/bigquery/WritePartition.java | 163 +-
.../beam/sdk/io/gcp/bigquery/WriteRename.java | 71 +-
.../beam/sdk/io/gcp/bigquery/WriteTables.java | 58 +-
.../sdk/io/gcp/bigquery/BigQueryIOTest.java | 1393 +++++++-----------
.../io/gcp/bigquery/FakeBigQueryServices.java | 166 +++
.../sdk/io/gcp/bigquery/FakeDatasetService.java | 208 +++
.../sdk/io/gcp/bigquery/FakeJobService.java | 395 +++++
.../sdk/io/gcp/bigquery/TableContainer.java | 61 +
29 files changed, 2642 insertions(+), 1624 deletions(-)
----------------------------------------------------------------------
[05/50] [abbrv] beam git commit: Separates side input test and side
output test
Posted by dh...@apache.org.
Separates side input test and side output test
Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/a51bdd26
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/a51bdd26
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/a51bdd26
Branch: refs/heads/DSL_SQL
Commit: a51bdd266f9c877cb407de986a465fc9c7de76ff
Parents: a9bcc8b
Author: Eugene Kirpichov <ki...@google.com>
Authored: Sat Apr 15 16:38:35 2017 -0700
Committer: Eugene Kirpichov <ki...@google.com>
Committed: Tue Apr 18 18:02:06 2017 -0700
----------------------------------------------------------------------
.../beam/sdk/transforms/SplittableDoFnTest.java | 63 ++++++++++++++------
1 file changed, 44 insertions(+), 19 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/beam/blob/a51bdd26/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/SplittableDoFnTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/SplittableDoFnTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/SplittableDoFnTest.java
index 9e8c12e..30329f4 100644
--- a/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/SplittableDoFnTest.java
+++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/SplittableDoFnTest.java
@@ -22,6 +22,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
+import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -61,7 +62,7 @@ import org.junit.runners.JUnit4;
 * Tests for <a href="https://s.apache.org/splittable-do-fn">splittable</a> {@link DoFn} behavior.
*/
@RunWith(JUnit4.class)
-public class SplittableDoFnTest {
+public class SplittableDoFnTest implements Serializable {
static class PairStringWithIndexToLength extends DoFn<String, KV<String, Integer>> {
@ProcessElement
@@ -216,22 +217,18 @@ public class SplittableDoFnTest {
p.run();
}
- private static class SDFWithSideInputsAndOutputs extends DoFn<Integer, String> {
+ private static class SDFWithSideInput extends DoFn<Integer, String> {
private final PCollectionView<String> sideInput;
- private final TupleTag<String> additionalOutput;
- private SDFWithSideInputsAndOutputs(
- PCollectionView<String> sideInput, TupleTag<String> additionalOutput) {
+ private SDFWithSideInput(PCollectionView<String> sideInput) {
this.sideInput = sideInput;
- this.additionalOutput = additionalOutput;
}
@ProcessElement
public void process(ProcessContext c, OffsetRangeTracker tracker) {
checkState(tracker.tryClaim(tracker.currentRestriction().getFrom()));
String side = c.sideInput(sideInput);
- c.output("main:" + side + ":" + c.element());
- c.output(additionalOutput, "additional:" + side + ":" + c.element());
+ c.output(side + ":" + c.element());
}
@GetInitialRestriction
@@ -242,27 +239,55 @@ public class SplittableDoFnTest {
@Test
@Category({ValidatesRunner.class, UsesSplittableParDo.class})
- public void testSideInputsAndOutputs() throws Exception {
-
+ public void testSideInput() throws Exception {
PCollectionView<String> sideInput =
p.apply("side input", Create.of("foo")).apply(View.<String>asSingleton());
- TupleTag<String> mainOutputTag = new TupleTag<>("main");
- TupleTag<String> additionalOutputTag = new TupleTag<>("additional");
+
+ PCollection<String> res =
+ p.apply("input", Create.of(0, 1, 2))
+ .apply(ParDo.of(new SDFWithSideInput(sideInput)).withSideInputs(sideInput));
+
+ PAssert.that(res).containsInAnyOrder(Arrays.asList("foo:0", "foo:1", "foo:2"));
+
+ p.run();
+ }
+
+ private static class SDFWithAdditionalOutput extends DoFn<Integer, String> {
+ private final TupleTag<String> additionalOutput;
+
+ private SDFWithAdditionalOutput(TupleTag<String> additionalOutput) {
+ this.additionalOutput = additionalOutput;
+ }
+
+ @ProcessElement
+ public void process(ProcessContext c, OffsetRangeTracker tracker) {
+ checkState(tracker.tryClaim(tracker.currentRestriction().getFrom()));
+ c.output("main:" + c.element());
+ c.output(additionalOutput, "additional:" + c.element());
+ }
+
+ @GetInitialRestriction
+ public OffsetRange getInitialRestriction(Integer value) {
+ return new OffsetRange(0, 1);
+ }
+ }
+
+ @Test
+ @Category({ValidatesRunner.class, UsesSplittableParDo.class})
+ public void testAdditionalOutput() throws Exception {
+ TupleTag<String> mainOutputTag = new TupleTag<String>("main") {};
+ TupleTag<String> additionalOutputTag = new TupleTag<String>("additional") {};
PCollectionTuple res =
p.apply("input", Create.of(0, 1, 2))
.apply(
- ParDo.of(new SDFWithSideInputsAndOutputs(sideInput, additionalOutputTag))
- .withSideInputs(sideInput)
+ ParDo.of(new SDFWithAdditionalOutput(additionalOutputTag))
.withOutputTags(mainOutputTag, TupleTagList.of(additionalOutputTag)));
- res.get(mainOutputTag).setCoder(StringUtf8Coder.of());
- res.get(additionalOutputTag).setCoder(StringUtf8Coder.of());
PAssert.that(res.get(mainOutputTag))
- .containsInAnyOrder(Arrays.asList("main:foo:0", "main:foo:1", "main:foo:2"));
+ .containsInAnyOrder(Arrays.asList("main:0", "main:1", "main:2"));
PAssert.that(res.get(additionalOutputTag))
- .containsInAnyOrder(
- Arrays.asList("additional:foo:0", "additional:foo:1", "additional:foo:2"));
+ .containsInAnyOrder(Arrays.asList("additional:0", "additional:1", "additional:2"));
p.run();
}
[26/50] [abbrv] beam git commit: [BEAM-1994] Remove Flink examples
package
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkSplitStateInternals.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkSplitStateInternals.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkSplitStateInternals.java
new file mode 100644
index 0000000..2bf0bf1
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkSplitStateInternals.java
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming.state;
+
+import com.google.common.collect.Iterators;
+import java.util.Collections;
+import org.apache.beam.runners.core.StateInternals;
+import org.apache.beam.runners.core.StateNamespace;
+import org.apache.beam.runners.core.StateTag;
+import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.transforms.Combine;
+import org.apache.beam.sdk.transforms.CombineWithContext;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
+import org.apache.beam.sdk.util.state.BagState;
+import org.apache.beam.sdk.util.state.CombiningState;
+import org.apache.beam.sdk.util.state.MapState;
+import org.apache.beam.sdk.util.state.ReadableState;
+import org.apache.beam.sdk.util.state.SetState;
+import org.apache.beam.sdk.util.state.State;
+import org.apache.beam.sdk.util.state.StateContext;
+import org.apache.beam.sdk.util.state.StateContexts;
+import org.apache.beam.sdk.util.state.ValueState;
+import org.apache.beam.sdk.util.state.WatermarkHoldState;
+import org.apache.flink.api.common.ExecutionConfig;
+import org.apache.flink.api.common.state.ListStateDescriptor;
+import org.apache.flink.runtime.state.OperatorStateBackend;
+
+/**
+ * {@link StateInternals} that uses a Flink {@link OperatorStateBackend}
+ * to manage the split-distribute state.
+ *
+ * <p>Elements in ListState will be redistributed in round-robin fashion
+ * across operators when restarting with a different parallelism.
+ *
+ * <p>Note:
+ * the key and namespace are ignored;
+ * only {@link BagState} is implemented.
+ */
+public class FlinkSplitStateInternals<K> implements StateInternals<K> {
+
+ private final OperatorStateBackend stateBackend;
+
+ public FlinkSplitStateInternals(OperatorStateBackend stateBackend) {
+ this.stateBackend = stateBackend;
+ }
+
+ @Override
+ public K getKey() {
+ return null;
+ }
+
+ @Override
+ public <T extends State> T state(
+ final StateNamespace namespace,
+ StateTag<? super K, T> address) {
+
+ return state(namespace, address, StateContexts.nullContext());
+ }
+
+ @Override
+ public <T extends State> T state(
+ final StateNamespace namespace,
+ StateTag<? super K, T> address,
+ final StateContext<?> context) {
+
+ return address.bind(new StateTag.StateBinder<K>() {
+
+ @Override
+ public <T> ValueState<T> bindValue(
+ StateTag<? super K, ValueState<T>> address,
+ Coder<T> coder) {
+ throw new UnsupportedOperationException(
+ String.format("%s is not supported", ValueState.class.getSimpleName()));
+ }
+
+ @Override
+ public <T> BagState<T> bindBag(
+ StateTag<? super K, BagState<T>> address,
+ Coder<T> elemCoder) {
+
+ return new FlinkSplitBagState<>(stateBackend, address, namespace, elemCoder);
+ }
+
+ @Override
+ public <T> SetState<T> bindSet(
+ StateTag<? super K, SetState<T>> address,
+ Coder<T> elemCoder) {
+ throw new UnsupportedOperationException(
+ String.format("%s is not supported", SetState.class.getSimpleName()));
+ }
+
+ @Override
+ public <KeyT, ValueT> MapState<KeyT, ValueT> bindMap(
+ StateTag<? super K, MapState<KeyT, ValueT>> spec,
+ Coder<KeyT> mapKeyCoder, Coder<ValueT> mapValueCoder) {
+ throw new UnsupportedOperationException(
+ String.format("%s is not supported", MapState.class.getSimpleName()));
+ }
+
+ @Override
+ public <InputT, AccumT, OutputT>
+ CombiningState<InputT, AccumT, OutputT>
+ bindCombiningValue(
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ Coder<AccumT> accumCoder,
+ Combine.CombineFn<InputT, AccumT, OutputT> combineFn) {
+ throw new UnsupportedOperationException("bindCombiningValue is not supported.");
+ }
+
+ @Override
+ public <InputT, AccumT, OutputT>
+ CombiningState<InputT, AccumT, OutputT> bindKeyedCombiningValue(
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ Coder<AccumT> accumCoder,
+ final Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn) {
+ throw new UnsupportedOperationException("bindKeyedCombiningValue is not supported.");
+
+ }
+
+ @Override
+ public <InputT, AccumT, OutputT>
+ CombiningState<InputT, AccumT, OutputT> bindKeyedCombiningValueWithContext(
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ Coder<AccumT> accumCoder,
+ CombineWithContext.KeyedCombineFnWithContext<
+ ? super K, InputT, AccumT, OutputT> combineFn) {
+ throw new UnsupportedOperationException(
+ "bindKeyedCombiningValueWithContext is not supported.");
+ }
+
+ @Override
+ public <W extends BoundedWindow> WatermarkHoldState<W> bindWatermark(
+ StateTag<? super K, WatermarkHoldState<W>> address,
+ OutputTimeFn<? super W> outputTimeFn) {
+ throw new UnsupportedOperationException(
+ String.format("%s is not supported", CombiningState.class.getSimpleName()));
+ }
+ });
+ }
+
+ private static class FlinkSplitBagState<K, T> implements BagState<T> {
+
+ private final ListStateDescriptor<T> descriptor;
+ private OperatorStateBackend flinkStateBackend;
+ private final StateNamespace namespace;
+ private final StateTag<? super K, BagState<T>> address;
+
+ FlinkSplitBagState(
+ OperatorStateBackend flinkStateBackend,
+ StateTag<? super K, BagState<T>> address,
+ StateNamespace namespace,
+ Coder<T> coder) {
+ this.flinkStateBackend = flinkStateBackend;
+ this.namespace = namespace;
+ this.address = address;
+
+ CoderTypeInformation<T> typeInfo =
+ new CoderTypeInformation<>(coder);
+
+ descriptor = new ListStateDescriptor<>(address.getId(),
+ typeInfo.createSerializer(new ExecutionConfig()));
+ }
+
+ @Override
+ public void add(T input) {
+ try {
+ flinkStateBackend.getOperatorState(descriptor).add(input);
+ } catch (Exception e) {
+ throw new RuntimeException("Error updating state.", e);
+ }
+ }
+
+ @Override
+ public BagState<T> readLater() {
+ return this;
+ }
+
+ @Override
+ public Iterable<T> read() {
+ try {
+ Iterable<T> result = flinkStateBackend.getOperatorState(descriptor).get();
+ return result != null ? result : Collections.<T>emptyList();
+ } catch (Exception e) {
+ throw new RuntimeException("Error updating state.", e);
+ }
+ }
+
+ @Override
+ public ReadableState<Boolean> isEmpty() {
+ return new ReadableState<Boolean>() {
+ @Override
+ public Boolean read() {
+ try {
+ Iterable<T> result = flinkStateBackend.getOperatorState(descriptor).get();
+ // PartitionableListState.get() returns an empty collection when there is no element,
+ // whereas KeyedListState returns null in that case.
+ return result == null || Iterators.size(result.iterator()) == 0;
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+
+ }
+
+ @Override
+ public ReadableState<Boolean> readLater() {
+ return this;
+ }
+ };
+ }
+
+ @Override
+ public void clear() {
+ try {
+ flinkStateBackend.getOperatorState(descriptor).clear();
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ FlinkSplitBagState<?, ?> that = (FlinkSplitBagState<?, ?>) o;
+
+ return namespace.equals(that.namespace) && address.equals(that.address);
+
+ }
+
+ @Override
+ public int hashCode() {
+ int result = namespace.hashCode();
+ result = 31 * result + address.hashCode();
+ return result;
+ }
+ }
+
+}
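The javadoc above notes that these internals ignore key and namespace and expose only BagState, backed by Flink operator ListState. Below is a minimal usage sketch, not from the commit, assuming the runners-core StateTags and StateNamespaces helpers available at this point in the codebase and an OperatorStateBackend supplied by the enclosing Flink operator.
import org.apache.beam.runners.core.StateInternals;
import org.apache.beam.runners.core.StateNamespaces;
import org.apache.beam.runners.core.StateTag;
import org.apache.beam.runners.core.StateTags;
import org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkSplitStateInternals;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.util.state.BagState;
import org.apache.flink.runtime.state.OperatorStateBackend;
class SplitStateSketch {
  // The backend is assumed to be provided by the enclosing Flink operator at runtime.
  static Iterable<String> bufferAndRead(OperatorStateBackend backend, String element) {
    StateInternals<String> internals = new FlinkSplitStateInternals<>(backend);
    StateTag<Object, BagState<String>> bufferTag = StateTags.bag("buffer", StringUtf8Coder.of());
    BagState<String> buffer = internals.state(StateNamespaces.global(), bufferTag);
    buffer.add(element);   // stored in operator ListState, redistributed on rescaling
    return buffer.read();
  }
}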
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkStateInternals.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkStateInternals.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkStateInternals.java
new file mode 100644
index 0000000..4f961e5
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/FlinkStateInternals.java
@@ -0,0 +1,1053 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming.state;
+
+import com.google.common.collect.Lists;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.beam.runners.core.StateInternals;
+import org.apache.beam.runners.core.StateNamespace;
+import org.apache.beam.runners.core.StateTag;
+import org.apache.beam.runners.flink.translation.types.CoderTypeInformation;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.CoderException;
+import org.apache.beam.sdk.coders.InstantCoder;
+import org.apache.beam.sdk.transforms.Combine;
+import org.apache.beam.sdk.transforms.CombineWithContext;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.OutputTimeFn;
+import org.apache.beam.sdk.util.CoderUtils;
+import org.apache.beam.sdk.util.CombineContextFactory;
+import org.apache.beam.sdk.util.state.BagState;
+import org.apache.beam.sdk.util.state.CombiningState;
+import org.apache.beam.sdk.util.state.MapState;
+import org.apache.beam.sdk.util.state.ReadableState;
+import org.apache.beam.sdk.util.state.SetState;
+import org.apache.beam.sdk.util.state.State;
+import org.apache.beam.sdk.util.state.StateContext;
+import org.apache.beam.sdk.util.state.StateContexts;
+import org.apache.beam.sdk.util.state.ValueState;
+import org.apache.beam.sdk.util.state.WatermarkHoldState;
+import org.apache.flink.api.common.state.ListStateDescriptor;
+import org.apache.flink.api.common.state.ValueStateDescriptor;
+import org.apache.flink.api.common.typeutils.base.StringSerializer;
+import org.apache.flink.runtime.state.KeyedStateBackend;
+import org.joda.time.Instant;
+
+/**
+ * {@link StateInternals} that uses a Flink {@link KeyedStateBackend} to manage state.
+ *
+ * <p>Note: In the Flink streaming runner the key is always encoded
+ * using an {@link Coder} and stored in a {@link ByteBuffer}.
+ */
+public class FlinkStateInternals<K> implements StateInternals<K> {
+
+ private final KeyedStateBackend<ByteBuffer> flinkStateBackend;
+ private Coder<K> keyCoder;
+
+ // on recovery, these will not be properly set because we don't
+ // know which watermark hold states there are in the Flink State Backend
+ private final Map<String, Instant> watermarkHolds = new HashMap<>();
+
+ public FlinkStateInternals(KeyedStateBackend<ByteBuffer> flinkStateBackend, Coder<K> keyCoder) {
+ this.flinkStateBackend = flinkStateBackend;
+ this.keyCoder = keyCoder;
+ }
+
+ /**
+ * Returns the minimum over all watermark holds.
+ */
+ public Instant watermarkHold() {
+ long min = Long.MAX_VALUE;
+ for (Instant hold: watermarkHolds.values()) {
+ min = Math.min(min, hold.getMillis());
+ }
+ return new Instant(min);
+ }
+
+ @Override
+ public K getKey() {
+ ByteBuffer keyBytes = flinkStateBackend.getCurrentKey();
+ try {
+ return CoderUtils.decodeFromByteArray(keyCoder, keyBytes.array());
+ } catch (CoderException e) {
+ throw new RuntimeException("Error decoding key.", e);
+ }
+ }
+
+ @Override
+ public <T extends State> T state(
+ final StateNamespace namespace,
+ StateTag<? super K, T> address) {
+
+ return state(namespace, address, StateContexts.nullContext());
+ }
+
+ @Override
+ public <T extends State> T state(
+ final StateNamespace namespace,
+ StateTag<? super K, T> address,
+ final StateContext<?> context) {
+
+ return address.bind(new StateTag.StateBinder<K>() {
+
+ @Override
+ public <T> ValueState<T> bindValue(
+ StateTag<? super K, ValueState<T>> address,
+ Coder<T> coder) {
+
+ return new FlinkValueState<>(flinkStateBackend, address, namespace, coder);
+ }
+
+ @Override
+ public <T> BagState<T> bindBag(
+ StateTag<? super K, BagState<T>> address,
+ Coder<T> elemCoder) {
+
+ return new FlinkBagState<>(flinkStateBackend, address, namespace, elemCoder);
+ }
+
+ @Override
+ public <T> SetState<T> bindSet(
+ StateTag<? super K, SetState<T>> address,
+ Coder<T> elemCoder) {
+ throw new UnsupportedOperationException(
+ String.format("%s is not supported", SetState.class.getSimpleName()));
+ }
+
+ @Override
+ public <KeyT, ValueT> MapState<KeyT, ValueT> bindMap(
+ StateTag<? super K, MapState<KeyT, ValueT>> spec,
+ Coder<KeyT> mapKeyCoder, Coder<ValueT> mapValueCoder) {
+ throw new UnsupportedOperationException(
+ String.format("%s is not supported", MapState.class.getSimpleName()));
+ }
+
+ @Override
+ public <InputT, AccumT, OutputT>
+ CombiningState<InputT, AccumT, OutputT>
+ bindCombiningValue(
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ Coder<AccumT> accumCoder,
+ Combine.CombineFn<InputT, AccumT, OutputT> combineFn) {
+
+ return new FlinkCombiningState<>(
+ flinkStateBackend, address, combineFn, namespace, accumCoder);
+ }
+
+ @Override
+ public <InputT, AccumT, OutputT>
+ CombiningState<InputT, AccumT, OutputT> bindKeyedCombiningValue(
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ Coder<AccumT> accumCoder,
+ final Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn) {
+ return new FlinkKeyedCombiningState<>(
+ flinkStateBackend,
+ address,
+ combineFn,
+ namespace,
+ accumCoder,
+ FlinkStateInternals.this);
+ }
+
+ @Override
+ public <InputT, AccumT, OutputT>
+ CombiningState<InputT, AccumT, OutputT> bindKeyedCombiningValueWithContext(
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ Coder<AccumT> accumCoder,
+ CombineWithContext.KeyedCombineFnWithContext<
+ ? super K, InputT, AccumT, OutputT> combineFn) {
+ return new FlinkCombiningStateWithContext<>(
+ flinkStateBackend,
+ address,
+ combineFn,
+ namespace,
+ accumCoder,
+ FlinkStateInternals.this,
+ CombineContextFactory.createFromStateContext(context));
+ }
+
+ @Override
+ public <W extends BoundedWindow> WatermarkHoldState<W> bindWatermark(
+ StateTag<? super K, WatermarkHoldState<W>> address,
+ OutputTimeFn<? super W> outputTimeFn) {
+
+ return new FlinkWatermarkHoldState<>(
+ flinkStateBackend, FlinkStateInternals.this, address, namespace, outputTimeFn);
+ }
+ });
+ }
+
+ private static class FlinkValueState<K, T> implements ValueState<T> {
+
+ private final StateNamespace namespace;
+ private final StateTag<? super K, ValueState<T>> address;
+ private final ValueStateDescriptor<T> flinkStateDescriptor;
+ private final KeyedStateBackend<ByteBuffer> flinkStateBackend;
+
+ FlinkValueState(
+ KeyedStateBackend<ByteBuffer> flinkStateBackend,
+ StateTag<? super K, ValueState<T>> address,
+ StateNamespace namespace,
+ Coder<T> coder) {
+
+ this.namespace = namespace;
+ this.address = address;
+ this.flinkStateBackend = flinkStateBackend;
+
+ CoderTypeInformation<T> typeInfo = new CoderTypeInformation<>(coder);
+
+ flinkStateDescriptor = new ValueStateDescriptor<>(address.getId(), typeInfo, null);
+ }
+
+ @Override
+ public void write(T input) {
+ try {
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor).update(input);
+ } catch (Exception e) {
+ throw new RuntimeException("Error updating state.", e);
+ }
+ }
+
+ @Override
+ public ValueState<T> readLater() {
+ return this;
+ }
+
+ @Override
+ public T read() {
+ try {
+ return flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor).value();
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+ }
+
+ @Override
+ public void clear() {
+ try {
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor).clear();
+ } catch (Exception e) {
+ throw new RuntimeException("Error clearing state.", e);
+ }
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ FlinkValueState<?, ?> that = (FlinkValueState<?, ?>) o;
+
+ return namespace.equals(that.namespace) && address.equals(that.address);
+
+ }
+
+ @Override
+ public int hashCode() {
+ int result = namespace.hashCode();
+ result = 31 * result + address.hashCode();
+ return result;
+ }
+ }
+
+ private static class FlinkBagState<K, T> implements BagState<T> {
+
+ private final StateNamespace namespace;
+ private final StateTag<? super K, BagState<T>> address;
+ private final ListStateDescriptor<T> flinkStateDescriptor;
+ private final KeyedStateBackend<ByteBuffer> flinkStateBackend;
+
+ FlinkBagState(
+ KeyedStateBackend<ByteBuffer> flinkStateBackend,
+ StateTag<? super K, BagState<T>> address,
+ StateNamespace namespace,
+ Coder<T> coder) {
+
+ this.namespace = namespace;
+ this.address = address;
+ this.flinkStateBackend = flinkStateBackend;
+
+ CoderTypeInformation<T> typeInfo = new CoderTypeInformation<>(coder);
+
+ flinkStateDescriptor = new ListStateDescriptor<>(address.getId(), typeInfo);
+ }
+
+ @Override
+ public void add(T input) {
+ try {
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor).add(input);
+ } catch (Exception e) {
+ throw new RuntimeException("Error adding to bag state.", e);
+ }
+ }
+
+ @Override
+ public BagState<T> readLater() {
+ return this;
+ }
+
+ @Override
+ public Iterable<T> read() {
+ try {
+ Iterable<T> result = flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor).get();
+
+ return result != null ? result : Collections.<T>emptyList();
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+ }
+
+ @Override
+ public ReadableState<Boolean> isEmpty() {
+ return new ReadableState<Boolean>() {
+ @Override
+ public Boolean read() {
+ try {
+ Iterable<T> result = flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor).get();
+ return result == null;
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+
+ }
+
+ @Override
+ public ReadableState<Boolean> readLater() {
+ return this;
+ }
+ };
+ }
+
+ @Override
+ public void clear() {
+ try {
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor).clear();
+ } catch (Exception e) {
+ throw new RuntimeException("Error clearing state.", e);
+ }
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ FlinkBagState<?, ?> that = (FlinkBagState<?, ?>) o;
+
+ return namespace.equals(that.namespace) && address.equals(that.address);
+
+ }
+
+ @Override
+ public int hashCode() {
+ int result = namespace.hashCode();
+ result = 31 * result + address.hashCode();
+ return result;
+ }
+ }
+
+ private static class FlinkCombiningState<K, InputT, AccumT, OutputT>
+ implements CombiningState<InputT, AccumT, OutputT> {
+
+ private final StateNamespace namespace;
+ private final StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address;
+ private final Combine.CombineFn<InputT, AccumT, OutputT> combineFn;
+ private final ValueStateDescriptor<AccumT> flinkStateDescriptor;
+ private final KeyedStateBackend<ByteBuffer> flinkStateBackend;
+
+ FlinkCombiningState(
+ KeyedStateBackend<ByteBuffer> flinkStateBackend,
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ Combine.CombineFn<InputT, AccumT, OutputT> combineFn,
+ StateNamespace namespace,
+ Coder<AccumT> accumCoder) {
+
+ this.namespace = namespace;
+ this.address = address;
+ this.combineFn = combineFn;
+ this.flinkStateBackend = flinkStateBackend;
+
+ CoderTypeInformation<AccumT> typeInfo = new CoderTypeInformation<>(accumCoder);
+
+ flinkStateDescriptor = new ValueStateDescriptor<>(address.getId(), typeInfo, null);
+ }
+
+ @Override
+ public CombiningState<InputT, AccumT, OutputT> readLater() {
+ return this;
+ }
+
+ @Override
+ public void add(InputT value) {
+ try {
+ org.apache.flink.api.common.state.ValueState<AccumT> state =
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor);
+
+ AccumT current = state.value();
+ if (current == null) {
+ current = combineFn.createAccumulator();
+ }
+ current = combineFn.addInput(current, value);
+ state.update(current);
+ } catch (Exception e) {
+ throw new RuntimeException("Error adding to state." , e);
+ }
+ }
+
+ @Override
+ public void addAccum(AccumT accum) {
+ try {
+ org.apache.flink.api.common.state.ValueState<AccumT> state =
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor);
+
+ AccumT current = state.value();
+ if (current == null) {
+ state.update(accum);
+ } else {
+ current = combineFn.mergeAccumulators(Lists.newArrayList(current, accum));
+ state.update(current);
+ }
+ } catch (Exception e) {
+ throw new RuntimeException("Error adding to state.", e);
+ }
+ }
+
+ @Override
+ public AccumT getAccum() {
+ try {
+ return flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor).value();
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+ }
+
+ @Override
+ public AccumT mergeAccumulators(Iterable<AccumT> accumulators) {
+ return combineFn.mergeAccumulators(accumulators);
+ }
+
+ @Override
+ public OutputT read() {
+ try {
+ org.apache.flink.api.common.state.ValueState<AccumT> state =
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor);
+
+ AccumT accum = state.value();
+ if (accum != null) {
+ return combineFn.extractOutput(accum);
+ } else {
+ return combineFn.extractOutput(combineFn.createAccumulator());
+ }
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+ }
+
+ @Override
+ public ReadableState<Boolean> isEmpty() {
+ return new ReadableState<Boolean>() {
+ @Override
+ public Boolean read() {
+ try {
+ return flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor).value() == null;
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+
+ }
+
+ @Override
+ public ReadableState<Boolean> readLater() {
+ return this;
+ }
+ };
+ }
+
+ @Override
+ public void clear() {
+ try {
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor).clear();
+ } catch (Exception e) {
+ throw new RuntimeException("Error clearing state.", e);
+ }
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ FlinkCombiningState<?, ?, ?, ?> that =
+ (FlinkCombiningState<?, ?, ?, ?>) o;
+
+ return namespace.equals(that.namespace) && address.equals(that.address);
+
+ }
+
+ @Override
+ public int hashCode() {
+ int result = namespace.hashCode();
+ result = 31 * result + address.hashCode();
+ return result;
+ }
+ }
+
+ private static class FlinkKeyedCombiningState<K, InputT, AccumT, OutputT>
+ implements CombiningState<InputT, AccumT, OutputT> {
+
+ private final StateNamespace namespace;
+ private final StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address;
+ private final Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn;
+ private final ValueStateDescriptor<AccumT> flinkStateDescriptor;
+ private final KeyedStateBackend<ByteBuffer> flinkStateBackend;
+ private final FlinkStateInternals<K> flinkStateInternals;
+
+ FlinkKeyedCombiningState(
+ KeyedStateBackend<ByteBuffer> flinkStateBackend,
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn,
+ StateNamespace namespace,
+ Coder<AccumT> accumCoder,
+ FlinkStateInternals<K> flinkStateInternals) {
+
+ this.namespace = namespace;
+ this.address = address;
+ this.combineFn = combineFn;
+ this.flinkStateBackend = flinkStateBackend;
+ this.flinkStateInternals = flinkStateInternals;
+
+ CoderTypeInformation<AccumT> typeInfo = new CoderTypeInformation<>(accumCoder);
+
+ flinkStateDescriptor = new ValueStateDescriptor<>(address.getId(), typeInfo, null);
+ }
+
+ @Override
+ public CombiningState<InputT, AccumT, OutputT> readLater() {
+ return this;
+ }
+
+ @Override
+ public void add(InputT value) {
+ try {
+ org.apache.flink.api.common.state.ValueState<AccumT> state =
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor);
+
+ AccumT current = state.value();
+ if (current == null) {
+ current = combineFn.createAccumulator(flinkStateInternals.getKey());
+ }
+ current = combineFn.addInput(flinkStateInternals.getKey(), current, value);
+ state.update(current);
+ } catch (Exception e) {
+ throw new RuntimeException("Error adding to state." , e);
+ }
+ }
+
+ @Override
+ public void addAccum(AccumT accum) {
+ try {
+ org.apache.flink.api.common.state.ValueState<AccumT> state =
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor);
+
+ AccumT current = state.value();
+ if (current == null) {
+ state.update(accum);
+ } else {
+ current = combineFn.mergeAccumulators(
+ flinkStateInternals.getKey(),
+ Lists.newArrayList(current, accum));
+ state.update(current);
+ }
+ } catch (Exception e) {
+ throw new RuntimeException("Error adding to state.", e);
+ }
+ }
+
+ @Override
+ public AccumT getAccum() {
+ try {
+ return flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor).value();
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+ }
+
+ @Override
+ public AccumT mergeAccumulators(Iterable<AccumT> accumulators) {
+ return combineFn.mergeAccumulators(flinkStateInternals.getKey(), accumulators);
+ }
+
+ @Override
+ public OutputT read() {
+ try {
+ org.apache.flink.api.common.state.ValueState<AccumT> state =
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor);
+
+ AccumT accum = state.value();
+ if (accum != null) {
+ return combineFn.extractOutput(flinkStateInternals.getKey(), accum);
+ } else {
+ return combineFn.extractOutput(
+ flinkStateInternals.getKey(),
+ combineFn.createAccumulator(flinkStateInternals.getKey()));
+ }
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+ }
+
+ @Override
+ public ReadableState<Boolean> isEmpty() {
+ return new ReadableState<Boolean>() {
+ @Override
+ public Boolean read() {
+ try {
+ return flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor).value() == null;
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+
+ }
+
+ @Override
+ public ReadableState<Boolean> readLater() {
+ return this;
+ }
+ };
+ }
+
+ @Override
+ public void clear() {
+ try {
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor).clear();
+ } catch (Exception e) {
+ throw new RuntimeException("Error clearing state.", e);
+ }
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ FlinkKeyedCombiningState<?, ?, ?, ?> that =
+ (FlinkKeyedCombiningState<?, ?, ?, ?>) o;
+
+ return namespace.equals(that.namespace) && address.equals(that.address);
+
+ }
+
+ @Override
+ public int hashCode() {
+ int result = namespace.hashCode();
+ result = 31 * result + address.hashCode();
+ return result;
+ }
+ }
+
+ private static class FlinkCombiningStateWithContext<K, InputT, AccumT, OutputT>
+ implements CombiningState<InputT, AccumT, OutputT> {
+
+ private final StateNamespace namespace;
+ private final StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address;
+ private final CombineWithContext.KeyedCombineFnWithContext<
+ ? super K, InputT, AccumT, OutputT> combineFn;
+ private final ValueStateDescriptor<AccumT> flinkStateDescriptor;
+ private final KeyedStateBackend<ByteBuffer> flinkStateBackend;
+ private final FlinkStateInternals<K> flinkStateInternals;
+ private final CombineWithContext.Context context;
+
+ FlinkCombiningStateWithContext(
+ KeyedStateBackend<ByteBuffer> flinkStateBackend,
+ StateTag<? super K, CombiningState<InputT, AccumT, OutputT>> address,
+ CombineWithContext.KeyedCombineFnWithContext<
+ ? super K, InputT, AccumT, OutputT> combineFn,
+ StateNamespace namespace,
+ Coder<AccumT> accumCoder,
+ FlinkStateInternals<K> flinkStateInternals,
+ CombineWithContext.Context context) {
+
+ this.namespace = namespace;
+ this.address = address;
+ this.combineFn = combineFn;
+ this.flinkStateBackend = flinkStateBackend;
+ this.flinkStateInternals = flinkStateInternals;
+ this.context = context;
+
+ CoderTypeInformation<AccumT> typeInfo = new CoderTypeInformation<>(accumCoder);
+
+ flinkStateDescriptor = new ValueStateDescriptor<>(address.getId(), typeInfo, null);
+ }
+
+ @Override
+ public CombiningState<InputT, AccumT, OutputT> readLater() {
+ return this;
+ }
+
+ @Override
+ public void add(InputT value) {
+ try {
+ org.apache.flink.api.common.state.ValueState<AccumT> state =
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor);
+
+ AccumT current = state.value();
+ if (current == null) {
+ current = combineFn.createAccumulator(flinkStateInternals.getKey(), context);
+ }
+ current = combineFn.addInput(flinkStateInternals.getKey(), current, value, context);
+ state.update(current);
+ } catch (Exception e) {
+ throw new RuntimeException("Error adding to state." , e);
+ }
+ }
+
+ @Override
+ public void addAccum(AccumT accum) {
+ try {
+ org.apache.flink.api.common.state.ValueState<AccumT> state =
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor);
+
+ AccumT current = state.value();
+ if (current == null) {
+ state.update(accum);
+ } else {
+ current = combineFn.mergeAccumulators(
+ flinkStateInternals.getKey(),
+ Lists.newArrayList(current, accum),
+ context);
+ state.update(current);
+ }
+ } catch (Exception e) {
+ throw new RuntimeException("Error adding to state.", e);
+ }
+ }
+
+ @Override
+ public AccumT getAccum() {
+ try {
+ return flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor).value();
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+ }
+
+ @Override
+ public AccumT mergeAccumulators(Iterable<AccumT> accumulators) {
+ return combineFn.mergeAccumulators(flinkStateInternals.getKey(), accumulators, context);
+ }
+
+ @Override
+ public OutputT read() {
+ try {
+ org.apache.flink.api.common.state.ValueState<AccumT> state =
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor);
+
+ AccumT accum = state.value();
+ if (accum == null) {
+ accum = combineFn.createAccumulator(flinkStateInternals.getKey(), context);
+ }
+ return combineFn.extractOutput(flinkStateInternals.getKey(), accum, context);
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+ }
+
+ @Override
+ public ReadableState<Boolean> isEmpty() {
+ return new ReadableState<Boolean>() {
+ @Override
+ public Boolean read() {
+ try {
+ return flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor).value() == null;
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+
+ }
+
+ @Override
+ public ReadableState<Boolean> readLater() {
+ return this;
+ }
+ };
+ }
+
+ @Override
+ public void clear() {
+ try {
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor).clear();
+ } catch (Exception e) {
+ throw new RuntimeException("Error clearing state.", e);
+ }
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ FlinkCombiningStateWithContext<?, ?, ?, ?> that =
+ (FlinkCombiningStateWithContext<?, ?, ?, ?>) o;
+
+ return namespace.equals(that.namespace) && address.equals(that.address);
+
+ }
+
+ @Override
+ public int hashCode() {
+ int result = namespace.hashCode();
+ result = 31 * result + address.hashCode();
+ return result;
+ }
+ }
+
+ private static class FlinkWatermarkHoldState<K, W extends BoundedWindow>
+ implements WatermarkHoldState<W> {
+ private final StateTag<? super K, WatermarkHoldState<W>> address;
+ private final OutputTimeFn<? super W> outputTimeFn;
+ private final StateNamespace namespace;
+ private final KeyedStateBackend<ByteBuffer> flinkStateBackend;
+ private final FlinkStateInternals<K> flinkStateInternals;
+ private final ValueStateDescriptor<Instant> flinkStateDescriptor;
+
+ public FlinkWatermarkHoldState(
+ KeyedStateBackend<ByteBuffer> flinkStateBackend,
+ FlinkStateInternals<K> flinkStateInternals,
+ StateTag<? super K, WatermarkHoldState<W>> address,
+ StateNamespace namespace,
+ OutputTimeFn<? super W> outputTimeFn) {
+ this.address = address;
+ this.outputTimeFn = outputTimeFn;
+ this.namespace = namespace;
+ this.flinkStateBackend = flinkStateBackend;
+ this.flinkStateInternals = flinkStateInternals;
+
+ CoderTypeInformation<Instant> typeInfo = new CoderTypeInformation<>(InstantCoder.of());
+ flinkStateDescriptor = new ValueStateDescriptor<>(address.getId(), typeInfo, null);
+ }
+
+ @Override
+ public OutputTimeFn<? super W> getOutputTimeFn() {
+ return outputTimeFn;
+ }
+
+ @Override
+ public WatermarkHoldState<W> readLater() {
+ return this;
+ }
+
+ @Override
+ public ReadableState<Boolean> isEmpty() {
+ return new ReadableState<Boolean>() {
+ @Override
+ public Boolean read() {
+ try {
+ return flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor).value() == null;
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+ }
+
+ @Override
+ public ReadableState<Boolean> readLater() {
+ return this;
+ }
+ };
+
+ }
+
+ @Override
+ public void add(Instant value) {
+ try {
+ org.apache.flink.api.common.state.ValueState<Instant> state =
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor);
+
+ Instant current = state.value();
+ if (current == null) {
+ state.update(value);
+ flinkStateInternals.watermarkHolds.put(namespace.stringKey(), value);
+ } else {
+ Instant combined = outputTimeFn.combine(current, value);
+ state.update(combined);
+ flinkStateInternals.watermarkHolds.put(namespace.stringKey(), combined);
+ }
+ } catch (Exception e) {
+ throw new RuntimeException("Error updating state.", e);
+ }
+ }
+
+ @Override
+ public Instant read() {
+ try {
+ org.apache.flink.api.common.state.ValueState<Instant> state =
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor);
+ return state.value();
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+ }
+
+ @Override
+ public void clear() {
+ flinkStateInternals.watermarkHolds.remove(namespace.stringKey());
+ try {
+ org.apache.flink.api.common.state.ValueState<Instant> state =
+ flinkStateBackend.getPartitionedState(
+ namespace.stringKey(),
+ StringSerializer.INSTANCE,
+ flinkStateDescriptor);
+ state.clear();
+ } catch (Exception e) {
+ throw new RuntimeException("Error reading state.", e);
+ }
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ FlinkWatermarkHoldState<?, ?> that = (FlinkWatermarkHoldState<?, ?>) o;
+
+ if (!address.equals(that.address)) {
+ return false;
+ }
+ if (!outputTimeFn.equals(that.outputTimeFn)) {
+ return false;
+ }
+ return namespace.equals(that.namespace);
+
+ }
+
+ @Override
+ public int hashCode() {
+ int result = address.hashCode();
+ result = 31 * result + outputTimeFn.hashCode();
+ result = 31 * result + namespace.hashCode();
+ return result;
+ }
+ }
+}
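As the class comment above notes, the streaming wrappers hand FlinkStateInternals its current key already encoded into a ByteBuffer. The following is a minimal, self-contained sketch of that encode/decode convention; the StringUtf8Coder and the "user-42" key are illustrative assumptions, and the decode step simply mirrors what FlinkStateInternals#getKey() does:

    import java.nio.ByteBuffer;
    import org.apache.beam.sdk.coders.StringUtf8Coder;
    import org.apache.beam.sdk.util.CoderUtils;

    public class KeyEncodingSketch {
      public static void main(String[] args) throws Exception {
        StringUtf8Coder keyCoder = StringUtf8Coder.of();
        // Encode the key into the ByteBuffer form that the Flink keyed state backend is keyed by.
        ByteBuffer encodedKey =
            ByteBuffer.wrap(CoderUtils.encodeToByteArray(keyCoder, "user-42"));
        // Decode it again, mirroring FlinkStateInternals#getKey().
        String decoded = CoderUtils.decodeFromByteArray(keyCoder, encodedKey.array());
        System.out.println(decoded); // prints: user-42
      }
    }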
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/KeyGroupCheckpointedOperator.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/KeyGroupCheckpointedOperator.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/KeyGroupCheckpointedOperator.java
new file mode 100644
index 0000000..b38a520
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/KeyGroupCheckpointedOperator.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming.state;
+
+import java.io.DataOutputStream;
+
+/**
+ * This interface is used to checkpoint per-key-group state.
+ */
+public interface KeyGroupCheckpointedOperator extends KeyGroupRestoringOperator {
+ /**
+ * Snapshots the state for a given {@code keyGroupIndex}.
+ *
+ * <p>{@code AbstractStreamOperator} calls this hook from
+ * {@code AbstractStreamOperator.snapshotState()} while iterating over the key groups.
+ *
+ * @param keyGroupIndex the id of the key group to be put in the snapshot.
+ * @param out the stream to write to.
+ */
+ void snapshotKeyGroupState(int keyGroupIndex, DataOutputStream out) throws Exception;
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/KeyGroupRestoringOperator.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/KeyGroupRestoringOperator.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/KeyGroupRestoringOperator.java
new file mode 100644
index 0000000..2bdfc6e
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/KeyGroupRestoringOperator.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming.state;
+
+import java.io.DataInputStream;
+
+/**
+ * This interface is used to restore key-groups state.
+ */
+public interface KeyGroupRestoringOperator {
+ /**
+ * Restores the state for a given {@code keyGroupIndex}.
+ *
+ * @param keyGroupIndex the id of the key group whose state is being restored.
+ * @param in the stream to read from.
+ */
+ void restoreKeyGroupState(int keyGroupIndex, DataInputStream in) throws Exception;
+}
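Taken together, the two interfaces above let an operator take part in Flink's per-key-group checkpointing: snapshotKeyGroupState writes one key group's slice of state, and restoreKeyGroupState reads that slice back. A hypothetical sketch follows; the per-key-group event counter is an illustrative assumption rather than state the runner actually keeps, and the class is assumed to live in the same package as the interfaces:

    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.util.HashMap;
    import java.util.Map;

    // Hypothetical operator state: one event counter per key group.
    public class CountingKeyGroupState implements KeyGroupCheckpointedOperator {

      private final Map<Integer, Long> countsPerKeyGroup = new HashMap<>();

      void increment(int keyGroupIndex) {
        Long current = countsPerKeyGroup.get(keyGroupIndex);
        countsPerKeyGroup.put(keyGroupIndex, current == null ? 1L : current + 1L);
      }

      @Override
      public void snapshotKeyGroupState(int keyGroupIndex, DataOutputStream out) throws Exception {
        // Write only this key group's slice; absent key groups snapshot as zero.
        Long count = countsPerKeyGroup.get(keyGroupIndex);
        out.writeLong(count == null ? 0L : count);
      }

      @Override
      public void restoreKeyGroupState(int keyGroupIndex, DataInputStream in) throws Exception {
        // Read back exactly what snapshotKeyGroupState wrote for this key group.
        countsPerKeyGroup.put(keyGroupIndex, in.readLong());
      }
    }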
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/package-info.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/package-info.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/package-info.java
new file mode 100644
index 0000000..0004e9e
--- /dev/null
+++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/state/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Internal state implementation of the Beam runner for Apache Flink.
+ */
+package org.apache.beam.runners.flink.translation.wrappers.streaming.state;
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/main/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/runners/flink/src/main/resources/log4j.properties b/runners/flink/src/main/resources/log4j.properties
new file mode 100644
index 0000000..4b6a708
--- /dev/null
+++ b/runners/flink/src/main/resources/log4j.properties
@@ -0,0 +1,23 @@
+################################################################################
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+log4j.rootLogger=OFF,console
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/java/org/apache/beam/runners/flink/EncodedValueComparatorTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/EncodedValueComparatorTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/EncodedValueComparatorTest.java
new file mode 100644
index 0000000..10d6d9d
--- /dev/null
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/EncodedValueComparatorTest.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import org.apache.beam.runners.flink.translation.types.EncodedValueComparator;
+import org.apache.beam.runners.flink.translation.types.EncodedValueTypeInformation;
+import org.apache.beam.sdk.coders.CoderException;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.util.CoderUtils;
+import org.apache.flink.api.common.ExecutionConfig;
+import org.apache.flink.api.common.typeutils.ComparatorTestBase;
+import org.apache.flink.api.common.typeutils.TypeComparator;
+import org.apache.flink.api.common.typeutils.TypeSerializer;
+import org.junit.Assert;
+
+/**
+ * Test for {@link EncodedValueComparator}.
+ */
+public class EncodedValueComparatorTest extends ComparatorTestBase<byte[]> {
+
+ @Override
+ protected TypeComparator<byte[]> createComparator(boolean ascending) {
+ return new EncodedValueTypeInformation().createComparator(ascending, new ExecutionConfig());
+ }
+
+ @Override
+ protected TypeSerializer<byte[]> createSerializer() {
+ return new EncodedValueTypeInformation().createSerializer(new ExecutionConfig());
+ }
+
+ @Override
+ protected void deepEquals(String message, byte[] should, byte[] is) {
+ Assert.assertArrayEquals(message, should, is);
+ }
+
+ @Override
+ protected byte[][] getSortedTestData() {
+ StringUtf8Coder coder = StringUtf8Coder.of();
+
+ try {
+ return new byte[][]{
+ CoderUtils.encodeToByteArray(coder, ""),
+ CoderUtils.encodeToByteArray(coder, "Lorem Ipsum Dolor Omit Longer"),
+ CoderUtils.encodeToByteArray(coder, "aaaa"),
+ CoderUtils.encodeToByteArray(coder, "abcd"),
+ CoderUtils.encodeToByteArray(coder, "abce"),
+ CoderUtils.encodeToByteArray(coder, "abdd"),
+ CoderUtils.encodeToByteArray(coder, "accd"),
+ CoderUtils.encodeToByteArray(coder, "bbcd")
+ };
+ } catch (CoderException e) {
+ throw new RuntimeException("Could not encode values.", e);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkRunnerRegistrarTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkRunnerRegistrarTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkRunnerRegistrarTest.java
new file mode 100644
index 0000000..d9d174c
--- /dev/null
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkRunnerRegistrarTest.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.beam.runners.flink;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.junit.Test;
+
+/**
+ * Tests the proper registration of the Flink runner.
+ */
+public class FlinkRunnerRegistrarTest {
+
+ @Test
+ public void testFullName() {
+ String[] args =
+ new String[] {String.format("--runner=%s", FlinkRunner.class.getName())};
+ PipelineOptions opts = PipelineOptionsFactory.fromArgs(args).create();
+ assertEquals(FlinkRunner.class, opts.getRunner());
+ }
+
+ @Test
+ public void testClassName() {
+ String[] args =
+ new String[] {String.format("--runner=%s", FlinkRunner.class.getSimpleName())};
+ PipelineOptions opts = PipelineOptionsFactory.fromArgs(args).create();
+ assertEquals(FlinkRunner.class, opts.getRunner());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkTestPipeline.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkTestPipeline.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkTestPipeline.java
new file mode 100644
index 0000000..d6240c4
--- /dev/null
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkTestPipeline.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.PipelineResult;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.runners.PipelineRunner;
+
+/**
+ * {@link org.apache.beam.sdk.Pipeline} for testing Beam pipelines on the
+ * {@link FlinkRunner}.
+ */
+public class FlinkTestPipeline extends Pipeline {
+
+ /**
+ * Creates and returns a new test pipeline for batch execution.
+ *
+ * <p>Use {@link org.apache.beam.sdk.testing.PAssert} to add tests, then call
+ * {@link Pipeline#run} to execute the pipeline and check the tests.
+ */
+ public static FlinkTestPipeline createForBatch() {
+ return create(false);
+ }
+
+ /**
+ * Creates and returns a new test pipeline for streaming execution.
+ *
+ * <p>Use {@link org.apache.beam.sdk.testing.PAssert} to add tests, then call
+ * {@link Pipeline#run} to execute the pipeline and check the tests.
+ *
+ * @return The Test Pipeline
+ */
+ public static FlinkTestPipeline createForStreaming() {
+ return create(true);
+ }
+
+ /**
+ * Creates and returns a new test pipeline for streaming or batch execution.
+ *
+ * <p>Use {@link org.apache.beam.sdk.testing.PAssert} to add tests, then call
+ * {@link Pipeline#run} to execute the pipeline and check the tests.
+ *
+ * @param streaming <code>True</code> for streaming mode, <code>False</code> for batch.
+ * @return The Test Pipeline.
+ */
+ private static FlinkTestPipeline create(boolean streaming) {
+ TestFlinkRunner flinkRunner = TestFlinkRunner.create(streaming);
+ return new FlinkTestPipeline(flinkRunner, flinkRunner.getPipelineOptions());
+ }
+
+ private FlinkTestPipeline(PipelineRunner<? extends PipelineResult> runner,
+ PipelineOptions options) {
+ super(runner, options);
+ }
+}
+
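The javadoc above points at PAssert for assertions; the following is a minimal usage sketch of the batch variant, assuming the sketch lives where FlinkTestPipeline is importable and using arbitrary element values:

    import org.apache.beam.sdk.testing.PAssert;
    import org.apache.beam.sdk.transforms.Create;
    import org.apache.beam.sdk.values.PCollection;

    public class FlinkTestPipelineSketch {
      public static void main(String[] args) {
        // Build a test pipeline that executes on the TestFlinkRunner in batch mode.
        FlinkTestPipeline p = FlinkTestPipeline.createForBatch();
        PCollection<String> words = p.apply(Create.of("hello", "beam"));
        // PAssert adds assertion transforms that are checked when the pipeline runs.
        PAssert.that(words).containsInAnyOrder("hello", "beam");
        p.run();
      }
    }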
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/java/org/apache/beam/runners/flink/PipelineOptionsTest.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/PipelineOptionsTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/PipelineOptionsTest.java
new file mode 100644
index 0000000..06187f6
--- /dev/null
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/PipelineOptionsTest.java
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.util.Collections;
+import java.util.HashMap;
+import org.apache.beam.runners.flink.translation.utils.SerializedPipelineOptions;
+import org.apache.beam.runners.flink.translation.wrappers.streaming.DoFnOperator;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.options.Default;
+import org.apache.beam.sdk.options.Description;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
+import org.apache.beam.sdk.transforms.windowing.PaneInfo;
+import org.apache.beam.sdk.util.WindowedValue;
+import org.apache.beam.sdk.util.WindowingStrategy;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.beam.sdk.values.TupleTag;
+import org.apache.commons.lang3.SerializationUtils;
+import org.apache.flink.api.common.ExecutionConfig;
+import org.apache.flink.api.common.typeinfo.TypeHint;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.runtime.state.memory.MemoryStateBackend;
+import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
+import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness;
+import org.joda.time.Instant;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Tests for serialization and deserialization of {@link PipelineOptions} in {@link DoFnOperator}.
+ */
+public class PipelineOptionsTest {
+
+ /**
+ * Pipeline options.
+ */
+ public interface MyOptions extends FlinkPipelineOptions {
+ @Description("Bla bla bla")
+ @Default.String("Hello")
+ String getTestOption();
+ void setTestOption(String value);
+ }
+
+ private static MyOptions options;
+ private static SerializedPipelineOptions serializedOptions;
+
+ private static final String[] args = new String[]{"--testOption=nothing"};
+
+ @BeforeClass
+ public static void beforeTest() {
+ options = PipelineOptionsFactory.fromArgs(args).as(MyOptions.class);
+ serializedOptions = new SerializedPipelineOptions(options);
+ }
+
+ @Test
+ public void testDeserialization() {
+ MyOptions deserializedOptions = serializedOptions.getPipelineOptions().as(MyOptions.class);
+ assertEquals("nothing", deserializedOptions.getTestOption());
+ }
+
+ @Test
+ public void testIgnoredFieldSerialization() {
+ FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
+ options.setStateBackend(new MemoryStateBackend());
+
+ FlinkPipelineOptions deserialized =
+ new SerializedPipelineOptions(options).getPipelineOptions().as(FlinkPipelineOptions.class);
+
+ assertNull(deserialized.getStateBackend());
+ }
+
+ @Test
+ public void testCaching() {
+ PipelineOptions deserializedOptions =
+ serializedOptions.getPipelineOptions().as(PipelineOptions.class);
+
+ assertNotNull(deserializedOptions);
+ assertTrue(deserializedOptions == serializedOptions.getPipelineOptions());
+ assertTrue(deserializedOptions == serializedOptions.getPipelineOptions());
+ assertTrue(deserializedOptions == serializedOptions.getPipelineOptions());
+ }
+
+ @Test(expected = Exception.class)
+ public void testNonNull() {
+ new SerializedPipelineOptions(null);
+ }
+
+ @Test(expected = Exception.class)
+ public void parDoBaseClassPipelineOptionsNullTest() {
+ DoFnOperator<String, String, String> doFnOperator = new DoFnOperator<>(
+ new TestDoFn(),
+ WindowedValue.getValueOnlyCoder(StringUtf8Coder.of()),
+ new TupleTag<String>("main-output"),
+ Collections.<TupleTag<?>>emptyList(),
+ new DoFnOperator.DefaultOutputManagerFactory<String>(),
+ WindowingStrategy.globalDefault(),
+ new HashMap<Integer, PCollectionView<?>>(),
+ Collections.<PCollectionView<?>>emptyList(),
+ null,
+ null);
+
+ }
+
+ /**
+ * Tests that PipelineOptions are present after serialization.
+ */
+ @Test
+ public void parDoBaseClassPipelineOptionsSerializationTest() throws Exception {
+
+ DoFnOperator<String, String, String> doFnOperator = new DoFnOperator<>(
+ new TestDoFn(),
+ WindowedValue.getValueOnlyCoder(StringUtf8Coder.of()),
+ new TupleTag<String>("main-output"),
+ Collections.<TupleTag<?>>emptyList(),
+ new DoFnOperator.DefaultOutputManagerFactory<String>(),
+ WindowingStrategy.globalDefault(),
+ new HashMap<Integer, PCollectionView<?>>(),
+ Collections.<PCollectionView<?>>emptyList(),
+ options,
+ null);
+
+ final byte[] serialized = SerializationUtils.serialize(doFnOperator);
+
+ @SuppressWarnings("unchecked")
+ DoFnOperator<Object, Object, Object> deserialized =
+ (DoFnOperator<Object, Object, Object>) SerializationUtils.deserialize(serialized);
+
+ TypeInformation<WindowedValue<Object>> typeInformation = TypeInformation.of(
+ new TypeHint<WindowedValue<Object>>() {});
+
+ OneInputStreamOperatorTestHarness<WindowedValue<Object>, Object> testHarness =
+ new OneInputStreamOperatorTestHarness<>(deserialized,
+ typeInformation.createSerializer(new ExecutionConfig()));
+
+ testHarness.open();
+
+ // execute once to access options
+ testHarness.processElement(new StreamRecord<>(
+ WindowedValue.of(
+ new Object(),
+ Instant.now(),
+ GlobalWindow.INSTANCE,
+ PaneInfo.NO_FIRING)));
+
+ testHarness.close();
+
+ }
+
+
+ private static class TestDoFn extends DoFn<String, String> {
+
+ @ProcessElement
+ public void processElement(ProcessContext c) throws Exception {
+ Assert.assertNotNull(c.getPipelineOptions());
+ Assert.assertEquals(
+ options.getTestOption(),
+ c.getPipelineOptions().as(MyOptions.class).getTestOption());
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/java/org/apache/beam/runners/flink/ReadSourceITCase.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/ReadSourceITCase.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/ReadSourceITCase.java
new file mode 100644
index 0000000..44c9017
--- /dev/null
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/ReadSourceITCase.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import com.google.common.base.Joiner;
+import java.io.File;
+import java.net.URI;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.io.CountingInput;
+import org.apache.beam.sdk.io.TextIO;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.flink.test.util.JavaProgramTestBase;
+
+/**
+ * Reads from a bounded source in batch execution.
+ */
+public class ReadSourceITCase extends JavaProgramTestBase {
+
+ protected String resultPath;
+
+ public ReadSourceITCase(){
+ }
+
+ private static final String[] EXPECTED_RESULT = new String[] {
+ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"};
+
+ @Override
+ protected void preSubmit() throws Exception {
+ resultPath = getTempDirPath("result");
+
+ // need to create the dir, otherwise Beam sinks don't
+ // work for these tests
+
+ if (!new File(new URI(resultPath)).mkdirs()) {
+ throw new RuntimeException("Could not create output dir.");
+ }
+ }
+
+ @Override
+ protected void postSubmit() throws Exception {
+ compareResultsByLinesInMemory(Joiner.on('\n').join(EXPECTED_RESULT), resultPath);
+ }
+
+ @Override
+ protected void testProgram() throws Exception {
+ runProgram(resultPath);
+ }
+
+ private static void runProgram(String resultPath) throws Exception {
+
+ Pipeline p = FlinkTestPipeline.createForBatch();
+
+ PCollection<String> result = p
+ .apply(CountingInput.upTo(10))
+ .apply(ParDo.of(new DoFn<Long, String>() {
+ @ProcessElement
+ public void processElement(ProcessContext c) throws Exception {
+ c.output(c.element().toString());
+ }
+ }));
+
+ result.apply(TextIO.Write.to(new URI(resultPath).getPath() + "/part"));
+
+ p.run();
+ }
+}
+
+
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/java/org/apache/beam/runners/flink/ReadSourceStreamingITCase.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/ReadSourceStreamingITCase.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/ReadSourceStreamingITCase.java
new file mode 100644
index 0000000..79b7882
--- /dev/null
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/ReadSourceStreamingITCase.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.flink;
+
+import com.google.common.base.Joiner;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.io.CountingInput;
+import org.apache.beam.sdk.io.TextIO;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.flink.streaming.util.StreamingProgramTestBase;
+
+/**
+ * Reads from a bounded source in streaming.
+ */
+public class ReadSourceStreamingITCase extends StreamingProgramTestBase {
+
+ protected String resultPath;
+
+ public ReadSourceStreamingITCase(){
+ }
+
+ private static final String[] EXPECTED_RESULT = new String[] {
+ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"};
+
+ @Override
+ protected void preSubmit() throws Exception {
+ resultPath = getTempDirPath("result");
+ }
+
+ @Override
+ protected void postSubmit() throws Exception {
+ compareResultsByLinesInMemory(Joiner.on('\n').join(EXPECTED_RESULT), resultPath);
+ }
+
+ @Override
+ protected void testProgram() throws Exception {
+ runProgram(resultPath);
+ }
+
+ private static void runProgram(String resultPath) {
+
+ Pipeline p = FlinkTestPipeline.createForStreaming();
+
+ p
+ .apply(CountingInput.upTo(10))
+ .apply(ParDo.of(new DoFn<Long, String>() {
+ @ProcessElement
+ public void processElement(ProcessContext c) throws Exception {
+ c.output(c.element().toString());
+ }
+ }))
+ .apply(TextIO.Write.to(resultPath));
+
+ p.run();
+ }
+}
+
+
http://git-wip-us.apache.org/repos/asf/beam/blob/cdd2544b/runners/flink/src/test/java/org/apache/beam/runners/flink/WriteSinkITCase.java
----------------------------------------------------------------------
diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/WriteSinkITCase.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/WriteSinkITCase.java
new file mode 100644
index 0000000..38b790e
--- /dev/null
+++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/WriteSinkITCase.java
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.beam.runners.flink;
+
+import static org.junit.Assert.assertNotNull;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.ImmutableList;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.net.URI;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.io.Sink;
+import org.apache.beam.sdk.io.Write;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.transforms.Create;
+import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
+import org.apache.beam.sdk.transforms.windowing.PaneInfo;
+import org.apache.flink.core.fs.FileSystem;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.test.util.JavaProgramTestBase;
+
+/**
+ * Tests the translation of custom Write sinks.
+ */
+public class WriteSinkITCase extends JavaProgramTestBase {
+
+ protected String resultPath;
+
+ public WriteSinkITCase(){
+ }
+
+ static final String[] EXPECTED_RESULT = new String[] {
+ "Joe red 3", "Mary blue 4", "Max yellow 23"};
+
+ @Override
+ protected void preSubmit() throws Exception {
+ resultPath = getTempDirPath("result-" + System.nanoTime());
+ }
+
+ @Override
+ protected void postSubmit() throws Exception {
+ compareResultsByLinesInMemory(Joiner.on('\n').join(EXPECTED_RESULT), resultPath);
+ }
+
+ @Override
+ protected void testProgram() throws Exception {
+ runProgram(resultPath);
+ }
+
+ @Override
+ public void stopCluster() throws Exception {
+ try {
+ super.stopCluster();
+ } catch (final IOException ioe) {
+ if (ioe.getMessage().startsWith("Unable to delete file")) {
+ // this is fine for the test itself; the OS is just interfering during the cleanup phase
+ }
+ }
+ }
+
+ private static void runProgram(String resultPath) {
+ Pipeline p = FlinkTestPipeline.createForBatch();
+
+ p.apply(Create.of(ImmutableList.copyOf(EXPECTED_RESULT))).setCoder(StringUtf8Coder.of())
+ .apply("CustomSink", Write.to(new MyCustomSink(resultPath)));
+
+ p.run();
+ }
+
+ /**
+ * Simple custom sink which writes to a file.
+ */
+ private static class MyCustomSink extends Sink<String> {
+
+ private final String resultPath;
+
+ public MyCustomSink(String resultPath) {
+ this.resultPath = resultPath;
+ }
+
+ @Override
+ public void validate(PipelineOptions options) {
+ assertNotNull(options);
+ }
+
+ @Override
+ public WriteOperation<String, ?> createWriteOperation(PipelineOptions options) {
+ return new MyWriteOperation();
+ }
+
+ private class MyWriteOperation extends WriteOperation<String, String> {
+
+ @Override
+ public Coder<String> getWriterResultCoder() {
+ return StringUtf8Coder.of();
+ }
+
+ @Override
+ public void initialize(PipelineOptions options) throws Exception {
+
+ }
+
+ @Override
+ public void setWindowedWrites(boolean windowedWrites) {
+
+ }
+
+ @Override
+ public void finalize(Iterable<String> writerResults, PipelineOptions options)
+ throws Exception {
+
+ }
+
+ @Override
+ public Writer<String, String> createWriter(PipelineOptions options) throws Exception {
+ return new MyWriter();
+ }
+
+ @Override
+ public Sink<String> getSink() {
+ return MyCustomSink.this;
+ }
+
+ /**
+ * Simple Writer which writes to a file.
+ */
+ private class MyWriter extends Writer<String, String> {
+
+ private PrintWriter internalWriter;
+
+ @Override
+ public final void openWindowed(String uId,
+ BoundedWindow window,
+ PaneInfo paneInfo,
+ int shard,
+ int numShards) throws Exception {
+ throw new UnsupportedOperationException("Windowed writes not supported.");
+ }
+
+ @Override
+ public final void openUnwindowed(String uId, int shard, int numShards) throws Exception {
+ Path path = new Path(resultPath + "/" + uId);
+ FileSystem.get(new URI("file:///")).create(path, false);
+ internalWriter = new PrintWriter(new File(path.toUri()));
+ }
+
+ @Override
+ public void cleanup() throws Exception {
+
+ }
+
+ @Override
+ public void write(String value) throws Exception {
+ internalWriter.println(value);
+ }
+
+ @Override
+ public String close() throws Exception {
+ internalWriter.close();
+ return resultPath;
+ }
+
+ @Override
+ public WriteOperation<String, String> getWriteOperation() {
+ return MyWriteOperation.this;
+ }
+ }
+ }
+ }
+
+}
+