You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by dh...@apache.org on 2016/03/24 03:47:25 UTC
[01/67] [partial] incubator-beam git commit: Directory reorganization
Repository: incubator-beam
Updated Branches:
refs/heads/master 9f8dd182c -> 257a7a6be
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/CoderRegistryTest.java
----------------------------------------------------------------------
diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/CoderRegistryTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/CoderRegistryTest.java
deleted file mode 100644
index 2f350b2..0000000
--- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/CoderRegistryTest.java
+++ /dev/null
@@ -1,521 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import static org.hamcrest.Matchers.allOf;
-import static org.hamcrest.Matchers.containsString;
-import static org.junit.Assert.assertEquals;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry.IncompatibleCoderException;
-import com.google.cloud.dataflow.sdk.coders.Proto2CoderTestMessages.MessageA;
-import com.google.cloud.dataflow.sdk.coders.protobuf.ProtoCoder;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObserver;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-import com.google.common.collect.ImmutableList;
-import com.google.protobuf.Duration;
-
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.ExpectedException;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.Serializable;
-import java.lang.reflect.Type;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-/**
- * Tests for CoderRegistry.
- */
-@RunWith(JUnit4.class)
-public class CoderRegistryTest {
-
- @Rule
- public ExpectedException thrown = ExpectedException.none();
-
- public static CoderRegistry getStandardRegistry() {
- CoderRegistry registry = new CoderRegistry();
- registry.registerStandardCoders();
- return registry;
- }
-
- private static class SerializableClass implements Serializable {
- }
-
- private static class NotSerializableClass { }
-
- @Test
- public void testSerializableFallbackCoderProvider() throws Exception {
- CoderRegistry registry = getStandardRegistry();
- registry.setFallbackCoderProvider(SerializableCoder.PROVIDER);
- Coder<?> serializableCoder = registry.getDefaultCoder(SerializableClass.class);
-
- assertEquals(serializableCoder, SerializableCoder.of(SerializableClass.class));
- }
-
- @Test
- public void testProtoCoderFallbackCoderProvider() throws Exception {
- CoderRegistry registry = getStandardRegistry();
-
- // MessageA is a Protocol Buffers test message with syntax 2
- assertEquals(registry.getDefaultCoder(MessageA.class), ProtoCoder.of(MessageA.class));
-
- // Duration is a Protocol Buffers default type with syntax 3
- assertEquals(registry.getDefaultCoder(Duration.class), ProtoCoder.of(Duration.class));
- }
-
- @Test
- public void testAvroFallbackCoderProvider() throws Exception {
- CoderRegistry registry = getStandardRegistry();
- registry.setFallbackCoderProvider(AvroCoder.PROVIDER);
- Coder<?> avroCoder = registry.getDefaultCoder(NotSerializableClass.class);
-
- assertEquals(avroCoder, AvroCoder.of(NotSerializableClass.class));
- }
-
- @Test
- public void testRegisterInstantiatedCoder() throws Exception {
- CoderRegistry registry = new CoderRegistry();
- registry.registerCoder(MyValue.class, MyValueCoder.of());
- assertEquals(registry.getDefaultCoder(MyValue.class), MyValueCoder.of());
- }
-
- @SuppressWarnings("rawtypes") // this class exists to fail a test because of its rawtypes
- private class MyListCoder extends DeterministicStandardCoder<List> {
- @Override
- public void encode(List value, OutputStream outStream, Context context)
- throws CoderException, IOException {
- }
-
- @Override
- public List decode(InputStream inStream, Context context)
- throws CoderException, IOException {
- return Collections.emptyList();
- }
-
- @Override
- public List<Coder<?>> getCoderArguments() {
- return Collections.emptyList();
- }
- }
-
- @Test
- public void testRegisterInstantiatedCoderInvalidRawtype() throws Exception {
- thrown.expect(IllegalArgumentException.class);
- thrown.expectMessage("may not be used with unspecialized generic classes");
- CoderRegistry registry = new CoderRegistry();
- registry.registerCoder(List.class, new MyListCoder());
- }
-
- @Test
- public void testSimpleDefaultCoder() throws Exception {
- CoderRegistry registry = getStandardRegistry();
- assertEquals(StringUtf8Coder.of(), registry.getDefaultCoder(String.class));
- }
-
- @Test
- public void testSimpleUnknownDefaultCoder() throws Exception {
- CoderRegistry registry = getStandardRegistry();
- thrown.expect(CannotProvideCoderException.class);
- thrown.expectMessage(allOf(
- containsString(UnknownType.class.getCanonicalName()),
- containsString("No CoderFactory has been registered"),
- containsString("does not have a @DefaultCoder annotation"),
- containsString("does not implement Serializable")));
- registry.getDefaultCoder(UnknownType.class);
- }
-
- @Test
- public void testParameterizedDefaultListCoder() throws Exception {
- CoderRegistry registry = getStandardRegistry();
- TypeDescriptor<List<Integer>> listToken = new TypeDescriptor<List<Integer>>() {};
- assertEquals(ListCoder.of(VarIntCoder.of()),
- registry.getDefaultCoder(listToken));
-
- registry.registerCoder(MyValue.class, MyValueCoder.class);
- TypeDescriptor<KV<String, List<MyValue>>> kvToken =
- new TypeDescriptor<KV<String, List<MyValue>>>() {};
- assertEquals(KvCoder.of(StringUtf8Coder.of(),
- ListCoder.of(MyValueCoder.of())),
- registry.getDefaultCoder(kvToken));
-
- }
-
- @Test
- public void testParameterizedDefaultMapCoder() throws Exception {
- CoderRegistry registry = getStandardRegistry();
- TypeDescriptor<Map<Integer, String>> mapToken = new TypeDescriptor<Map<Integer, String>>() {};
- assertEquals(MapCoder.of(VarIntCoder.of(), StringUtf8Coder.of()),
- registry.getDefaultCoder(mapToken));
- }
-
- @Test
- public void testParameterizedDefaultNestedMapCoder() throws Exception {
- CoderRegistry registry = getStandardRegistry();
- TypeDescriptor<Map<Integer, Map<String, Double>>> mapToken =
- new TypeDescriptor<Map<Integer, Map<String, Double>>>() {};
- assertEquals(
- MapCoder.of(VarIntCoder.of(), MapCoder.of(StringUtf8Coder.of(), DoubleCoder.of())),
- registry.getDefaultCoder(mapToken));
- }
-
- @Test
- public void testParameterizedDefaultSetCoder() throws Exception {
- CoderRegistry registry = getStandardRegistry();
- TypeDescriptor<Set<Integer>> setToken = new TypeDescriptor<Set<Integer>>() {};
- assertEquals(SetCoder.of(VarIntCoder.of()), registry.getDefaultCoder(setToken));
- }
-
- @Test
- public void testParameterizedDefaultNestedSetCoder() throws Exception {
- CoderRegistry registry = getStandardRegistry();
- TypeDescriptor<Set<Set<Integer>>> setToken = new TypeDescriptor<Set<Set<Integer>>>() {};
- assertEquals(SetCoder.of(SetCoder.of(VarIntCoder.of())), registry.getDefaultCoder(setToken));
- }
-
- @Test
- public void testParameterizedDefaultCoderUnknown() throws Exception {
- CoderRegistry registry = getStandardRegistry();
- TypeDescriptor<List<UnknownType>> listUnknownToken = new TypeDescriptor<List<UnknownType>>() {};
-
- thrown.expect(CannotProvideCoderException.class);
- thrown.expectMessage(String.format(
- "Cannot provide coder for parameterized type %s: Unable to provide a default Coder for %s",
- listUnknownToken,
- UnknownType.class.getCanonicalName()));
-
- registry.getDefaultCoder(listUnknownToken);
- }
-
- @Test
- public void testTypeParameterInferenceForward() throws Exception {
- CoderRegistry registry = getStandardRegistry();
- MyGenericClass<MyValue, List<MyValue>> instance =
- new MyGenericClass<MyValue, List<MyValue>>() {};
-
- Coder<?> bazCoder = registry.getDefaultCoder(
- instance.getClass(),
- MyGenericClass.class,
- Collections.<Type, Coder<?>>singletonMap(
- TypeDescriptor.of(MyGenericClass.class).getTypeParameter("FooT"), MyValueCoder.of()),
- TypeDescriptor.of(MyGenericClass.class).getTypeParameter("BazT"));
-
- assertEquals(ListCoder.of(MyValueCoder.of()), bazCoder);
- }
-
- @Test
- public void testTypeParameterInferenceBackward() throws Exception {
- CoderRegistry registry = getStandardRegistry();
- MyGenericClass<MyValue, List<MyValue>> instance =
- new MyGenericClass<MyValue, List<MyValue>>() {};
-
- Coder<?> fooCoder = registry.getDefaultCoder(
- instance.getClass(),
- MyGenericClass.class,
- Collections.<Type, Coder<?>>singletonMap(
- TypeDescriptor.of(MyGenericClass.class).getTypeParameter("BazT"),
- ListCoder.of(MyValueCoder.of())),
- TypeDescriptor.of(MyGenericClass.class).getTypeParameter("FooT"));
-
- assertEquals(MyValueCoder.of(), fooCoder);
- }
-
- @Test
- public void testGetDefaultCoderFromIntegerValue() throws Exception {
- CoderRegistry registry = getStandardRegistry();
- Integer i = 13;
- Coder<Integer> coder = registry.getDefaultCoder(i);
- assertEquals(VarIntCoder.of(), coder);
- }
-
- @Test
- public void testGetDefaultCoderFromNullValue() throws Exception {
- CoderRegistry registry = getStandardRegistry();
- assertEquals(VoidCoder.of(), registry.getDefaultCoder((Void) null));
- }
-
- @Test
- public void testGetDefaultCoderFromKvValue() throws Exception {
- CoderRegistry registry = getStandardRegistry();
- KV<Integer, String> kv = KV.of(13, "hello");
- Coder<KV<Integer, String>> coder = registry.getDefaultCoder(kv);
- assertEquals(KvCoder.of(VarIntCoder.of(), StringUtf8Coder.of()),
- coder);
- }
-
- @Test
- public void testGetDefaultCoderFromKvNullValue() throws Exception {
- CoderRegistry registry = getStandardRegistry();
- KV<Void, Void> kv = KV.of((Void) null, (Void) null);
- assertEquals(KvCoder.of(VoidCoder.of(), VoidCoder.of()),
- registry.getDefaultCoder(kv));
- }
-
- @Test
- public void testGetDefaultCoderFromNestedKvValue() throws Exception {
- CoderRegistry registry = getStandardRegistry();
- KV<Integer, KV<Long, KV<String, String>>> kv = KV.of(13, KV.of(17L, KV.of("hello", "goodbye")));
- Coder<KV<Integer, KV<Long, KV<String, String>>>> coder = registry.getDefaultCoder(kv);
- assertEquals(
- KvCoder.of(VarIntCoder.of(),
- KvCoder.of(VarLongCoder.of(),
- KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()))),
- coder);
- }
-
- @Test
- public void testTypeCompatibility() throws Exception {
- CoderRegistry.verifyCompatible(BigEndianIntegerCoder.of(), Integer.class);
- CoderRegistry.verifyCompatible(
- ListCoder.of(BigEndianIntegerCoder.of()),
- new TypeDescriptor<List<Integer>>() {}.getType());
- }
-
- @Test
- public void testIntVersusStringIncompatibility() throws Exception {
- thrown.expect(IncompatibleCoderException.class);
- thrown.expectMessage("not assignable");
- CoderRegistry.verifyCompatible(BigEndianIntegerCoder.of(), String.class);
- }
-
- private static class TooManyComponentCoders<T> extends ListCoder<T> {
- public TooManyComponentCoders(Coder<T> actualComponentCoder) {
- super(actualComponentCoder);
- }
-
- @Override
- public List<? extends Coder<?>> getCoderArguments() {
- return ImmutableList.<Coder<?>>builder()
- .addAll(super.getCoderArguments())
- .add(BigEndianLongCoder.of())
- .build();
- }
- }
-
- @Test
- public void testTooManyCoderArguments() throws Exception {
- thrown.expect(IncompatibleCoderException.class);
- thrown.expectMessage("type parameters");
- thrown.expectMessage("less than the number of coder arguments");
- CoderRegistry.verifyCompatible(
- new TooManyComponentCoders<>(BigEndianIntegerCoder.of()), List.class);
- }
-
- @Test
- public void testComponentIncompatibility() throws Exception {
- thrown.expect(IncompatibleCoderException.class);
- thrown.expectMessage("component coder is incompatible");
- CoderRegistry.verifyCompatible(
- ListCoder.of(BigEndianIntegerCoder.of()),
- new TypeDescriptor<List<String>>() {}.getType());
- }
-
- @Test
- public void testDefaultCoderAnnotationGenericRawtype() throws Exception {
- CoderRegistry registry = new CoderRegistry();
- registry.registerStandardCoders();
- assertEquals(
- registry.getDefaultCoder(MySerializableGeneric.class),
- SerializableCoder.of(MySerializableGeneric.class));
- }
-
- @Test
- public void testDefaultCoderAnnotationGeneric() throws Exception {
- CoderRegistry registry = new CoderRegistry();
- registry.registerStandardCoders();
- assertEquals(
- registry.getDefaultCoder(new TypeDescriptor<MySerializableGeneric<String>>() {}),
- SerializableCoder.of(MySerializableGeneric.class));
- }
-
- private static class PTransformOutputingMySerializableGeneric
- extends PTransform<PCollection<String>, PCollection<KV<String, MySerializableGeneric<String>>>> {
-
- private class OutputDoFn extends DoFn<String, KV<String, MySerializableGeneric<String>>> {
- @Override
- public void processElement(ProcessContext c) { }
- }
-
- @Override
- public PCollection<KV<String, MySerializableGeneric<String>>>
- apply(PCollection<String> input) {
- return input.apply(ParDo.of(new OutputDoFn()));
- }
- }
-
- /**
- * Tests that the error message for a type variable includes a mention of where the
- * type variable was declared.
- */
- @Test
- public void testTypeVariableErrorMessage() throws Exception {
- CoderRegistry registry = new CoderRegistry();
-
- thrown.expect(CannotProvideCoderException.class);
- thrown.expectMessage(allOf(
- containsString("TestGenericT"),
- containsString("erasure"),
- containsString("com.google.cloud.dataflow.sdk.coders.CoderRegistryTest$TestGenericClass")));
- registry.getDefaultCoder(TypeDescriptor.of(
- TestGenericClass.class.getTypeParameters()[0]));
- }
-
- private static class TestGenericClass<TestGenericT> { }
-
- /**
- * In-context test that assures the functionality tested in
- * {@link #testDefaultCoderAnnotationGeneric} is invoked in the right ways.
- */
- @Test
- public void testSpecializedButIgnoredGenericInPipeline() throws Exception {
- Pipeline pipeline = TestPipeline.create();
-
- pipeline
- .apply(Create.of("hello", "goodbye"))
- .apply(new PTransformOutputingMySerializableGeneric());
-
- pipeline.run();
- }
-
- private static class GenericOutputMySerializedGeneric<T extends Serializable>
- extends PTransform<
- PCollection<String>,
- PCollection<KV<String, MySerializableGeneric<T>>>> {
-
- private class OutputDoFn extends DoFn<String, KV<String, MySerializableGeneric<T>>> {
- @Override
- public void processElement(ProcessContext c) { }
- }
-
- @Override
- public PCollection<KV<String, MySerializableGeneric<T>>>
- apply(PCollection<String> input) {
- return input.apply(ParDo.of(new OutputDoFn()));
- }
- }
-
- @Test
- public void testIgnoredGenericInPipeline() throws Exception {
- Pipeline pipeline = TestPipeline.create();
-
- pipeline
- .apply(Create.of("hello", "goodbye"))
- .apply(new GenericOutputMySerializedGeneric<String>());
-
- pipeline.run();
- }
-
- private static class MyGenericClass<FooT, BazT> { }
-
- private static class MyValue { }
-
- private static class MyValueCoder implements Coder<MyValue> {
-
- private static final MyValueCoder INSTANCE = new MyValueCoder();
-
- public static MyValueCoder of() {
- return INSTANCE;
- }
-
- @SuppressWarnings("unused")
- public static List<Object> getInstanceComponents(
- @SuppressWarnings("unused") MyValue exampleValue) {
- return Arrays.asList();
- }
-
- @Override
- public void encode(MyValue value, OutputStream outStream, Context context)
- throws CoderException, IOException {
- }
-
- @Override
- public MyValue decode(InputStream inStream, Context context)
- throws CoderException, IOException {
- return new MyValue();
- }
-
- @Override
- public List<? extends Coder<?>> getCoderArguments() {
- return null;
- }
-
- @Override
- public CloudObject asCloudObject() {
- return null;
- }
-
- @Override
- public void verifyDeterministic() { }
-
- @Override
- public boolean consistentWithEquals() {
- return true;
- }
-
- @Override
- public Object structuralValue(MyValue value) {
- return value;
- }
-
- @Override
- public boolean isRegisterByteSizeObserverCheap(MyValue value, Context context) {
- return true;
- }
-
- @Override
- public void registerByteSizeObserver(
- MyValue value, ElementByteSizeObserver observer, Context context)
- throws Exception {
- observer.update(0L);
- }
-
- @Override
- public String getEncodingId() {
- return getClass().getName();
- }
-
- @Override
- public Collection<String> getAllowedEncodings() {
- return Collections.singletonList(getEncodingId());
- }
- }
-
- private static class UnknownType { }
-
- @DefaultCoder(SerializableCoder.class)
- private static class MySerializableGeneric<T extends Serializable> implements Serializable {
- @SuppressWarnings("unused")
- private T foo;
- }
-}
[33/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DirectPipelineRunner.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DirectPipelineRunner.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DirectPipelineRunner.java
deleted file mode 100644
index 872cfef..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DirectPipelineRunner.java
+++ /dev/null
@@ -1,1156 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkState;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.Pipeline.PipelineVisitor;
-import com.google.cloud.dataflow.sdk.PipelineResult;
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.ListCoder;
-import com.google.cloud.dataflow.sdk.io.AvroIO;
-import com.google.cloud.dataflow.sdk.io.FileBasedSink;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.DirectPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions.CheckEnabled;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsValidator;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.Combine;
-import com.google.cloud.dataflow.sdk.transforms.Combine.KeyedCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.Partition;
-import com.google.cloud.dataflow.sdk.transforms.Partition.PartitionFn;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.AppliedCombineFn;
-import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
-import com.google.cloud.dataflow.sdk.util.MapAggregatorValues;
-import com.google.cloud.dataflow.sdk.util.PerKeyCombineFnRunner;
-import com.google.cloud.dataflow.sdk.util.PerKeyCombineFnRunners;
-import com.google.cloud.dataflow.sdk.util.SerializableUtils;
-import com.google.cloud.dataflow.sdk.util.TestCredential;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.util.common.Counter;
-import com.google.cloud.dataflow.sdk.util.common.CounterSet;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionList;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.PDone;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.POutput;
-import com.google.cloud.dataflow.sdk.values.PValue;
-import com.google.cloud.dataflow.sdk.values.TypedPValue;
-import com.google.common.base.Function;
-import com.google.common.collect.Lists;
-
-import org.joda.time.Instant;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-
-/**
- * Executes the operations in the pipeline directly, in this process, without
- * any optimization. Useful for small local execution and tests.
- *
- * <p>Throws an exception from {@link #run} if execution fails.
- *
- * <p><h3>Permissions</h3>
- * When reading from a Dataflow source or writing to a Dataflow sink using
- * {@code DirectPipelineRunner}, the Cloud Platform account that you configured with the
- * <a href="https://cloud.google.com/sdk/gcloud">gcloud</a> executable will need access to the
- * corresponding source/sink.
- *
- * <p>Please see <a href="https://cloud.google.com/dataflow/security-and-permissions">Google Cloud
- * Dataflow Security and Permissions</a> for more details.
- */
-@SuppressWarnings({"rawtypes", "unchecked"})
-public class DirectPipelineRunner
- extends PipelineRunner<DirectPipelineRunner.EvaluationResults> {
- private static final Logger LOG = LoggerFactory.getLogger(DirectPipelineRunner.class);
-
- /**
- * A source of random data, which can be seeded if determinism is desired.
- */
- private Random rand;
-
- /**
- * A map from PTransform class to the corresponding
- * TransformEvaluator to use to evaluate that transform.
- *
- * <p>A static map that contains system-wide defaults.
- */
- private static Map<Class, TransformEvaluator> defaultTransformEvaluators =
- new HashMap<>();
-
- /**
- * A map from PTransform class to the corresponding
- * TransformEvaluator to use to evaluate that transform.
- *
- * <p>An instance map that contains bindings for this DirectPipelineRunner.
- * Bindings in this map override those in the default map.
- */
- private Map<Class, TransformEvaluator> localTransformEvaluators =
- new HashMap<>();
-
- /**
- * Records that instances of the specified PTransform class
- * should be evaluated by default by the corresponding
- * TransformEvaluator.
- */
- public static <TransformT extends PTransform<?, ?>>
- void registerDefaultTransformEvaluator(
- Class<TransformT> transformClass,
- TransformEvaluator<? super TransformT> transformEvaluator) {
- if (defaultTransformEvaluators.put(transformClass, transformEvaluator)
- != null) {
- throw new IllegalArgumentException(
- "defining multiple evaluators for " + transformClass);
- }
- }
-
- /**
- * Records that instances of the specified PTransform class
- * should be evaluated by the corresponding TransformEvaluator.
- * Overrides any bindings specified by
- * {@link #registerDefaultTransformEvaluator}.
- */
- public <TransformT extends PTransform<?, ?>>
- void registerTransformEvaluator(
- Class<TransformT> transformClass,
- TransformEvaluator<TransformT> transformEvaluator) {
- if (localTransformEvaluators.put(transformClass, transformEvaluator)
- != null) {
- throw new IllegalArgumentException(
- "defining multiple evaluators for " + transformClass);
- }
- }
-
- /**
- * Returns the TransformEvaluator to use for instances of the
- * specified PTransform class, or null if none registered.
- */
- public <TransformT extends PTransform<?, ?>>
- TransformEvaluator<TransformT> getTransformEvaluator(Class<TransformT> transformClass) {
- TransformEvaluator<TransformT> transformEvaluator =
- localTransformEvaluators.get(transformClass);
- if (transformEvaluator == null) {
- transformEvaluator = defaultTransformEvaluators.get(transformClass);
- }
- return transformEvaluator;
- }
-
- /**
- * Constructs a DirectPipelineRunner from the given options.
- */
- public static DirectPipelineRunner fromOptions(PipelineOptions options) {
- DirectPipelineOptions directOptions =
- PipelineOptionsValidator.validate(DirectPipelineOptions.class, options);
- LOG.debug("Creating DirectPipelineRunner");
- return new DirectPipelineRunner(directOptions);
- }
-
- /**
- * Constructs a runner with default properties for testing.
- *
- * @return The newly created runner.
- */
- public static DirectPipelineRunner createForTest() {
- DirectPipelineOptions options = PipelineOptionsFactory.as(DirectPipelineOptions.class);
- options.setStableUniqueNames(CheckEnabled.ERROR);
- options.setGcpCredential(new TestCredential());
- return new DirectPipelineRunner(options);
- }
-
- /**
- * Enable runtime testing to verify that all functions and {@link Coder}
- * instances can be serialized.
- *
- * <p>Enabled by default.
- *
- * <p>This method modifies the {@code DirectPipelineRunner} instance and
- * returns itself.
- */
- public DirectPipelineRunner withSerializabilityTesting(boolean enable) {
- this.testSerializability = enable;
- return this;
- }
-
- /**
- * Enable runtime testing to verify that all values can be encoded.
- *
- * <p>Enabled by default.
- *
- * <p>This method modifies the {@code DirectPipelineRunner} instance and
- * returns itself.
- */
- public DirectPipelineRunner withEncodabilityTesting(boolean enable) {
- this.testEncodability = enable;
- return this;
- }
-
- /**
- * Enable runtime testing to verify that functions do not depend on order
- * of the elements.
- *
- * <p>This is accomplished by randomizing the order of elements.
- *
- * <p>Enabled by default.
- *
- * <p>This method modifies the {@code DirectPipelineRunner} instance and
- * returns itself.
- */
- public DirectPipelineRunner withUnorderednessTesting(boolean enable) {
- this.testUnorderedness = enable;
- return this;
- }
-
- @Override
- public <OutputT extends POutput, InputT extends PInput> OutputT apply(
- PTransform<InputT, OutputT> transform, InputT input) {
- if (transform instanceof Combine.GroupedValues) {
- return (OutputT) applyTestCombine((Combine.GroupedValues) transform, (PCollection) input);
- } else if (transform instanceof TextIO.Write.Bound) {
- return (OutputT) applyTextIOWrite((TextIO.Write.Bound) transform, (PCollection<?>) input);
- } else if (transform instanceof AvroIO.Write.Bound) {
- return (OutputT) applyAvroIOWrite((AvroIO.Write.Bound) transform, (PCollection<?>) input);
- } else {
- return super.apply(transform, input);
- }
- }
-
- private <K, InputT, AccumT, OutputT> PCollection<KV<K, OutputT>> applyTestCombine(
- Combine.GroupedValues<K, InputT, OutputT> transform,
- PCollection<KV<K, Iterable<InputT>>> input) {
-
- PCollection<KV<K, OutputT>> output = input
- .apply(ParDo.of(TestCombineDoFn.create(transform, input, testSerializability, rand))
- .withSideInputs(transform.getSideInputs()));
-
- try {
- output.setCoder(transform.getDefaultOutputCoder(input));
- } catch (CannotProvideCoderException exc) {
- // let coder inference occur later, if it can
- }
- return output;
- }
-
- private static class ElementProcessingOrderPartitionFn<T> implements PartitionFn<T> {
- private int elementNumber;
- @Override
- public int partitionFor(T elem, int numPartitions) {
- return elementNumber++ % numPartitions;
- }
- }
-
- /**
- * Applies TextIO.Write honoring user requested sharding controls (i.e. withNumShards)
- * by applying a partition function based upon the number of shards the user requested.
- */
- private static class DirectTextIOWrite<T> extends PTransform<PCollection<T>, PDone> {
- private final TextIO.Write.Bound<T> transform;
-
- private DirectTextIOWrite(TextIO.Write.Bound<T> transform) {
- this.transform = transform;
- }
-
- @Override
- public PDone apply(PCollection<T> input) {
- checkState(transform.getNumShards() > 1,
- "DirectTextIOWrite is expected to only be used when sharding controls are required.");
-
- // Evenly distribute all the elements across the partitions.
- PCollectionList<T> partitionedElements =
- input.apply(Partition.of(transform.getNumShards(),
- new ElementProcessingOrderPartitionFn<T>()));
-
- // For each input PCollection partition, create a write transform that represents
- // one of the specific shards.
- for (int i = 0; i < transform.getNumShards(); ++i) {
- /*
- * This logic mirrors the file naming strategy within
- * {@link FileBasedSink#generateDestinationFilenames()}
- */
- String outputFilename = IOChannelUtils.constructName(
- transform.getFilenamePrefix(),
- transform.getShardNameTemplate(),
- getFileExtension(transform.getFilenameSuffix()),
- i,
- transform.getNumShards());
-
- String transformName = String.format("%s(Shard:%s)", transform.getName(), i);
- partitionedElements.get(i).apply(transformName,
- transform.withNumShards(1).withShardNameTemplate("").withSuffix("").to(outputFilename));
- }
- return PDone.in(input.getPipeline());
- }
- }
-
- /**
- * Returns the file extension to be used. If the user did not request a file
- * extension then this method returns the empty string. Otherwise this method
- * adds a {@code "."} to the beginning of the users extension if one is not present.
- *
- * <p>This is copied from {@link FileBasedSink} to not expose it.
- */
- private static String getFileExtension(String usersExtension) {
- if (usersExtension == null || usersExtension.isEmpty()) {
- return "";
- }
- if (usersExtension.startsWith(".")) {
- return usersExtension;
- }
- return "." + usersExtension;
- }
-
- /**
- * Apply the override for TextIO.Write.Bound if the user requested sharding controls
- * greater than one.
- */
- private <T> PDone applyTextIOWrite(TextIO.Write.Bound<T> transform, PCollection<T> input) {
- if (transform.getNumShards() <= 1) {
- // By default, the DirectPipelineRunner outputs to only 1 shard. Since the user never
- // requested sharding controls greater than 1, we default to outputting to 1 file.
- return super.apply(transform.withNumShards(1), input);
- }
- return input.apply(new DirectTextIOWrite<>(transform));
- }
-
- /**
- * Applies AvroIO.Write honoring user requested sharding controls (i.e. withNumShards)
- * by applying a partition function based upon the number of shards the user requested.
- */
- private static class DirectAvroIOWrite<T> extends PTransform<PCollection<T>, PDone> {
- private final AvroIO.Write.Bound<T> transform;
-
- private DirectAvroIOWrite(AvroIO.Write.Bound<T> transform) {
- this.transform = transform;
- }
-
- @Override
- public PDone apply(PCollection<T> input) {
- checkState(transform.getNumShards() > 1,
- "DirectAvroIOWrite is expected to only be used when sharding controls are required.");
-
- // Evenly distribute all the elements across the partitions.
- PCollectionList<T> partitionedElements =
- input.apply(Partition.of(transform.getNumShards(),
- new ElementProcessingOrderPartitionFn<T>()));
-
- // For each input PCollection partition, create a write transform that represents
- // one of the specific shards.
- for (int i = 0; i < transform.getNumShards(); ++i) {
- /*
- * This logic mirrors the file naming strategy within
- * {@link FileBasedSink#generateDestinationFilenames()}
- */
- String outputFilename = IOChannelUtils.constructName(
- transform.getFilenamePrefix(),
- transform.getShardNameTemplate(),
- getFileExtension(transform.getFilenameSuffix()),
- i,
- transform.getNumShards());
-
- String transformName = String.format("%s(Shard:%s)", transform.getName(), i);
- partitionedElements.get(i).apply(transformName,
- transform.withNumShards(1).withShardNameTemplate("").withSuffix("").to(outputFilename));
- }
- return PDone.in(input.getPipeline());
- }
- }
-
- /**
- * Apply the override for AvroIO.Write.Bound if the user requested sharding controls
- * greater than one.
- */
- private <T> PDone applyAvroIOWrite(AvroIO.Write.Bound<T> transform, PCollection<T> input) {
- if (transform.getNumShards() <= 1) {
- // By default, the DirectPipelineRunner outputs to only 1 shard. Since the user never
- // requested sharding controls greater than 1, we default to outputting to 1 file.
- return super.apply(transform.withNumShards(1), input);
- }
- return input.apply(new DirectAvroIOWrite<>(transform));
- }
-
- /**
- * The implementation may split the {@link KeyedCombineFn} into ADD, MERGE and EXTRACT phases (
- * see {@code com.google.cloud.dataflow.sdk.runners.worker.CombineValuesFn}). In order to emulate
- * this for the {@link DirectPipelineRunner} and provide an experience closer to the service, go
- * through heavy serializability checks for the equivalent of the results of the ADD phase, but
- * after the {@link com.google.cloud.dataflow.sdk.transforms.GroupByKey} shuffle, and the MERGE
- * phase. Doing these checks ensure that not only is the accumulator coder serializable, but
- * the accumulator coder can actually serialize the data in question.
- */
- public static class TestCombineDoFn<K, InputT, AccumT, OutputT>
- extends DoFn<KV<K, Iterable<InputT>>, KV<K, OutputT>> {
- private final PerKeyCombineFnRunner<? super K, ? super InputT, AccumT, OutputT> fnRunner;
- private final Coder<AccumT> accumCoder;
- private final boolean testSerializability;
- private final Random rand;
-
- public static <K, InputT, AccumT, OutputT> TestCombineDoFn<K, InputT, AccumT, OutputT> create(
- Combine.GroupedValues<K, InputT, OutputT> transform,
- PCollection<KV<K, Iterable<InputT>>> input,
- boolean testSerializability,
- Random rand) {
-
- AppliedCombineFn<? super K, ? super InputT, ?, OutputT> fn = transform.getAppliedFn(
- input.getPipeline().getCoderRegistry(), input.getCoder(), input.getWindowingStrategy());
-
- return new TestCombineDoFn(
- PerKeyCombineFnRunners.create(fn.getFn()),
- fn.getAccumulatorCoder(),
- testSerializability,
- rand);
- }
-
- public TestCombineDoFn(
- PerKeyCombineFnRunner<? super K, ? super InputT, AccumT, OutputT> fnRunner,
- Coder<AccumT> accumCoder,
- boolean testSerializability,
- Random rand) {
- this.fnRunner = fnRunner;
- this.accumCoder = accumCoder;
- this.testSerializability = testSerializability;
- this.rand = rand;
-
- // Check that this does not crash, specifically to catch anonymous CustomCoder subclasses.
- this.accumCoder.getEncodingId();
- }
-
- @Override
- public void processElement(ProcessContext c) throws Exception {
- K key = c.element().getKey();
- Iterable<InputT> values = c.element().getValue();
- List<AccumT> groupedPostShuffle =
- ensureSerializableByCoder(ListCoder.of(accumCoder),
- addInputsRandomly(fnRunner, key, values, rand, c),
- "After addInputs of KeyedCombineFn " + fnRunner.fn().toString());
- AccumT merged =
- ensureSerializableByCoder(accumCoder,
- fnRunner.mergeAccumulators(key, groupedPostShuffle, c),
- "After mergeAccumulators of KeyedCombineFn " + fnRunner.fn().toString());
- // Note: The serializability of KV<K, OutputT> is ensured by the
- // runner itself, since it's a transform output.
- c.output(KV.of(key, fnRunner.extractOutput(key, merged, c)));
- }
-
- /**
- * Create a random list of accumulators from the given list of values.
- *
- * <p>Visible for testing purposes only.
- */
- public static <K, AccumT, InputT> List<AccumT> addInputsRandomly(
- PerKeyCombineFnRunner<? super K, ? super InputT, AccumT, ?> fnRunner,
- K key,
- Iterable<InputT> values,
- Random random,
- DoFn<?, ?>.ProcessContext c) {
- List<AccumT> out = new ArrayList<AccumT>();
- int i = 0;
- AccumT accumulator = fnRunner.createAccumulator(key, c);
- boolean hasInput = false;
-
- for (InputT value : values) {
- accumulator = fnRunner.addInput(key, accumulator, value, c);
- hasInput = true;
-
- // For each index i, flip a 1/2^i weighted coin for whether to
- // create a new accumulator after index i is added, i.e. [0]
- // is guaranteed, [1] is an even 1/2, [2] is 1/4, etc. The
- // goal is to partition the inputs into accumulators, and make
- // the accumulators potentially lumpy. Also compact about half
- // of the accumulators.
- if (i == 0 || random.nextInt(1 << Math.min(i, 30)) == 0) {
- if (i % 2 == 0) {
- accumulator = fnRunner.compact(key, accumulator, c);
- }
- out.add(accumulator);
- accumulator = fnRunner.createAccumulator(key, c);
- hasInput = false;
- }
- i++;
- }
- if (hasInput) {
- out.add(accumulator);
- }
-
- Collections.shuffle(out, random);
- return out;
- }
-
- public <T> T ensureSerializableByCoder(
- Coder<T> coder, T value, String errorContext) {
- if (testSerializability) {
- return SerializableUtils.ensureSerializableByCoder(
- coder, value, errorContext);
- }
- return value;
- }
- }
-
- @Override
- public EvaluationResults run(Pipeline pipeline) {
- LOG.info("Executing pipeline using the DirectPipelineRunner.");
-
- Evaluator evaluator = new Evaluator(rand);
- evaluator.run(pipeline);
-
- // Log all counter values for debugging purposes.
- for (Counter counter : evaluator.getCounters()) {
- LOG.info("Final aggregator value: {}", counter);
- }
-
- LOG.info("Pipeline execution complete.");
-
- return evaluator;
- }
-
- /**
- * An evaluator of a PTransform.
- */
- public interface TransformEvaluator<TransformT extends PTransform> {
- public void evaluate(TransformT transform,
- EvaluationContext context);
- }
-
- /**
- * The interface provided to registered callbacks for interacting
- * with the {@code DirectPipelineRunner}, including reading and writing the
- * values of {@link PCollection}s and {@link PCollectionView}s.
- */
- public interface EvaluationResults extends PipelineResult {
- /**
- * Retrieves the value of the given PCollection.
- * Throws an exception if the PCollection's value hasn't already been set.
- */
- <T> List<T> getPCollection(PCollection<T> pc);
-
- /**
- * Retrieves the windowed value of the given PCollection.
- * Throws an exception if the PCollection's value hasn't already been set.
- */
- <T> List<WindowedValue<T>> getPCollectionWindowedValues(PCollection<T> pc);
-
- /**
- * Retrieves the values of each PCollection in the given
- * PCollectionList. Throws an exception if the PCollectionList's
- * value hasn't already been set.
- */
- <T> List<List<T>> getPCollectionList(PCollectionList<T> pcs);
-
- /**
- * Retrieves the values indicated by the given {@link PCollectionView}.
- * Note that within the {@link com.google.cloud.dataflow.sdk.transforms.DoFn.Context}
- * implementation a {@link PCollectionView} should convert from this representation to a
- * suitable side input value.
- */
- <T, WindowedT> Iterable<WindowedValue<?>> getPCollectionView(PCollectionView<T> view);
- }
-
- /**
- * An immutable (value, timestamp) pair, along with other metadata necessary
- * for the implementation of {@code DirectPipelineRunner}.
- */
- public static class ValueWithMetadata<V> {
- /**
- * Returns a new {@code ValueWithMetadata} with the {@code WindowedValue}.
- * Key is null.
- */
- public static <V> ValueWithMetadata<V> of(WindowedValue<V> windowedValue) {
- return new ValueWithMetadata<>(windowedValue, null);
- }
-
- /**
- * Returns a new {@code ValueWithMetadata} with the implicit key associated
- * with this value set. The key is the last key grouped by in the chain of
- * productions that produced this element.
- * These keys are used internally by {@link DirectPipelineRunner} for keeping
- * persisted state separate across keys.
- */
- public ValueWithMetadata<V> withKey(Object key) {
- return new ValueWithMetadata<>(windowedValue, key);
- }
-
- /**
- * Returns a new {@code ValueWithMetadata} that is a copy of this one, but with
- * a different value.
- */
- public <T> ValueWithMetadata<T> withValue(T value) {
- return new ValueWithMetadata(windowedValue.withValue(value), getKey());
- }
-
- /**
- * Returns the {@code WindowedValue} associated with this element.
- */
- public WindowedValue<V> getWindowedValue() {
- return windowedValue;
- }
-
- /**
- * Returns the value associated with this element.
- *
- * @see #withValue
- */
- public V getValue() {
- return windowedValue.getValue();
- }
-
- /**
- * Returns the timestamp associated with this element.
- */
- public Instant getTimestamp() {
- return windowedValue.getTimestamp();
- }
-
- /**
- * Returns the collection of windows this element has been placed into. May
- * be null if the {@code PCollection} this element is in has not yet been
- * windowed.
- *
- * @see #getWindows()
- */
- public Collection<? extends BoundedWindow> getWindows() {
- return windowedValue.getWindows();
- }
-
-
- /**
- * Returns the key associated with this element. May be null if the
- * {@code PCollection} this element is in is not keyed.
- *
- * @see #withKey
- */
- public Object getKey() {
- return key;
- }
-
- ////////////////////////////////////////////////////////////////////////////
-
- private final Object key;
- private final WindowedValue<V> windowedValue;
-
- private ValueWithMetadata(WindowedValue<V> windowedValue,
- Object key) {
- this.windowedValue = windowedValue;
- this.key = key;
- }
- }
-
- /**
- * The interface provided to registered callbacks for interacting
- * with the {@code DirectPipelineRunner}, including reading and writing the
- * values of {@link PCollection}s and {@link PCollectionView}s.
- */
- public interface EvaluationContext extends EvaluationResults {
- /**
- * Returns the configured pipeline options.
- */
- DirectPipelineOptions getPipelineOptions();
-
- /**
- * Returns the input of the currently being processed transform.
- */
- <InputT extends PInput> InputT getInput(PTransform<InputT, ?> transform);
-
- /**
- * Returns the output of the currently being processed transform.
- */
- <OutputT extends POutput> OutputT getOutput(PTransform<?, OutputT> transform);
-
- /**
- * Sets the value of the given PCollection, where each element also has a timestamp
- * and collection of windows.
- * Throws an exception if the PCollection's value has already been set.
- */
- <T> void setPCollectionValuesWithMetadata(
- PCollection<T> pc, List<ValueWithMetadata<T>> elements);
-
- /**
- * Sets the value of the given PCollection, where each element also has a timestamp
- * and collection of windows.
- * Throws an exception if the PCollection's value has already been set.
- */
- <T> void setPCollectionWindowedValue(PCollection<T> pc, List<WindowedValue<T>> elements);
-
- /**
- * Shorthand for setting the value of a PCollection where the elements do not have
- * timestamps or windows.
- * Throws an exception if the PCollection's value has already been set.
- */
- <T> void setPCollection(PCollection<T> pc, List<T> elements);
-
- /**
- * Retrieves the value of the given PCollection, along with element metadata
- * such as timestamps and windows.
- * Throws an exception if the PCollection's value hasn't already been set.
- */
- <T> List<ValueWithMetadata<T>> getPCollectionValuesWithMetadata(PCollection<T> pc);
-
- /**
- * Sets the value associated with the given {@link PCollectionView}.
- * Throws an exception if the {@link PCollectionView}'s value has already been set.
- */
- <ElemT, T, WindowedT> void setPCollectionView(
- PCollectionView<T> pc,
- Iterable<WindowedValue<ElemT>> value);
-
- /**
- * Ensures that the element is encodable and decodable using the
- * TypePValue's coder, by encoding it and decoding it, and
- * returning the result.
- */
- <T> T ensureElementEncodable(TypedPValue<T> pvalue, T element);
-
- /**
- * If the evaluation context is testing unorderedness,
- * randomly permutes the order of the elements, in a
- * copy if !inPlaceAllowed, and returns the permuted list,
- * otherwise returns the argument unchanged.
- */
- <T> List<T> randomizeIfUnordered(List<T> elements,
- boolean inPlaceAllowed);
-
- /**
- * If the evaluation context is testing serializability, ensures
- * that the argument function is serializable and deserializable
- * by encoding it and then decoding it, and returning the result.
- * Otherwise returns the argument unchanged.
- */
- <FunctionT extends Serializable> FunctionT ensureSerializable(FunctionT fn);
-
- /**
- * If the evaluation context is testing serializability, ensures
- * that the argument Coder is serializable and deserializable
- * by encoding it and then decoding it, and returning the result.
- * Otherwise returns the argument unchanged.
- */
- <T> Coder<T> ensureCoderSerializable(Coder<T> coder);
-
- /**
- * If the evaluation context is testing serializability, ensures
- * that the given data is serializable and deserializable with the
- * given Coder by encoding it and then decoding it, and returning
- * the result. Otherwise returns the argument unchanged.
- *
- * <p>Error context is prefixed to any thrown exceptions.
- */
- <T> T ensureSerializableByCoder(Coder<T> coder,
- T data, String errorContext);
-
- /**
- * Returns a mutator, which can be used to add additional counters to
- * this EvaluationContext.
- */
- CounterSet.AddCounterMutator getAddCounterMutator();
-
- /**
- * Gets the step name for this transform.
- */
- public String getStepName(PTransform<?, ?> transform);
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- class Evaluator implements PipelineVisitor, EvaluationContext {
- /**
- * A map from PTransform to the step name of that transform. This is the internal name for the
- * transform (e.g. "s2").
- */
- private final Map<PTransform<?, ?>, String> stepNames = new HashMap<>();
- private final Map<PValue, Object> store = new HashMap<>();
- private final CounterSet counters = new CounterSet();
- private AppliedPTransform<?, ?, ?> currentTransform;
-
- private Map<Aggregator<?, ?>, Collection<PTransform<?, ?>>> aggregatorSteps = null;
-
- /**
- * A map from PTransform to the full name of that transform. This is the user name of the
- * transform (e.g. "RemoveDuplicates/Combine/GroupByKey").
- */
- private final Map<PTransform<?, ?>, String> fullNames = new HashMap<>();
-
- private Random rand;
-
- public Evaluator() {
- this(new Random());
- }
-
- public Evaluator(Random rand) {
- this.rand = rand;
- }
-
- public void run(Pipeline pipeline) {
- pipeline.traverseTopologically(this);
- aggregatorSteps = new AggregatorPipelineExtractor(pipeline).getAggregatorSteps();
- }
-
- @Override
- public DirectPipelineOptions getPipelineOptions() {
- return options;
- }
-
- @Override
- public <InputT extends PInput> InputT getInput(PTransform<InputT, ?> transform) {
- checkArgument(currentTransform != null && currentTransform.getTransform() == transform,
- "can only be called with current transform");
- return (InputT) currentTransform.getInput();
- }
-
- @Override
- public <OutputT extends POutput> OutputT getOutput(PTransform<?, OutputT> transform) {
- checkArgument(currentTransform != null && currentTransform.getTransform() == transform,
- "can only be called with current transform");
- return (OutputT) currentTransform.getOutput();
- }
-
- @Override
- public void enterCompositeTransform(TransformTreeNode node) {
- }
-
- @Override
- public void leaveCompositeTransform(TransformTreeNode node) {
- }
-
- @Override
- public void visitTransform(TransformTreeNode node) {
- PTransform<?, ?> transform = node.getTransform();
- fullNames.put(transform, node.getFullName());
- TransformEvaluator evaluator =
- getTransformEvaluator(transform.getClass());
- if (evaluator == null) {
- throw new IllegalStateException(
- "no evaluator registered for " + transform);
- }
- LOG.debug("Evaluating {}", transform);
- currentTransform = AppliedPTransform.of(
- node.getFullName(), node.getInput(), node.getOutput(), (PTransform) transform);
- evaluator.evaluate(transform, this);
- currentTransform = null;
- }
-
- @Override
- public void visitValue(PValue value, TransformTreeNode producer) {
- LOG.debug("Checking evaluation of {}", value);
- if (value.getProducingTransformInternal() == null) {
- throw new RuntimeException(
- "internal error: expecting a PValue " +
- "to have a producingTransform");
- }
- if (!producer.isCompositeNode()) {
- // Verify that primitive transform outputs are already computed.
- getPValue(value);
- }
- }
-
- /**
- * Sets the value of the given PValue.
- * Throws an exception if the PValue's value has already been set.
- */
- void setPValue(PValue pvalue, Object contents) {
- if (store.containsKey(pvalue)) {
- throw new IllegalStateException(
- "internal error: setting the value of " + pvalue +
- " more than once");
- }
- store.put(pvalue, contents);
- }
-
- /**
- * Retrieves the value of the given PValue.
- * Throws an exception if the PValue's value hasn't already been set.
- */
- Object getPValue(PValue pvalue) {
- if (!store.containsKey(pvalue)) {
- throw new IllegalStateException(
- "internal error: getting the value of " + pvalue +
- " before it has been computed");
- }
- return store.get(pvalue);
- }
-
- /**
- * Convert a list of T to a list of {@code ValueWithMetadata<T>}, with a timestamp of 0
- * and null windows.
- */
- <T> List<ValueWithMetadata<T>> toValueWithMetadata(List<T> values) {
- List<ValueWithMetadata<T>> result = new ArrayList<>(values.size());
- for (T value : values) {
- result.add(ValueWithMetadata.of(WindowedValue.valueInGlobalWindow(value)));
- }
- return result;
- }
-
- /**
- * Convert a list of {@code WindowedValue<T>} to a list of {@code ValueWithMetadata<T>}.
- */
- <T> List<ValueWithMetadata<T>> toValueWithMetadataFromWindowedValue(
- List<WindowedValue<T>> values) {
- List<ValueWithMetadata<T>> result = new ArrayList<>(values.size());
- for (WindowedValue<T> value : values) {
- result.add(ValueWithMetadata.of(value));
- }
- return result;
- }
-
- @Override
- public <T> void setPCollection(PCollection<T> pc, List<T> elements) {
- setPCollectionValuesWithMetadata(pc, toValueWithMetadata(elements));
- }
-
- @Override
- public <T> void setPCollectionWindowedValue(
- PCollection<T> pc, List<WindowedValue<T>> elements) {
- setPCollectionValuesWithMetadata(pc, toValueWithMetadataFromWindowedValue(elements));
- }
-
- @Override
- public <T> void setPCollectionValuesWithMetadata(
- PCollection<T> pc, List<ValueWithMetadata<T>> elements) {
- LOG.debug("Setting {} = {}", pc, elements);
- ensurePCollectionEncodable(pc, elements);
- setPValue(pc, elements);
- }
-
- @Override
- public <ElemT, T, WindowedT> void setPCollectionView(
- PCollectionView<T> view,
- Iterable<WindowedValue<ElemT>> value) {
- LOG.debug("Setting {} = {}", view, value);
- setPValue(view, value);
- }
-
- /**
- * Retrieves the value of the given {@link PCollection}.
- * Throws an exception if the {@link PCollection}'s value hasn't already been set.
- */
- @Override
- public <T> List<T> getPCollection(PCollection<T> pc) {
- List<T> result = new ArrayList<>();
- for (ValueWithMetadata<T> elem : getPCollectionValuesWithMetadata(pc)) {
- result.add(elem.getValue());
- }
- return result;
- }
-
- @Override
- public <T> List<WindowedValue<T>> getPCollectionWindowedValues(PCollection<T> pc) {
- return Lists.transform(
- getPCollectionValuesWithMetadata(pc),
- new Function<ValueWithMetadata<T>, WindowedValue<T>>() {
- @Override
- public WindowedValue<T> apply(ValueWithMetadata<T> input) {
- return input.getWindowedValue();
- }});
- }
-
- @Override
- public <T> List<ValueWithMetadata<T>> getPCollectionValuesWithMetadata(PCollection<T> pc) {
- List<ValueWithMetadata<T>> elements = (List<ValueWithMetadata<T>>) getPValue(pc);
- elements = randomizeIfUnordered(elements, false /* not inPlaceAllowed */);
- LOG.debug("Getting {} = {}", pc, elements);
- return elements;
- }
-
- @Override
- public <T> List<List<T>> getPCollectionList(PCollectionList<T> pcs) {
- List<List<T>> elementsList = new ArrayList<>();
- for (PCollection<T> pc : pcs.getAll()) {
- elementsList.add(getPCollection(pc));
- }
- return elementsList;
- }
-
- /**
- * Retrieves the value indicated by the given {@link PCollectionView}.
- * Note that within the {@link DoFnContext} a {@link PCollectionView}
- * converts from this representation to a suitable side input value.
- */
- @Override
- public <T, WindowedT> Iterable<WindowedValue<?>> getPCollectionView(PCollectionView<T> view) {
- Iterable<WindowedValue<?>> value = (Iterable<WindowedValue<?>>) getPValue(view);
- LOG.debug("Getting {} = {}", view, value);
- return value;
- }
-
- /**
- * If {@code testEncodability}, ensures that the {@link PCollection}'s coder and elements are
- * encodable and decodable by encoding them and decoding them, and returning the result.
- * Otherwise returns the argument elements.
- */
- <T> List<ValueWithMetadata<T>> ensurePCollectionEncodable(
- PCollection<T> pc, List<ValueWithMetadata<T>> elements) {
- ensureCoderSerializable(pc.getCoder());
- if (!testEncodability) {
- return elements;
- }
- List<ValueWithMetadata<T>> elementsCopy = new ArrayList<>(elements.size());
- for (ValueWithMetadata<T> element : elements) {
- elementsCopy.add(
- element.withValue(ensureElementEncodable(pc, element.getValue())));
- }
- return elementsCopy;
- }
-
- @Override
- public <T> T ensureElementEncodable(TypedPValue<T> pvalue, T element) {
- return ensureSerializableByCoder(
- pvalue.getCoder(), element, "Within " + pvalue.toString());
- }
-
- @Override
- public <T> List<T> randomizeIfUnordered(List<T> elements,
- boolean inPlaceAllowed) {
- if (!testUnorderedness) {
- return elements;
- }
- List<T> elementsCopy = new ArrayList<>(elements);
- Collections.shuffle(elementsCopy, rand);
- return elementsCopy;
- }
-
- @Override
- public <FunctionT extends Serializable> FunctionT ensureSerializable(FunctionT fn) {
- if (!testSerializability) {
- return fn;
- }
- return SerializableUtils.ensureSerializable(fn);
- }
-
- @Override
- public <T> Coder<T> ensureCoderSerializable(Coder<T> coder) {
- if (testSerializability) {
- SerializableUtils.ensureSerializable(coder);
- }
- return coder;
- }
-
- @Override
- public <T> T ensureSerializableByCoder(
- Coder<T> coder, T value, String errorContext) {
- if (testSerializability) {
- return SerializableUtils.ensureSerializableByCoder(
- coder, value, errorContext);
- }
- return value;
- }
-
- @Override
- public CounterSet.AddCounterMutator getAddCounterMutator() {
- return counters.getAddCounterMutator();
- }
-
- @Override
- public String getStepName(PTransform<?, ?> transform) {
- String stepName = stepNames.get(transform);
- if (stepName == null) {
- stepName = "s" + (stepNames.size() + 1);
- stepNames.put(transform, stepName);
- }
- return stepName;
- }
-
- /**
- * Returns the CounterSet generated during evaluation, which includes
- * user-defined Aggregators and may include system-defined counters.
- */
- public CounterSet getCounters() {
- return counters;
- }
-
- /**
- * Returns JobState.DONE in all situations. The Evaluator is not returned
- * until the pipeline has been traversed, so it will either be returned
- * after a successful run or the run call will terminate abnormally.
- */
- @Override
- public State getState() {
- return State.DONE;
- }
-
- @Override
- public <T> AggregatorValues<T> getAggregatorValues(Aggregator<?, T> aggregator) {
- Map<String, T> stepValues = new HashMap<>();
- for (PTransform<?, ?> step : aggregatorSteps.get(aggregator)) {
- String stepName = String.format("user-%s-%s", stepNames.get(step), aggregator.getName());
- String fullName = fullNames.get(step);
- Counter<?> counter = counters.getExistingCounter(stepName);
- if (counter == null) {
- throw new IllegalArgumentException(
- "Aggregator " + aggregator + " is not used in this pipeline");
- }
- stepValues.put(fullName, (T) counter.getAggregate());
- }
- return new MapAggregatorValues<>(stepValues);
- }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private final DirectPipelineOptions options;
- private boolean testSerializability;
- private boolean testEncodability;
- private boolean testUnorderedness;
-
- /** Returns a new DirectPipelineRunner. */
- private DirectPipelineRunner(DirectPipelineOptions options) {
- this.options = options;
- // (Re-)register standard IO factories. Clobbers any prior credentials.
- IOChannelUtils.registerStandardIOFactories(options);
- long randomSeed;
- if (options.getDirectPipelineRunnerRandomSeed() != null) {
- randomSeed = options.getDirectPipelineRunnerRandomSeed();
- } else {
- randomSeed = new Random().nextLong();
- }
-
- LOG.debug("DirectPipelineRunner using random seed {}.", randomSeed);
- rand = new Random(randomSeed);
-
- testSerializability = options.isTestSerializability();
- testEncodability = options.isTestEncodability();
- testUnorderedness = options.isTestUnorderedness();
- }
-
- /**
- * Get the options used in this {@link Pipeline}.
- */
- public DirectPipelineOptions getPipelineOptions() {
- return options;
- }
-
- @Override
- public String toString() {
- return "DirectPipelineRunner#" + hashCode();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/PipelineRunner.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/PipelineRunner.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/PipelineRunner.java
deleted file mode 100644
index 26d8e1e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/PipelineRunner.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.PipelineResult;
-import com.google.cloud.dataflow.sdk.options.GcsOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsValidator;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
-import com.google.cloud.dataflow.sdk.util.InstanceBuilder;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.POutput;
-import com.google.common.base.Preconditions;
-
-/**
- * A {@link PipelineRunner} can execute, translate, or otherwise process a
- * {@link Pipeline}.
- *
- * @param <ResultT> the type of the result of {@link #run}.
- */
-public abstract class PipelineRunner<ResultT extends PipelineResult> {
-
- /**
- * Constructs a runner from the provided options.
- *
- * @return The newly created runner.
- */
- public static PipelineRunner<? extends PipelineResult> fromOptions(PipelineOptions options) {
- GcsOptions gcsOptions = PipelineOptionsValidator.validate(GcsOptions.class, options);
- Preconditions.checkNotNull(options);
-
- // (Re-)register standard IO factories. Clobbers any prior credentials.
- IOChannelUtils.registerStandardIOFactories(gcsOptions);
-
- @SuppressWarnings("unchecked")
- PipelineRunner<? extends PipelineResult> result =
- InstanceBuilder.ofType(PipelineRunner.class)
- .fromClass(options.getRunner())
- .fromFactoryMethod("fromOptions")
- .withArg(PipelineOptions.class, options)
- .build();
- return result;
- }
-
- /**
- * Processes the given Pipeline, returning the results.
- */
- public abstract ResultT run(Pipeline pipeline);
-
- /**
- * Applies a transform to the given input, returning the output.
- *
- * <p>The default implementation calls PTransform.apply(input), but can be overridden
- * to customize behavior for a particular runner.
- */
- public <OutputT extends POutput, InputT extends PInput> OutputT apply(
- PTransform<InputT, OutputT> transform, InputT input) {
- return transform.apply(input);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/PipelineRunnerRegistrar.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/PipelineRunnerRegistrar.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/PipelineRunnerRegistrar.java
deleted file mode 100644
index 1ca3346..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/PipelineRunnerRegistrar.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import com.google.auto.service.AutoService;
-import java.util.ServiceLoader;
-
-/**
- * {@link PipelineRunner} creators have the ability to automatically have their
- * {@link PipelineRunner} registered with this SDK by creating a {@link ServiceLoader} entry
- * and a concrete implementation of this interface.
- *
- * <p>Note that automatic registration of any
- * {@link com.google.cloud.dataflow.sdk.options.PipelineOptions} requires users
- * conform to the limit that each {@link PipelineRunner}'s
- * {@link Class#getSimpleName() simple name} must be unique.
- *
- * <p>It is optional but recommended to use one of the many build time tools such as
- * {@link AutoService} to generate the necessary META-INF files automatically.
- */
-public interface PipelineRunnerRegistrar {
- /**
- * Get the set of {@link PipelineRunner PipelineRunners} to register.
- */
- public Iterable<Class<? extends PipelineRunner<?>>> getPipelineRunners();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/RecordingPipelineVisitor.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/RecordingPipelineVisitor.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/RecordingPipelineVisitor.java
deleted file mode 100644
index ca02b39..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/RecordingPipelineVisitor.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.values.PValue;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * Provides a simple {@link com.google.cloud.dataflow.sdk.Pipeline.PipelineVisitor}
- * that records the transformation tree.
- *
- * <p>Provided for internal unit tests.
- */
-public class RecordingPipelineVisitor implements Pipeline.PipelineVisitor {
-
- public final List<PTransform<?, ?>> transforms = new ArrayList<>();
- public final List<PValue> values = new ArrayList<>();
-
- @Override
- public void enterCompositeTransform(TransformTreeNode node) {
- }
-
- @Override
- public void leaveCompositeTransform(TransformTreeNode node) {
- }
-
- @Override
- public void visitTransform(TransformTreeNode node) {
- transforms.add(node.getTransform());
- }
-
- @Override
- public void visitValue(PValue value, TransformTreeNode producer) {
- values.add(value);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/TransformHierarchy.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/TransformHierarchy.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/TransformHierarchy.java
deleted file mode 100644
index d62192d..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/TransformHierarchy.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.POutput;
-import com.google.cloud.dataflow.sdk.values.PValue;
-import com.google.common.base.Preconditions;
-
-import java.util.Deque;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.Map;
-import java.util.Set;
-
-/**
- * Captures information about a collection of transformations and their
- * associated {@link PValue}s.
- */
-public class TransformHierarchy {
- private final Deque<TransformTreeNode> transformStack = new LinkedList<>();
- private final Map<PInput, TransformTreeNode> producingTransformNode = new HashMap<>();
-
- /**
- * Create a {@code TransformHierarchy} containing a root node.
- */
- public TransformHierarchy() {
- // First element in the stack is the root node, holding all child nodes.
- transformStack.add(new TransformTreeNode(null, null, "", null));
- }
-
- /**
- * Returns the last TransformTreeNode on the stack.
- */
- public TransformTreeNode getCurrent() {
- return transformStack.peek();
- }
-
- /**
- * Add a TransformTreeNode to the stack.
- */
- public void pushNode(TransformTreeNode current) {
- transformStack.push(current);
- }
-
- /**
- * Removes the last TransformTreeNode from the stack.
- */
- public void popNode() {
- transformStack.pop();
- Preconditions.checkState(!transformStack.isEmpty());
- }
-
- /**
- * Adds an input to the given node.
- *
- * <p>This forces the producing node to be finished.
- */
- public void addInput(TransformTreeNode node, PInput input) {
- for (PValue i : input.expand()) {
- TransformTreeNode producer = producingTransformNode.get(i);
- if (producer == null) {
- throw new IllegalStateException("Producer unknown for input: " + i);
- }
-
- producer.finishSpecifying();
- node.addInputProducer(i, producer);
- }
- }
-
- /**
- * Sets the output of a transform node.
- */
- public void setOutput(TransformTreeNode producer, POutput output) {
- producer.setOutput(output);
-
- for (PValue o : output.expand()) {
- producingTransformNode.put(o, producer);
- }
- }
-
- /**
- * Visits all nodes in the transform hierarchy, in transitive order.
- */
- public void visit(Pipeline.PipelineVisitor visitor,
- Set<PValue> visitedNodes) {
- transformStack.peekFirst().visit(visitor, visitedNodes);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/TransformTreeNode.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/TransformTreeNode.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/TransformTreeNode.java
deleted file mode 100644
index 2649458..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/TransformTreeNode.java
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.POutput;
-import com.google.cloud.dataflow.sdk.values.PValue;
-import com.google.common.base.Preconditions;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-
-import javax.annotation.Nullable;
-
-/**
- * Provides internal tracking of transform relationships with helper methods
- * for initialization and ordered visitation.
- */
-public class TransformTreeNode {
- private final TransformTreeNode enclosingNode;
-
- // The PTransform for this node, which may be a composite PTransform.
- // The root of a TransformHierarchy is represented as a TransformTreeNode
- // with a null transform field.
- private final PTransform<?, ?> transform;
-
- private final String fullName;
-
- // Nodes for sub-transforms of a composite transform.
- private final Collection<TransformTreeNode> parts = new ArrayList<>();
-
- // Inputs to the transform, in expanded form and mapped to the producer
- // of the input.
- private final Map<PValue, TransformTreeNode> inputs = new HashMap<>();
-
- // Input to the transform, in unexpanded form.
- private final PInput input;
-
- // TODO: track which outputs need to be exported to parent.
- // Output of the transform, in unexpanded form.
- private POutput output;
-
- private boolean finishedSpecifying = false;
-
- /**
- * Creates a new TransformTreeNode with the given parent and transform.
- *
- * <p>EnclosingNode and transform may both be null for
- * a root-level node, which holds all other nodes.
- *
- * @param enclosingNode the composite node containing this node
- * @param transform the PTransform tracked by this node
- * @param fullName the fully qualified name of the transform
- * @param input the unexpanded input to the transform
- */
- public TransformTreeNode(@Nullable TransformTreeNode enclosingNode,
- @Nullable PTransform<?, ?> transform,
- String fullName,
- @Nullable PInput input) {
- this.enclosingNode = enclosingNode;
- this.transform = transform;
- Preconditions.checkArgument((enclosingNode == null && transform == null)
- || (enclosingNode != null && transform != null),
- "EnclosingNode and transform must both be specified, or both be null");
- this.fullName = fullName;
- this.input = input;
- }
-
- /**
- * Returns the transform associated with this transform node.
- */
- public PTransform<?, ?> getTransform() {
- return transform;
- }
-
- /**
- * Returns the enclosing composite transform node, or null if there is none.
- */
- public TransformTreeNode getEnclosingNode() {
- return enclosingNode;
- }
-
- /**
- * Adds a composite operation to the transform node.
- *
- * <p>As soon as a node is added, the transform node is considered a
- * composite operation instead of a primitive transform.
- */
- public void addComposite(TransformTreeNode node) {
- parts.add(node);
- }
-
- /**
- * Returns true if this node represents a composite transform that does not perform
- * processing of its own, but merely encapsulates a sub-pipeline (which may be empty).
- *
- * <p>Note that a node may be composite with no sub-transforms if it returns its input directly,
- * extracts a component of a tuple, or performs other operations that occur at pipeline assembly time.
- */
- public boolean isCompositeNode() {
- return !parts.isEmpty() || returnsOthersOutput() || isRootNode();
- }
-
- private boolean returnsOthersOutput() {
- PTransform<?, ?> transform = getTransform();
- for (PValue output : getExpandedOutputs()) {
- if (!output.getProducingTransformInternal().getTransform().equals(transform)) {
- return true;
- }
- }
- return false;
- }
-
- public boolean isRootNode() {
- return transform == null;
- }
-
- public String getFullName() {
- return fullName;
- }
-
- /**
- * Adds an input to the transform node.
- */
- public void addInputProducer(PValue expandedInput, TransformTreeNode producer) {
- Preconditions.checkState(!finishedSpecifying);
- inputs.put(expandedInput, producer);
- }
-
- /**
- * Returns the transform input, in unexpanded form.
- */
- public PInput getInput() {
- return input;
- }
-
- /**
- * Returns a mapping of inputs to the producing nodes for all inputs to
- * the transform.
- */
- public Map<PValue, TransformTreeNode> getInputs() {
- return Collections.unmodifiableMap(inputs);
- }
-
- /**
- * Sets the output of the transform node.
- */
- public void setOutput(POutput output) {
- Preconditions.checkState(!finishedSpecifying);
- Preconditions.checkState(this.output == null);
- this.output = output;
- }
-
- /**
- * Returns the transform output, in unexpanded form.
- */
- public POutput getOutput() {
- return output;
- }
-
- /**
- * Returns the transform outputs, in expanded form.
- */
- public Collection<? extends PValue> getExpandedOutputs() {
- if (output != null) {
- return output.expand();
- } else {
- return Collections.emptyList();
- }
- }
-
- /**
- * Visit the transform node.
- *
- * <p>Provides an ordered visit of the input values, the primitive
- * transform (or child nodes for composite transforms), then the
- * output values.
- */
- public void visit(Pipeline.PipelineVisitor visitor,
- Set<PValue> visitedValues) {
- if (!finishedSpecifying) {
- finishSpecifying();
- }
-
- // Visit inputs.
- for (Map.Entry<PValue, TransformTreeNode> entry : inputs.entrySet()) {
- if (visitedValues.add(entry.getKey())) {
- visitor.visitValue(entry.getKey(), entry.getValue());
- }
- }
-
- if (isCompositeNode()) {
- visitor.enterCompositeTransform(this);
- for (TransformTreeNode child : parts) {
- child.visit(visitor, visitedValues);
- }
- visitor.leaveCompositeTransform(this);
- } else {
- visitor.visitTransform(this);
- }
-
- // Visit outputs.
- for (PValue pValue : getExpandedOutputs()) {
- if (visitedValues.add(pValue)) {
- visitor.visitValue(pValue, this);
- }
- }
- }
-
- /**
- * Finish specifying a transform.
- *
- * <p>All inputs are finished first, then the transform, then
- * all outputs.
- */
- public void finishSpecifying() {
- if (finishedSpecifying) {
- return;
- }
- finishedSpecifying = true;
-
- for (TransformTreeNode input : inputs.values()) {
- if (input != null) {
- input.finishSpecifying();
- }
- }
-
- if (output != null) {
- output.finishSpecifyingOutput();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/AssignWindows.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/AssignWindows.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/AssignWindows.java
deleted file mode 100644
index 093783d..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/AssignWindows.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.dataflow;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-/**
- * A primitive {@link PTransform} that implements the {@link Window#into(WindowFn)}
- * {@link PTransform}.
- *
- * <p>For an application of {@link Window#into(WindowFn)} that changes the {@link WindowFn}, applies
- * a primitive {@link PTransform} in the Dataflow service.
- *
- * <p>For an application of {@link Window#into(WindowFn)} that does not change the {@link WindowFn},
- * applies an identity {@link ParDo} and sets the windowing strategy of the output
- * {@link PCollection}.
- *
- * <p>For internal use only.
- *
- * @param <T> the type of input element
- */
-public class AssignWindows<T> extends PTransform<PCollection<T>, PCollection<T>> {
- private final Window.Bound<T> transform;
-
- /**
- * Builds an instance of this class from the overridden transform.
- */
- @SuppressWarnings("unused") // Used via reflection
- public AssignWindows(Window.Bound<T> transform) {
- this.transform = transform;
- }
-
- @Override
- public PCollection<T> apply(PCollection<T> input) {
- WindowingStrategy<?, ?> outputStrategy =
- transform.getOutputStrategyInternal(input.getWindowingStrategy());
- if (transform.getWindowFn() != null) {
- // If the windowFn changed, we create a primitive, and run the AssignWindows operation here.
- return PCollection.<T>createPrimitiveOutputInternal(
- input.getPipeline(), outputStrategy, input.isBounded());
- } else {
- // If the windowFn didn't change, we just run a pass-through transform and then set the
- // new windowing strategy.
- return input.apply(ParDo.named("Identity").of(new DoFn<T, T>() {
- @Override
- public void processElement(DoFn<T, T>.ProcessContext c) throws Exception {
- c.output(c.element());
- }
- })).setWindowingStrategyInternal(outputStrategy);
- }
- }
-
- @Override
- public void validate(PCollection<T> input) {
- transform.validate(input);
- }
-
- @Override
- protected Coder<?> getDefaultOutputCoder(PCollection<T> input) {
- return input.getCoder();
- }
-
- @Override
- protected String getKindString() {
- return "Window.Into()";
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/BigQueryIOTranslator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/BigQueryIOTranslator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/BigQueryIOTranslator.java
deleted file mode 100644
index 538901c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/BigQueryIOTranslator.java
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners.dataflow;
-
-import com.google.api.client.json.JsonFactory;
-import com.google.api.services.bigquery.model.TableReference;
-import com.google.cloud.dataflow.sdk.coders.TableRowJsonCoder;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.util.Transport;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-
-/**
- * BigQuery transform support code for the Dataflow backend.
- */
-public class BigQueryIOTranslator {
- private static final JsonFactory JSON_FACTORY = Transport.getJsonFactory();
- private static final Logger LOG = LoggerFactory.getLogger(BigQueryIOTranslator.class);
-
- /**
- * Implements BigQueryIO Read translation for the Dataflow backend.
- */
- public static class ReadTranslator
- implements DataflowPipelineTranslator.TransformTranslator<BigQueryIO.Read.Bound> {
-
- @Override
- public void translate(
- BigQueryIO.Read.Bound transform, DataflowPipelineTranslator.TranslationContext context) {
- // Actual translation.
- context.addStep(transform, "ParallelRead");
- context.addInput(PropertyNames.FORMAT, "bigquery");
- context.addInput(PropertyNames.BIGQUERY_EXPORT_FORMAT, "FORMAT_AVRO");
-
- if (transform.getQuery() != null) {
- context.addInput(PropertyNames.BIGQUERY_QUERY, transform.getQuery());
- context.addInput(PropertyNames.BIGQUERY_FLATTEN_RESULTS, transform.getFlattenResults());
- } else {
- TableReference table = transform.getTable();
- if (table.getProjectId() == null) {
- // If user does not specify a project we assume the table to be located in the project
- // that owns the Dataflow job.
- String projectIdFromOptions = context.getPipelineOptions().getProject();
- LOG.warn(String.format(BigQueryIO.SET_PROJECT_FROM_OPTIONS_WARNING, table.getDatasetId(),
- table.getDatasetId(), table.getTableId(), projectIdFromOptions));
- table.setProjectId(projectIdFromOptions);
- }
-
- context.addInput(PropertyNames.BIGQUERY_TABLE, table.getTableId());
- context.addInput(PropertyNames.BIGQUERY_DATASET, table.getDatasetId());
- if (table.getProjectId() != null) {
- context.addInput(PropertyNames.BIGQUERY_PROJECT, table.getProjectId());
- }
- }
- context.addValueOnlyOutput(PropertyNames.OUTPUT, context.getOutput(transform));
- }
- }
-
- /**
- * Implements BigQueryIO Write translation for the Dataflow backend.
- */
- public static class WriteTranslator
- implements DataflowPipelineTranslator.TransformTranslator<BigQueryIO.Write.Bound> {
-
- @Override
- public void translate(BigQueryIO.Write.Bound transform,
- DataflowPipelineTranslator.TranslationContext context) {
- if (context.getPipelineOptions().isStreaming()) {
- // Streaming is handled by the streaming runner.
- throw new AssertionError(
- "BigQueryIO is specified to use streaming write in batch mode.");
- }
-
- TableReference table = transform.getTable();
-
- // Actual translation.
- context.addStep(transform, "ParallelWrite");
- context.addInput(PropertyNames.FORMAT, "bigquery");
- context.addInput(PropertyNames.BIGQUERY_TABLE,
- table.getTableId());
- context.addInput(PropertyNames.BIGQUERY_DATASET,
- table.getDatasetId());
- if (table.getProjectId() != null) {
- context.addInput(PropertyNames.BIGQUERY_PROJECT, table.getProjectId());
- }
- if (transform.getSchema() != null) {
- try {
- context.addInput(PropertyNames.BIGQUERY_SCHEMA,
- JSON_FACTORY.toString(transform.getSchema()));
- } catch (IOException exn) {
- throw new IllegalArgumentException("Invalid table schema.", exn);
- }
- }
- context.addInput(
- PropertyNames.BIGQUERY_CREATE_DISPOSITION,
- transform.getCreateDisposition().name());
- context.addInput(
- PropertyNames.BIGQUERY_WRITE_DISPOSITION,
- transform.getWriteDisposition().name());
- // Set sink encoding to TableRowJsonCoder.
- context.addEncodingInput(
- WindowedValue.getValueOnlyCoder(TableRowJsonCoder.of()));
- context.addInput(PropertyNames.PARALLEL_INPUT, context.getInput(transform));
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/CustomSources.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/CustomSources.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/CustomSources.java
deleted file mode 100644
index 8160693..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/CustomSources.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners.dataflow;
-
-import static com.google.api.client.util.Base64.encodeBase64String;
-import static com.google.cloud.dataflow.sdk.util.SerializableUtils.serializeToByteArray;
-import static com.google.cloud.dataflow.sdk.util.Structs.addString;
-import static com.google.cloud.dataflow.sdk.util.Structs.addStringList;
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.google.api.services.dataflow.model.SourceMetadata;
-import com.google.cloud.dataflow.sdk.io.BoundedSource;
-import com.google.cloud.dataflow.sdk.io.Source;
-import com.google.cloud.dataflow.sdk.io.UnboundedSource;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.protobuf.ByteString;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.ArrayList;
-import java.util.List;
-
-
-/**
- * A helper class for supporting sources defined as {@code Source}.
- *
- * <p>Provides a bridge between the high-level {@code Source} API and the
- * low-level {@code CloudSource} class.
- */
-public class CustomSources {
- private static final String SERIALIZED_SOURCE = "serialized_source";
- @VisibleForTesting static final String SERIALIZED_SOURCE_SPLITS = "serialized_source_splits";
- /**
- * The current limit on the size of a ReportWorkItemStatus RPC to Google Cloud Dataflow, which
- * includes the initial splits, is 20 MB.
- */
- public static final long DATAFLOW_SPLIT_RESPONSE_API_SIZE_BYTES = 20 * (1 << 20);
-
- private static final Logger LOG = LoggerFactory.getLogger(CustomSources.class);
-
- private static final ByteString firstSplitKey = ByteString.copyFromUtf8("0000000000000001");
-
- public static boolean isFirstUnboundedSourceSplit(ByteString splitKey) {
- return splitKey.equals(firstSplitKey);
- }
-
- private static int getDesiredNumUnboundedSourceSplits(DataflowPipelineOptions options) {
- if (options.getMaxNumWorkers() > 0) {
- return options.getMaxNumWorkers();
- } else if (options.getNumWorkers() > 0) {
- return options.getNumWorkers() * 3;
- } else {
- return 20;
- }
- }
-
- public static com.google.api.services.dataflow.model.Source serializeToCloudSource(
- Source<?> source, PipelineOptions options) throws Exception {
- com.google.api.services.dataflow.model.Source cloudSource =
- new com.google.api.services.dataflow.model.Source();
- // We ourselves act as the SourceFormat.
- cloudSource.setSpec(CloudObject.forClass(CustomSources.class));
- addString(
- cloudSource.getSpec(), SERIALIZED_SOURCE, encodeBase64String(serializeToByteArray(source)));
-
- SourceMetadata metadata = new SourceMetadata();
- if (source instanceof BoundedSource) {
- BoundedSource<?> boundedSource = (BoundedSource<?>) source;
- try {
- metadata.setProducesSortedKeys(boundedSource.producesSortedKeys(options));
- } catch (Exception e) {
- LOG.warn("Failed to check if the source produces sorted keys: " + source, e);
- }
-
- // Size estimation is best effort so we continue even if it fails here.
- try {
- metadata.setEstimatedSizeBytes(boundedSource.getEstimatedSizeBytes(options));
- } catch (Exception e) {
- LOG.warn("Size estimation of the source failed: " + source, e);
- }
- } else if (source instanceof UnboundedSource) {
- UnboundedSource<?, ?> unboundedSource = (UnboundedSource<?, ?>) source;
- metadata.setInfinite(true);
- List<String> encodedSplits = new ArrayList<>();
- int desiredNumSplits =
- getDesiredNumUnboundedSourceSplits(options.as(DataflowPipelineOptions.class));
- for (UnboundedSource<?, ?> split :
- unboundedSource.generateInitialSplits(desiredNumSplits, options)) {
- encodedSplits.add(encodeBase64String(serializeToByteArray(split)));
- }
- checkArgument(!encodedSplits.isEmpty(), "UnboundedSources must have at least one split");
- addStringList(cloudSource.getSpec(), SERIALIZED_SOURCE_SPLITS, encodedSplits);
- } else {
- throw new IllegalArgumentException("Unexpected source kind: " + source.getClass());
- }
-
- cloudSource.setMetadata(metadata);
- return cloudSource;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/DataflowAggregatorTransforms.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/DataflowAggregatorTransforms.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/DataflowAggregatorTransforms.java
deleted file mode 100644
index e1d7301..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/DataflowAggregatorTransforms.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners.dataflow;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.common.collect.BiMap;
-import com.google.common.collect.HashBiMap;
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.Multimap;
-
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.Map;
-
-/**
- * A mapping relating {@link Aggregator}s and the {@link PTransform} in which they are used.
- */
-public class DataflowAggregatorTransforms {
- private final Map<Aggregator<?, ?>, Collection<PTransform<?, ?>>> aggregatorTransforms;
- private final Multimap<PTransform<?, ?>, AppliedPTransform<?, ?, ?>> transformAppliedTransforms;
- private final BiMap<AppliedPTransform<?, ?, ?>, String> appliedStepNames;
-
- public DataflowAggregatorTransforms(
- Map<Aggregator<?, ?>, Collection<PTransform<?, ?>>> aggregatorTransforms,
- Map<AppliedPTransform<?, ?, ?>, String> transformStepNames) {
- this.aggregatorTransforms = aggregatorTransforms;
- appliedStepNames = HashBiMap.create(transformStepNames);
-
- transformAppliedTransforms = HashMultimap.create();
- for (AppliedPTransform<?, ?, ?> appliedTransform : transformStepNames.keySet()) {
- transformAppliedTransforms.put(appliedTransform.getTransform(), appliedTransform);
- }
- }
-
- /**
- * Returns true if the provided {@link Aggregator} is used in the constructing {@link Pipeline}.
- */
- public boolean contains(Aggregator<?, ?> aggregator) {
- return aggregatorTransforms.containsKey(aggregator);
- }
-
- /**
- * Gets the step names in which the {@link Aggregator} is used.
- */
- public Collection<String> getAggregatorStepNames(Aggregator<?, ?> aggregator) {
- Collection<String> names = new HashSet<>();
- Collection<PTransform<?, ?>> transforms = aggregatorTransforms.get(aggregator);
- for (PTransform<?, ?> transform : transforms) {
- for (AppliedPTransform<?, ?, ?> applied : transformAppliedTransforms.get(transform)) {
- names.add(appliedStepNames.get(applied));
- }
- }
- return names;
- }
-
- /**
- * Gets the {@link AppliedPTransform} that was assigned the provided step name.
- */
- public AppliedPTransform<?, ?, ?> getAppliedTransformForStepName(String stepName) {
- return appliedStepNames.inverse().get(stepName);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/DataflowMetricUpdateExtractor.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/DataflowMetricUpdateExtractor.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/DataflowMetricUpdateExtractor.java
deleted file mode 100644
index 13016dd..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/DataflowMetricUpdateExtractor.java
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners.dataflow;
-
-import com.google.api.services.dataflow.model.MetricStructuredName;
-import com.google.api.services.dataflow.model.MetricUpdate;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-/**
- * Methods for extracting the values of an {@link Aggregator} from a collection of {@link
- * MetricUpdate MetricUpdates}.
- */
-public final class DataflowMetricUpdateExtractor {
- private static final String STEP_NAME_CONTEXT_KEY = "step";
- private static final String IS_TENTATIVE_KEY = "tentative";
-
- private DataflowMetricUpdateExtractor() {
- // Do not instantiate.
- }
-
- /**
- * Extract the values of the provided {@link Aggregator} at each {@link PTransform} it was used in
- * according to the provided {@link DataflowAggregatorTransforms} from the given list of {@link
- * MetricUpdate MetricUpdates}.
- */
- public static <OutputT> Map<String, OutputT> fromMetricUpdates(Aggregator<?, OutputT> aggregator,
- DataflowAggregatorTransforms aggregatorTransforms, List<MetricUpdate> metricUpdates) {
- Map<String, OutputT> results = new HashMap<>();
- if (metricUpdates == null) {
- return results;
- }
-
- String aggregatorName = aggregator.getName();
- Collection<String> aggregatorSteps = aggregatorTransforms.getAggregatorStepNames(aggregator);
-
- for (MetricUpdate metricUpdate : metricUpdates) {
- MetricStructuredName metricStructuredName = metricUpdate.getName();
- Map<String, String> context = metricStructuredName.getContext();
- if (metricStructuredName.getName().equals(aggregatorName) && context != null
- && aggregatorSteps.contains(context.get(STEP_NAME_CONTEXT_KEY))) {
- AppliedPTransform<?, ?, ?> transform =
- aggregatorTransforms.getAppliedTransformForStepName(
- context.get(STEP_NAME_CONTEXT_KEY));
- String fullName = transform.getFullName();
- // Prefer the tentative (fresher) value if it exists.
- if (Boolean.parseBoolean(context.get(IS_TENTATIVE_KEY)) || !results.containsKey(fullName)) {
- results.put(fullName, toValue(aggregator, metricUpdate));
- }
- }
- }
-
- return results;
-
- }
-
- private static <OutputT> OutputT toValue(
- Aggregator<?, OutputT> aggregator, MetricUpdate metricUpdate) {
- CombineFn<?, ?, OutputT> combineFn = aggregator.getCombineFn();
- Class<? super OutputT> outputType = combineFn.getOutputType().getRawType();
-
- if (outputType.equals(Long.class)) {
- @SuppressWarnings("unchecked")
- OutputT asLong = (OutputT) Long.valueOf(toNumber(metricUpdate).longValue());
- return asLong;
- }
- if (outputType.equals(Integer.class)) {
- @SuppressWarnings("unchecked")
- OutputT asInt = (OutputT) Integer.valueOf(toNumber(metricUpdate).intValue());
- return asInt;
- }
- if (outputType.equals(Double.class)) {
- @SuppressWarnings("unchecked")
- OutputT asDouble = (OutputT) Double.valueOf(toNumber(metricUpdate).doubleValue());
- return asDouble;
- }
- throw new UnsupportedOperationException(
- "Unsupported Output Type " + outputType + " in aggregator " + aggregator);
- }
-
- private static Number toNumber(MetricUpdate update) {
- if (update.getScalar() instanceof Number) {
- return (Number) update.getScalar();
- }
- throw new IllegalArgumentException(
- "Metric Update " + update + " does not have a numeric scalar");
- }
-}
-
[64/67] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
Directory reorganization
Move Java 8-specific tests from "java8tests" into "sdks/java/java8tests/".
Project: http://git-wip-us.apache.org/repos/asf/incubator-beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-beam/commit/d4233aa0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-beam/tree/d4233aa0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-beam/diff/d4233aa0
Branch: refs/heads/master
Commit: d4233aa009375ec9a3e393bcec6b496920a6a54e
Parents: 11bb9e0
Author: Davor Bonaci <da...@google.com>
Authored: Wed Mar 23 17:44:35 2016 -0700
Committer: Davor Bonaci <da...@google.com>
Committed: Wed Mar 23 18:33:33 2016 -0700
----------------------------------------------------------------------
java8tests/pom.xml | 184 -------------------
.../sdk/transforms/CombineJava8Test.java | 133 --------------
.../sdk/transforms/FilterJava8Test.java | 118 ------------
.../transforms/FlatMapElementsJava8Test.java | 84 ---------
.../sdk/transforms/MapElementsJava8Test.java | 77 --------
.../sdk/transforms/PartitionJava8Test.java | 74 --------
.../transforms/RemoveDuplicatesJava8Test.java | 98 ----------
.../sdk/transforms/WithKeysJava8Test.java | 73 --------
.../sdk/transforms/WithTimestampsJava8Test.java | 65 -------
pom.xml | 2 +-
sdks/java/java8tests/pom.xml | 184 +++++++++++++++++++
.../sdk/transforms/CombineJava8Test.java | 133 ++++++++++++++
.../sdk/transforms/FilterJava8Test.java | 118 ++++++++++++
.../transforms/FlatMapElementsJava8Test.java | 84 +++++++++
.../sdk/transforms/MapElementsJava8Test.java | 77 ++++++++
.../sdk/transforms/PartitionJava8Test.java | 74 ++++++++
.../transforms/RemoveDuplicatesJava8Test.java | 98 ++++++++++
.../sdk/transforms/WithKeysJava8Test.java | 73 ++++++++
.../sdk/transforms/WithTimestampsJava8Test.java | 65 +++++++
19 files changed, 907 insertions(+), 907 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/java8tests/pom.xml
----------------------------------------------------------------------
diff --git a/java8tests/pom.xml b/java8tests/pom.xml
deleted file mode 100644
index 1d253dc..0000000
--- a/java8tests/pom.xml
+++ /dev/null
@@ -1,184 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.beam</groupId>
- <artifactId>parent</artifactId>
- <version>0.1.0-incubating-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <artifactId>java8tests-all</artifactId>
- <name>Apache Beam :: Tests :: Java 8 All</name>
- <description>Apache Beam Java SDK provides a simple, Java-based
- interface for processing virtually any size data.
- This artifact includes tests of the SDK from a Java 8
- user.</description>
-
- <packaging>jar</packaging>
-
- <profiles>
- <profile>
- <id>DataflowPipelineTests</id>
- <properties>
- <runIntegrationTestOnService>true</runIntegrationTestOnService>
- <testGroups>com.google.cloud.dataflow.sdk.testing.RunnableOnService</testGroups>
- <testParallelValue>both</testParallelValue>
- </properties>
- </profile>
- </profiles>
-
- <build>
- <plugins>
- <plugin>
- <artifactId>maven-compiler-plugin</artifactId>
- <configuration>
- <testSource>1.8</testSource>
- <testTarget>1.8</testTarget>
- </configuration>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- <executions>
- <execution>
- <goals><goal>analyze-only</goal></goals>
- <configuration>
- <failOnWarning>true</failOnWarning>
- </configuration>
- </execution>
- </executions>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-checkstyle-plugin</artifactId>
- <version>2.12</version>
- <dependencies>
- <dependency>
- <groupId>com.puppycrawl.tools</groupId>
- <artifactId>checkstyle</artifactId>
- <version>6.6</version>
- </dependency>
- </dependencies>
- <configuration>
- <configLocation>../checkstyle.xml</configLocation>
- <consoleOutput>true</consoleOutput>
- <failOnViolation>true</failOnViolation>
- <includeTestSourceDirectory>true</includeTestSourceDirectory>
- <includeResources>false</includeResources>
- </configuration>
- <executions>
- <execution>
- <goals>
- <goal>check</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
-
- <!-- Source plugin for generating source and test-source JARs. -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-source-plugin</artifactId>
- <version>2.4</version>
- <executions>
- <execution>
- <id>attach-sources</id>
- <phase>compile</phase>
- <goals>
- <goal>jar</goal>
- </goals>
- </execution>
- <execution>
- <id>attach-test-sources</id>
- <phase>test-compile</phase>
- <goals>
- <goal>test-jar</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- <executions>
- <execution>
- <id>default-jar</id>
- <goals>
- <goal>jar</goal>
- </goals>
- </execution>
- <execution>
- <id>default-test-jar</id>
- <goals>
- <goal>test-jar</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
-
- <!-- Coverage analysis for unit tests. -->
- <plugin>
- <groupId>org.jacoco</groupId>
- <artifactId>jacoco-maven-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
- <dependencies>
- <dependency>
- <groupId>org.apache.beam</groupId>
- <artifactId>java-sdk-all</artifactId>
- <version>${project.version}</version>
- </dependency>
-
- <dependency>
- <groupId>com.google.guava</groupId>
- <artifactId>guava</artifactId>
- <version>${guava.version}</version>
- </dependency>
-
- <dependency>
- <groupId>joda-time</groupId>
- <artifactId>joda-time</artifactId>
- <version>${joda.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.hamcrest</groupId>
- <artifactId>hamcrest-all</artifactId>
- <version>${hamcrest.version}</version>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>${junit.version}</version>
- <scope>test</scope>
- </dependency>
- </dependencies>
-</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/CombineJava8Test.java
----------------------------------------------------------------------
diff --git a/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/CombineJava8Test.java b/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/CombineJava8Test.java
deleted file mode 100644
index b569e49..0000000
--- a/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/CombineJava8Test.java
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.ExpectedException;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.io.Serializable;
-
-/**
- * Java 8 Tests for {@link Combine}.
- */
-@RunWith(JUnit4.class)
-@SuppressWarnings("serial")
-public class CombineJava8Test implements Serializable {
-
- @Rule
- public transient ExpectedException thrown = ExpectedException.none();
-
- /**
- * Class for use in testing use of Java 8 method references.
- */
- private static class Summer implements Serializable {
- public int sum(Iterable<Integer> integers) {
- int sum = 0;
- for (int i : integers) {
- sum += i;
- }
- return sum;
- }
- }
-
- /**
- * Tests creation of a global {@link Combine} via Java 8 lambda.
- */
- @Test
- public void testCombineGloballyLambda() {
- Pipeline pipeline = TestPipeline.create();
-
- PCollection<Integer> output = pipeline
- .apply(Create.of(1, 2, 3, 4))
- .apply(Combine.globally(integers -> {
- int sum = 0;
- for (int i : integers) {
- sum += i;
- }
- return sum;
- }));
-
- DataflowAssert.that(output).containsInAnyOrder(10);
- pipeline.run();
- }
-
- /**
- * Tests creation of a global {@link Combine} via a Java 8 method reference.
- */
- @Test
- public void testCombineGloballyInstanceMethodReference() {
- Pipeline pipeline = TestPipeline.create();
-
- PCollection<Integer> output = pipeline
- .apply(Create.of(1, 2, 3, 4))
- .apply(Combine.globally(new Summer()::sum));
-
- DataflowAssert.that(output).containsInAnyOrder(10);
- pipeline.run();
- }
-
- /**
- * Tests creation of a per-key {@link Combine} via a Java 8 lambda.
- */
- @Test
- public void testCombinePerKeyLambda() {
- Pipeline pipeline = TestPipeline.create();
-
- PCollection<KV<String, Integer>> output = pipeline
- .apply(Create.of(KV.of("a", 1), KV.of("b", 2), KV.of("a", 3), KV.of("c", 4)))
- .apply(Combine.perKey(integers -> {
- int sum = 0;
- for (int i : integers) {
- sum += i;
- }
- return sum;
- }));
-
- DataflowAssert.that(output).containsInAnyOrder(
- KV.of("a", 4),
- KV.of("b", 2),
- KV.of("c", 4));
- pipeline.run();
- }
-
- /**
- * Tests creation of a per-key {@link Combine} via a Java 8 method reference.
- */
- @Test
- public void testCombinePerKeyInstanceMethodReference() {
- Pipeline pipeline = TestPipeline.create();
-
- PCollection<KV<String, Integer>> output = pipeline
- .apply(Create.of(KV.of("a", 1), KV.of("b", 2), KV.of("a", 3), KV.of("c", 4)))
- .apply(Combine.perKey(new Summer()::sum));
-
- DataflowAssert.that(output).containsInAnyOrder(
- KV.of("a", 4),
- KV.of("b", 2),
- KV.of("c", 4));
- pipeline.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/FilterJava8Test.java
----------------------------------------------------------------------
diff --git a/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/FilterJava8Test.java b/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/FilterJava8Test.java
deleted file mode 100644
index db65932..0000000
--- a/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/FilterJava8Test.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.experimental.categories.Category;
-import org.junit.rules.ExpectedException;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.io.Serializable;
-
-/**
- * Java 8 Tests for {@link Filter}.
- */
-@RunWith(JUnit4.class)
-@SuppressWarnings("serial")
-public class FilterJava8Test implements Serializable {
-
- @Rule
- public transient ExpectedException thrown = ExpectedException.none();
-
- @Test
- @Category(RunnableOnService.class)
- public void testIdentityFilterByPredicate() {
- Pipeline pipeline = TestPipeline.create();
-
- PCollection<Integer> output = pipeline
- .apply(Create.of(591, 11789, 1257, 24578, 24799, 307))
- .apply(Filter.byPredicate(i -> true));
-
- DataflowAssert.that(output).containsInAnyOrder(591, 11789, 1257, 24578, 24799, 307);
- pipeline.run();
- }
-
- @Test
- public void testNoFilterByPredicate() {
- Pipeline pipeline = TestPipeline.create();
-
- PCollection<Integer> output = pipeline
- .apply(Create.of(1, 2, 4, 5))
- .apply(Filter.byPredicate(i -> false));
-
- DataflowAssert.that(output).empty();
- pipeline.run();
- }
-
- @Test
- @Category(RunnableOnService.class)
- public void testFilterByPredicate() {
- Pipeline pipeline = TestPipeline.create();
-
- PCollection<Integer> output = pipeline
- .apply(Create.of(1, 2, 3, 4, 5, 6, 7))
- .apply(Filter.byPredicate(i -> i % 2 == 0));
-
- DataflowAssert.that(output).containsInAnyOrder(2, 4, 6);
- pipeline.run();
- }
-
- /**
- * Confirms that in Java 8 style, where a lambda results in a rawtype, the output type token is
- * not useful. If this test ever fails there may be simplifications available to us.
- */
- @Test
- public void testFilterParDoOutputTypeDescriptorRaw() throws Exception {
- Pipeline pipeline = TestPipeline.create();
-
- @SuppressWarnings({"unchecked", "rawtypes"})
- PCollection<String> output = pipeline
- .apply(Create.of("hello"))
- .apply(Filter.by(s -> true));
-
- thrown.expect(CannotProvideCoderException.class);
- pipeline.getCoderRegistry().getDefaultCoder(output.getTypeDescriptor());
- }
-
- @Test
- @Category(RunnableOnService.class)
- public void testFilterByMethodReference() {
- Pipeline pipeline = TestPipeline.create();
-
- PCollection<Integer> output = pipeline
- .apply(Create.of(1, 2, 3, 4, 5, 6, 7))
- .apply(Filter.byPredicate(new EvenFilter()::isEven));
-
- DataflowAssert.that(output).containsInAnyOrder(2, 4, 6);
- pipeline.run();
- }
-
- private static class EvenFilter implements Serializable {
- public boolean isEven(int i) {
- return i % 2 == 0;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/FlatMapElementsJava8Test.java
----------------------------------------------------------------------
diff --git a/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/FlatMapElementsJava8Test.java b/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/FlatMapElementsJava8Test.java
deleted file mode 100644
index e0b946b..0000000
--- a/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/FlatMapElementsJava8Test.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-import com.google.common.collect.ImmutableList;
-
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.ExpectedException;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.io.Serializable;
-import java.util.List;
-
-/**
- * Java 8 Tests for {@link FlatMapElements}.
- */
-@RunWith(JUnit4.class)
-public class FlatMapElementsJava8Test implements Serializable {
-
- @Rule
- public transient ExpectedException thrown = ExpectedException.none();
-
- /**
- * Basic test of {@link FlatMapElements} with a lambda (which is instantiated as a
- * {@link SerializableFunction}).
- */
- @Test
- public void testFlatMapBasic() throws Exception {
- Pipeline pipeline = TestPipeline.create();
- PCollection<Integer> output = pipeline
- .apply(Create.of(1, 2, 3))
- .apply(FlatMapElements
- // Note that the input type annotation is required.
- .via((Integer i) -> ImmutableList.of(i, -i))
- .withOutputType(new TypeDescriptor<Integer>() {}));
-
- DataflowAssert.that(output).containsInAnyOrder(1, 3, -1, -3, 2, -2);
- pipeline.run();
- }
-
- /**
- * Basic test of {@link FlatMapElements} with a method reference.
- */
- @Test
- public void testFlatMapMethodReference() throws Exception {
- Pipeline pipeline = TestPipeline.create();
- PCollection<Integer> output = pipeline
- .apply(Create.of(1, 2, 3))
- .apply(FlatMapElements
- // Note that the input type annotation is required.
- .via(new Negater()::numAndNegation)
- .withOutputType(new TypeDescriptor<Integer>() {}));
-
- DataflowAssert.that(output).containsInAnyOrder(1, 3, -1, -3, 2, -2);
- pipeline.run();
- }
-
- private static class Negater implements Serializable {
- public List<Integer> numAndNegation(int input) {
- return ImmutableList.of(input, -input);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/MapElementsJava8Test.java
----------------------------------------------------------------------
diff --git a/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/MapElementsJava8Test.java b/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/MapElementsJava8Test.java
deleted file mode 100644
index 123e680..0000000
--- a/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/MapElementsJava8Test.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.io.Serializable;
-
-/**
- * Java 8 tests for {@link MapElements}.
- */
-@RunWith(JUnit4.class)
-public class MapElementsJava8Test implements Serializable {
-
- /**
- * Basic test of {@link MapElements} with a lambda (which is instantiated as a
- * {@link SerializableFunction}).
- */
- @Test
- public void testMapBasic() throws Exception {
- Pipeline pipeline = TestPipeline.create();
- PCollection<Integer> output = pipeline
- .apply(Create.of(1, 2, 3))
- .apply(MapElements
- // Note that the type annotation is required (for Java, not for Dataflow)
- .via((Integer i) -> i * 2)
- .withOutputType(new TypeDescriptor<Integer>() {}));
-
- DataflowAssert.that(output).containsInAnyOrder(6, 2, 4);
- pipeline.run();
- }
-
- /**
- * Basic test of {@link MapElements} with a method reference.
- */
- @Test
- public void testMapMethodReference() throws Exception {
- Pipeline pipeline = TestPipeline.create();
- PCollection<Integer> output = pipeline
- .apply(Create.of(1, 2, 3))
- .apply(MapElements
- // Note that the type annotation is required (for Java, not for Dataflow)
- .via(new Doubler()::doubleIt)
- .withOutputType(new TypeDescriptor<Integer>() {}));
-
- DataflowAssert.that(output).containsInAnyOrder(6, 2, 4);
- pipeline.run();
- }
-
- private static class Doubler implements Serializable {
- public int doubleIt(int val) {
- return val * 2;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/PartitionJava8Test.java
----------------------------------------------------------------------
diff --git a/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/PartitionJava8Test.java b/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/PartitionJava8Test.java
deleted file mode 100644
index c459ada..0000000
--- a/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/PartitionJava8Test.java
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import static org.junit.Assert.assertEquals;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.values.PCollectionList;
-
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.ExpectedException;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.io.Serializable;
-
-/**
- * Java 8 Tests for {@link Filter}.
- */
-@RunWith(JUnit4.class)
-@SuppressWarnings("serial")
-public class PartitionJava8Test implements Serializable {
-
- @Rule
- public transient ExpectedException thrown = ExpectedException.none();
-
- @Test
- public void testModPartition() {
- Pipeline pipeline = TestPipeline.create();
-
- PCollectionList<Integer> outputs = pipeline
- .apply(Create.of(1, 2, 4, 5))
- .apply(Partition.of(3, (element, numPartitions) -> element % numPartitions));
- assertEquals(3, outputs.size());
- DataflowAssert.that(outputs.get(0)).empty();
- DataflowAssert.that(outputs.get(1)).containsInAnyOrder(1, 4);
- DataflowAssert.that(outputs.get(2)).containsInAnyOrder(2, 5);
- pipeline.run();
- }
-
- /**
- * Confirms that in Java 8 style, where a lambda results in a rawtype, the output type token is
- * not useful. If this test ever fails there may be simplifications available to us.
- */
- @Test
- public void testPartitionFnOutputTypeDescriptorRaw() throws Exception {
- Pipeline pipeline = TestPipeline.create();
-
- PCollectionList<String> output = pipeline
- .apply(Create.of("hello"))
- .apply(Partition.of(1, (element, numPartitions) -> 0));
-
- thrown.expect(CannotProvideCoderException.class);
- pipeline.getCoderRegistry().getDefaultCoder(output.get(0).getTypeDescriptor());
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/RemoveDuplicatesJava8Test.java
----------------------------------------------------------------------
diff --git a/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/RemoveDuplicatesJava8Test.java b/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/RemoveDuplicatesJava8Test.java
deleted file mode 100644
index dfa1ca6..0000000
--- a/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/RemoveDuplicatesJava8Test.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.transforms;
-
-import static org.hamcrest.Matchers.contains;
-import static org.hamcrest.Matchers.hasItem;
-import static org.hamcrest.Matchers.not;
-import static org.junit.Assert.assertThat;
-
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.Multimap;
-
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.ExpectedException;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.util.HashSet;
-import java.util.Set;
-
-/**
- * Java 8 tests for {@link RemoveDuplicates}.
- */
-@RunWith(JUnit4.class)
-public class RemoveDuplicatesJava8Test {
-
- @Rule
- public ExpectedException thrown = ExpectedException.none();
-
- @Test
- public void withLambdaRepresentativeValuesFnAndTypeDescriptorShouldApplyFn() {
- TestPipeline p = TestPipeline.create();
-
- Multimap<Integer, String> predupedContents = HashMultimap.create();
- predupedContents.put(3, "foo");
- predupedContents.put(4, "foos");
- predupedContents.put(6, "barbaz");
- predupedContents.put(6, "bazbar");
- PCollection<String> dupes =
- p.apply(Create.of("foo", "foos", "barbaz", "barbaz", "bazbar", "foo"));
- PCollection<String> deduped =
- dupes.apply(RemoveDuplicates.withRepresentativeValueFn((String s) -> s.length())
- .withRepresentativeType(TypeDescriptor.of(Integer.class)));
-
- DataflowAssert.that(deduped).satisfies((Iterable<String> strs) -> {
- Set<Integer> seenLengths = new HashSet<>();
- for (String s : strs) {
- assertThat(predupedContents.values(), hasItem(s));
- assertThat(seenLengths, not(contains(s.length())));
- seenLengths.add(s.length());
- }
- return null;
- });
-
- p.run();
- }
-
- @Test
- public void withLambdaRepresentativeValuesFnNoTypeDescriptorShouldThrow() {
- TestPipeline p = TestPipeline.create();
-
- Multimap<Integer, String> predupedContents = HashMultimap.create();
- predupedContents.put(3, "foo");
- predupedContents.put(4, "foos");
- predupedContents.put(6, "barbaz");
- predupedContents.put(6, "bazbar");
- PCollection<String> dupes =
- p.apply(Create.of("foo", "foos", "barbaz", "barbaz", "bazbar", "foo"));
-
- thrown.expect(IllegalStateException.class);
- thrown.expectMessage("Unable to return a default Coder for RemoveRepresentativeDupes");
- thrown.expectMessage("Cannot provide a coder for type variable K");
- thrown.expectMessage("the actual type is unknown due to erasure.");
-
- // Thrown when applying a transform to the internal WithKeys that withRepresentativeValueFn is
- // implemented with
- dupes.apply("RemoveRepresentativeDupes",
- RemoveDuplicates.withRepresentativeValueFn((String s) -> s.length()));
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/WithKeysJava8Test.java
----------------------------------------------------------------------
diff --git a/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/WithKeysJava8Test.java b/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/WithKeysJava8Test.java
deleted file mode 100644
index 3771f78..0000000
--- a/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/WithKeysJava8Test.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.Pipeline.PipelineExecutionException;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.experimental.categories.Category;
-import org.junit.rules.ExpectedException;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-/**
- * Java 8 Tests for {@link WithKeys}.
- */
-@RunWith(JUnit4.class)
-public class WithKeysJava8Test {
-
- @Rule
- public ExpectedException thrown = ExpectedException.none();
-
- @Test
- @Category(RunnableOnService.class)
- public void withLambdaAndTypeDescriptorShouldSucceed() {
- TestPipeline p = TestPipeline.create();
-
- PCollection<String> values = p.apply(Create.of("1234", "3210", "0", "-12"));
- PCollection<KV<Integer, String>> kvs = values.apply(
- WithKeys.of((String s) -> Integer.valueOf(s))
- .withKeyType(TypeDescriptor.of(Integer.class)));
-
- DataflowAssert.that(kvs).containsInAnyOrder(
- KV.of(1234, "1234"), KV.of(0, "0"), KV.of(-12, "-12"), KV.of(3210, "3210"));
-
- p.run();
- }
-
- @Test
- public void withLambdaAndNoTypeDescriptorShouldThrow() {
- TestPipeline p = TestPipeline.create();
-
- PCollection<String> values = p.apply(Create.of("1234", "3210", "0", "-12"));
-
- values.apply("ApplyKeysWithWithKeys", WithKeys.of((String s) -> Integer.valueOf(s)));
-
- thrown.expect(PipelineExecutionException.class);
- thrown.expectMessage("Unable to return a default Coder for ApplyKeysWithWithKeys");
- thrown.expectMessage("Cannot provide a coder for type variable K");
- thrown.expectMessage("the actual type is unknown due to erasure.");
-
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/WithTimestampsJava8Test.java
----------------------------------------------------------------------
diff --git a/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/WithTimestampsJava8Test.java b/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/WithTimestampsJava8Test.java
deleted file mode 100644
index b2b6dbc..0000000
--- a/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/WithTimestampsJava8Test.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.joda.time.Instant;
-import org.junit.Test;
-import org.junit.experimental.categories.Category;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.io.Serializable;
-
-/**
- * Java 8 tests for {@link WithTimestamps}.
- */
-@RunWith(JUnit4.class)
-public class WithTimestampsJava8Test implements Serializable {
- @Test
- @Category(RunnableOnService.class)
- public void withTimestampsLambdaShouldApplyTimestamps() {
- TestPipeline p = TestPipeline.create();
-
- String yearTwoThousand = "946684800000";
- PCollection<String> timestamped =
- p.apply(Create.of("1234", "0", Integer.toString(Integer.MAX_VALUE), yearTwoThousand))
- .apply(WithTimestamps.of((String input) -> new Instant(Long.valueOf(yearTwoThousand))));
-
- PCollection<KV<String, Instant>> timestampedVals =
- timestamped.apply(ParDo.of(new DoFn<String, KV<String, Instant>>() {
- @Override
- public void processElement(DoFn<String, KV<String, Instant>>.ProcessContext c)
- throws Exception {
- c.output(KV.of(c.element(), c.timestamp()));
- }
- }));
-
- DataflowAssert.that(timestamped)
- .containsInAnyOrder(yearTwoThousand, "0", "1234", Integer.toString(Integer.MAX_VALUE));
- DataflowAssert.that(timestampedVals)
- .containsInAnyOrder(
- KV.of("0", new Instant(0)),
- KV.of("1234", new Instant("1234")),
- KV.of(Integer.toString(Integer.MAX_VALUE), new Instant(Integer.MAX_VALUE)),
- KV.of(yearTwoThousand, new Instant(Long.valueOf(yearTwoThousand))));
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 3803007..6b2fd93 100644
--- a/pom.xml
+++ b/pom.xml
@@ -138,7 +138,7 @@
<jdk>[1.8,)</jdk>
</activation>
<modules>
- <module>java8tests</module>
+ <module>sdks/java/java8tests</module>
</modules>
</profile>
<profile>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/sdks/java/java8tests/pom.xml
----------------------------------------------------------------------
diff --git a/sdks/java/java8tests/pom.xml b/sdks/java/java8tests/pom.xml
new file mode 100644
index 0000000..bb8d629
--- /dev/null
+++ b/sdks/java/java8tests/pom.xml
@@ -0,0 +1,184 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.beam</groupId>
+ <artifactId>parent</artifactId>
+ <version>0.1.0-incubating-SNAPSHOT</version>
+ <relativePath>../../../pom.xml</relativePath>
+ </parent>
+
+ <artifactId>java8tests-all</artifactId>
+ <name>Apache Beam :: Tests :: Java 8 All</name>
+ <description>Apache Beam Java SDK provides a simple, Java-based
+ interface for processing virtually any size data.
+ This artifact includes tests of the SDK from a Java 8
+ user.</description>
+
+ <packaging>jar</packaging>
+
+ <profiles>
+ <profile>
+ <id>DataflowPipelineTests</id>
+ <properties>
+ <runIntegrationTestOnService>true</runIntegrationTestOnService>
+ <testGroups>com.google.cloud.dataflow.sdk.testing.RunnableOnService</testGroups>
+ <testParallelValue>both</testParallelValue>
+ </properties>
+ </profile>
+ </profiles>
+
+ <build>
+ <plugins>
+ <plugin>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <configuration>
+ <testSource>1.8</testSource>
+ <testTarget>1.8</testTarget>
+ </configuration>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ <executions>
+ <execution>
+ <goals><goal>analyze-only</goal></goals>
+ <configuration>
+ <failOnWarning>true</failOnWarning>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-checkstyle-plugin</artifactId>
+ <version>2.12</version>
+ <dependencies>
+ <dependency>
+ <groupId>com.puppycrawl.tools</groupId>
+ <artifactId>checkstyle</artifactId>
+ <version>6.6</version>
+ </dependency>
+ </dependencies>
+ <configuration>
+ <configLocation>../../../checkstyle.xml</configLocation>
+ <consoleOutput>true</consoleOutput>
+ <failOnViolation>true</failOnViolation>
+ <includeTestSourceDirectory>true</includeTestSourceDirectory>
+ <includeResources>false</includeResources>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>check</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+
+ <!-- Source plugin for generating source and test-source JARs. -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-source-plugin</artifactId>
+ <version>2.4</version>
+ <executions>
+ <execution>
+ <id>attach-sources</id>
+ <phase>compile</phase>
+ <goals>
+ <goal>jar</goal>
+ </goals>
+ </execution>
+ <execution>
+ <id>attach-test-sources</id>
+ <phase>test-compile</phase>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>default-jar</id>
+ <goals>
+ <goal>jar</goal>
+ </goals>
+ </execution>
+ <execution>
+ <id>default-test-jar</id>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+
+ <!-- Coverage analysis for unit tests. -->
+ <plugin>
+ <groupId>org.jacoco</groupId>
+ <artifactId>jacoco-maven-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.beam</groupId>
+ <artifactId>java-sdk-all</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ <version>${guava.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>joda-time</groupId>
+ <artifactId>joda-time</artifactId>
+ <version>${joda.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.hamcrest</groupId>
+ <artifactId>hamcrest-all</artifactId>
+ <version>${hamcrest.version}</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>${junit.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/CombineJava8Test.java
----------------------------------------------------------------------
diff --git a/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/CombineJava8Test.java b/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/CombineJava8Test.java
new file mode 100644
index 0000000..b569e49
--- /dev/null
+++ b/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/CombineJava8Test.java
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.sdk.transforms;
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.io.Serializable;
+
+/**
+ * Java 8 Tests for {@link Combine}.
+ */
+@RunWith(JUnit4.class)
+@SuppressWarnings("serial")
+public class CombineJava8Test implements Serializable {
+
+ @Rule
+ public transient ExpectedException thrown = ExpectedException.none();
+
+ /**
+ * Class for use in testing use of Java 8 method references.
+ */
+ private static class Summer implements Serializable {
+ public int sum(Iterable<Integer> integers) {
+ int sum = 0;
+ for (int i : integers) {
+ sum += i;
+ }
+ return sum;
+ }
+ }
+
+ /**
+ * Tests creation of a global {@link Combine} via Java 8 lambda.
+ */
+ @Test
+ public void testCombineGloballyLambda() {
+ Pipeline pipeline = TestPipeline.create();
+
+ PCollection<Integer> output = pipeline
+ .apply(Create.of(1, 2, 3, 4))
+ .apply(Combine.globally(integers -> {
+ int sum = 0;
+ for (int i : integers) {
+ sum += i;
+ }
+ return sum;
+ }));
+
+ DataflowAssert.that(output).containsInAnyOrder(10);
+ pipeline.run();
+ }
+
+ /**
+ * Tests creation of a global {@link Combine} via a Java 8 method reference.
+ */
+ @Test
+ public void testCombineGloballyInstanceMethodReference() {
+ Pipeline pipeline = TestPipeline.create();
+
+ PCollection<Integer> output = pipeline
+ .apply(Create.of(1, 2, 3, 4))
+ .apply(Combine.globally(new Summer()::sum));
+
+ DataflowAssert.that(output).containsInAnyOrder(10);
+ pipeline.run();
+ }
+
+ /**
+ * Tests creation of a per-key {@link Combine} via a Java 8 lambda.
+ */
+ @Test
+ public void testCombinePerKeyLambda() {
+ Pipeline pipeline = TestPipeline.create();
+
+ PCollection<KV<String, Integer>> output = pipeline
+ .apply(Create.of(KV.of("a", 1), KV.of("b", 2), KV.of("a", 3), KV.of("c", 4)))
+ .apply(Combine.perKey(integers -> {
+ int sum = 0;
+ for (int i : integers) {
+ sum += i;
+ }
+ return sum;
+ }));
+
+ DataflowAssert.that(output).containsInAnyOrder(
+ KV.of("a", 4),
+ KV.of("b", 2),
+ KV.of("c", 4));
+ pipeline.run();
+ }
+
+ /**
+ * Tests creation of a per-key {@link Combine} via a Java 8 method reference.
+ */
+ @Test
+ public void testCombinePerKeyInstanceMethodReference() {
+ Pipeline pipeline = TestPipeline.create();
+
+ PCollection<KV<String, Integer>> output = pipeline
+ .apply(Create.of(KV.of("a", 1), KV.of("b", 2), KV.of("a", 3), KV.of("c", 4)))
+ .apply(Combine.perKey(new Summer()::sum));
+
+ DataflowAssert.that(output).containsInAnyOrder(
+ KV.of("a", 4),
+ KV.of("b", 2),
+ KV.of("c", 4));
+ pipeline.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/FilterJava8Test.java
----------------------------------------------------------------------
diff --git a/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/FilterJava8Test.java b/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/FilterJava8Test.java
new file mode 100644
index 0000000..db65932
--- /dev/null
+++ b/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/FilterJava8Test.java
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.sdk.transforms;
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.rules.ExpectedException;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.io.Serializable;
+
+/**
+ * Java 8 Tests for {@link Filter}.
+ */
+@RunWith(JUnit4.class)
+@SuppressWarnings("serial")
+public class FilterJava8Test implements Serializable {
+
+ @Rule
+ public transient ExpectedException thrown = ExpectedException.none();
+
+ @Test
+ @Category(RunnableOnService.class)
+ public void testIdentityFilterByPredicate() {
+ Pipeline pipeline = TestPipeline.create();
+
+ PCollection<Integer> output = pipeline
+ .apply(Create.of(591, 11789, 1257, 24578, 24799, 307))
+ .apply(Filter.byPredicate(i -> true));
+
+ DataflowAssert.that(output).containsInAnyOrder(591, 11789, 1257, 24578, 24799, 307);
+ pipeline.run();
+ }
+
+ @Test
+ public void testNoFilterByPredicate() {
+ Pipeline pipeline = TestPipeline.create();
+
+ PCollection<Integer> output = pipeline
+ .apply(Create.of(1, 2, 4, 5))
+ .apply(Filter.byPredicate(i -> false));
+
+ DataflowAssert.that(output).empty();
+ pipeline.run();
+ }
+
+ @Test
+ @Category(RunnableOnService.class)
+ public void testFilterByPredicate() {
+ Pipeline pipeline = TestPipeline.create();
+
+ PCollection<Integer> output = pipeline
+ .apply(Create.of(1, 2, 3, 4, 5, 6, 7))
+ .apply(Filter.byPredicate(i -> i % 2 == 0));
+
+ DataflowAssert.that(output).containsInAnyOrder(2, 4, 6);
+ pipeline.run();
+ }
+
+ /**
+ * Confirms that in Java 8 style, where a lambda results in a rawtype, the output type token is
+ * not useful. If this test ever fails there may be simplifications available to us.
+ */
+ @Test
+ public void testFilterParDoOutputTypeDescriptorRaw() throws Exception {
+ Pipeline pipeline = TestPipeline.create();
+
+ @SuppressWarnings({"unchecked", "rawtypes"})
+ PCollection<String> output = pipeline
+ .apply(Create.of("hello"))
+ .apply(Filter.by(s -> true));
+
+ thrown.expect(CannotProvideCoderException.class);
+ pipeline.getCoderRegistry().getDefaultCoder(output.getTypeDescriptor());
+ }
+
+ @Test
+ @Category(RunnableOnService.class)
+ public void testFilterByMethodReference() {
+ Pipeline pipeline = TestPipeline.create();
+
+ PCollection<Integer> output = pipeline
+ .apply(Create.of(1, 2, 3, 4, 5, 6, 7))
+ .apply(Filter.byPredicate(new EvenFilter()::isEven));
+
+ DataflowAssert.that(output).containsInAnyOrder(2, 4, 6);
+ pipeline.run();
+ }
+
+ private static class EvenFilter implements Serializable {
+ public boolean isEven(int i) {
+ return i % 2 == 0;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/FlatMapElementsJava8Test.java
----------------------------------------------------------------------
diff --git a/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/FlatMapElementsJava8Test.java b/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/FlatMapElementsJava8Test.java
new file mode 100644
index 0000000..e0b946b
--- /dev/null
+++ b/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/FlatMapElementsJava8Test.java
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.sdk.transforms;
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
+import com.google.common.collect.ImmutableList;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.io.Serializable;
+import java.util.List;
+
+/**
+ * Java 8 Tests for {@link FlatMapElements}.
+ */
+@RunWith(JUnit4.class)
+public class FlatMapElementsJava8Test implements Serializable {
+
+ @Rule
+ public transient ExpectedException thrown = ExpectedException.none();
+
+ /**
+ * Basic test of {@link FlatMapElements} with a lambda (which is instantiated as a
+ * {@link SerializableFunction}).
+ */
+ @Test
+ public void testFlatMapBasic() throws Exception {
+ Pipeline pipeline = TestPipeline.create();
+ PCollection<Integer> output = pipeline
+ .apply(Create.of(1, 2, 3))
+ .apply(FlatMapElements
+ // Note that the input type annotation is required.
+ .via((Integer i) -> ImmutableList.of(i, -i))
+ .withOutputType(new TypeDescriptor<Integer>() {}));
+
+ DataflowAssert.that(output).containsInAnyOrder(1, 3, -1, -3, 2, -2);
+ pipeline.run();
+ }
+
+ /**
+ * Basic test of {@link FlatMapElements} with a method reference.
+ */
+ @Test
+ public void testFlatMapMethodReference() throws Exception {
+ Pipeline pipeline = TestPipeline.create();
+ PCollection<Integer> output = pipeline
+ .apply(Create.of(1, 2, 3))
+ .apply(FlatMapElements
+ // The method reference supplies the input type; no annotation is needed here.
+ .via(new Negater()::numAndNegation)
+ .withOutputType(new TypeDescriptor<Integer>() {}));
+
+ DataflowAssert.that(output).containsInAnyOrder(1, 3, -1, -3, 2, -2);
+ pipeline.run();
+ }
+
+ private static class Negater implements Serializable {
+ public List<Integer> numAndNegation(int input) {
+ return ImmutableList.of(input, -input);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/MapElementsJava8Test.java
----------------------------------------------------------------------
diff --git a/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/MapElementsJava8Test.java b/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/MapElementsJava8Test.java
new file mode 100644
index 0000000..123e680
--- /dev/null
+++ b/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/MapElementsJava8Test.java
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.sdk.transforms;
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.io.Serializable;
+
+/**
+ * Java 8 tests for {@link MapElements}.
+ */
+@RunWith(JUnit4.class)
+public class MapElementsJava8Test implements Serializable {
+
+ /**
+ * Basic test of {@link MapElements} with a lambda (which is instantiated as a
+ * {@link SerializableFunction}).
+ */
+ @Test
+ public void testMapBasic() throws Exception {
+ Pipeline pipeline = TestPipeline.create();
+ PCollection<Integer> output = pipeline
+ .apply(Create.of(1, 2, 3))
+ .apply(MapElements
+ // Note that the type annotation is required (for Java, not for Dataflow)
+ .via((Integer i) -> i * 2)
+ .withOutputType(new TypeDescriptor<Integer>() {}));
+
+ DataflowAssert.that(output).containsInAnyOrder(6, 2, 4);
+ pipeline.run();
+ }
+
+ /**
+ * Basic test of {@link MapElements} with a method reference.
+ */
+ @Test
+ public void testMapMethodReference() throws Exception {
+ Pipeline pipeline = TestPipeline.create();
+ PCollection<Integer> output = pipeline
+ .apply(Create.of(1, 2, 3))
+ .apply(MapElements
+ // The method reference supplies the input type; no annotation is needed here.
+ .via(new Doubler()::doubleIt)
+ .withOutputType(new TypeDescriptor<Integer>() {}));
+
+ DataflowAssert.that(output).containsInAnyOrder(6, 2, 4);
+ pipeline.run();
+ }
+
+ private static class Doubler implements Serializable {
+ public int doubleIt(int val) {
+ return val * 2;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/PartitionJava8Test.java
----------------------------------------------------------------------
diff --git a/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/PartitionJava8Test.java b/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/PartitionJava8Test.java
new file mode 100644
index 0000000..c459ada
--- /dev/null
+++ b/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/PartitionJava8Test.java
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.sdk.transforms;
+
+import static org.junit.Assert.assertEquals;
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.values.PCollectionList;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.io.Serializable;
+
+/**
+ * Java 8 Tests for {@link Partition}.
+ */
+@RunWith(JUnit4.class)
+@SuppressWarnings("serial")
+public class PartitionJava8Test implements Serializable {
+
+ @Rule
+ public transient ExpectedException thrown = ExpectedException.none();
+
+ @Test
+ public void testModPartition() {
+ Pipeline pipeline = TestPipeline.create();
+
+ PCollectionList<Integer> outputs = pipeline
+ .apply(Create.of(1, 2, 4, 5))
+ .apply(Partition.of(3, (element, numPartitions) -> element % numPartitions));
+ assertEquals(3, outputs.size());
+ DataflowAssert.that(outputs.get(0)).empty();
+ DataflowAssert.that(outputs.get(1)).containsInAnyOrder(1, 4);
+ DataflowAssert.that(outputs.get(2)).containsInAnyOrder(2, 5);
+ pipeline.run();
+ }
+
+ /**
+ * Confirms that in Java 8 style, where a lambda results in a rawtype, the output type token is
+ * not useful. If this test ever fails there may be simplifications available to us.
+ */
+ @Test
+ public void testPartitionFnOutputTypeDescriptorRaw() throws Exception {
+ Pipeline pipeline = TestPipeline.create();
+
+ PCollectionList<String> output = pipeline
+ .apply(Create.of("hello"))
+ .apply(Partition.of(1, (element, numPartitions) -> 0));
+
+ thrown.expect(CannotProvideCoderException.class);
+ pipeline.getCoderRegistry().getDefaultCoder(output.get(0).getTypeDescriptor());
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/RemoveDuplicatesJava8Test.java
----------------------------------------------------------------------
diff --git a/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/RemoveDuplicatesJava8Test.java b/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/RemoveDuplicatesJava8Test.java
new file mode 100644
index 0000000..dfa1ca6
--- /dev/null
+++ b/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/RemoveDuplicatesJava8Test.java
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.cloud.dataflow.sdk.transforms;
+
+import static org.hamcrest.Matchers.contains;
+import static org.hamcrest.Matchers.hasItem;
+import static org.hamcrest.Matchers.not;
+import static org.junit.Assert.assertThat;
+
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Multimap;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Java 8 tests for {@link RemoveDuplicates}.
+ */
+@RunWith(JUnit4.class)
+public class RemoveDuplicatesJava8Test {
+
+ @Rule
+ public ExpectedException thrown = ExpectedException.none();
+
+ @Test
+ public void withLambdaRepresentativeValuesFnAndTypeDescriptorShouldApplyFn() {
+ TestPipeline p = TestPipeline.create();
+
+ Multimap<Integer, String> predupedContents = HashMultimap.create();
+ predupedContents.put(3, "foo");
+ predupedContents.put(4, "foos");
+ predupedContents.put(6, "barbaz");
+ predupedContents.put(6, "bazbar");
+ PCollection<String> dupes =
+ p.apply(Create.of("foo", "foos", "barbaz", "barbaz", "bazbar", "foo"));
+ PCollection<String> deduped =
+ dupes.apply(RemoveDuplicates.withRepresentativeValueFn((String s) -> s.length())
+ .withRepresentativeType(TypeDescriptor.of(Integer.class)));
+
+ DataflowAssert.that(deduped).satisfies((Iterable<String> strs) -> {
+ Set<Integer> seenLengths = new HashSet<>();
+ for (String s : strs) {
+ assertThat(predupedContents.values(), hasItem(s));
+ assertThat(seenLengths, not(contains(s.length())));
+ seenLengths.add(s.length());
+ }
+ return null;
+ });
+
+ p.run();
+ }
+
+ @Test
+ public void withLambdaRepresentativeValuesFnNoTypeDescriptorShouldThrow() {
+ TestPipeline p = TestPipeline.create();
+
+ Multimap<Integer, String> predupedContents = HashMultimap.create();
+ predupedContents.put(3, "foo");
+ predupedContents.put(4, "foos");
+ predupedContents.put(6, "barbaz");
+ predupedContents.put(6, "bazbar");
+ PCollection<String> dupes =
+ p.apply(Create.of("foo", "foos", "barbaz", "barbaz", "bazbar", "foo"));
+
+ thrown.expect(IllegalStateException.class);
+ thrown.expectMessage("Unable to return a default Coder for RemoveRepresentativeDupes");
+ thrown.expectMessage("Cannot provide a coder for type variable K");
+ thrown.expectMessage("the actual type is unknown due to erasure.");
+
+ // Thrown when applying a transform to the internal WithKeys that withRepresentativeValueFn is
+ // implemented with
+ dupes.apply("RemoveRepresentativeDupes",
+ RemoveDuplicates.withRepresentativeValueFn((String s) -> s.length()));
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/WithKeysJava8Test.java
----------------------------------------------------------------------
diff --git a/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/WithKeysJava8Test.java b/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/WithKeysJava8Test.java
new file mode 100644
index 0000000..3771f78
--- /dev/null
+++ b/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/WithKeysJava8Test.java
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.cloud.dataflow.sdk.transforms;
+
+import com.google.cloud.dataflow.sdk.Pipeline.PipelineExecutionException;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.rules.ExpectedException;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/**
+ * Java 8 tests for {@link WithKeys} in which the key function is written as a lambda.
+ */
+@RunWith(JUnit4.class)
+public class WithKeysJava8Test {
+
+  @Rule
+  public ExpectedException thrown = ExpectedException.none();
+
+  /** A lambda key function with an explicit key {@link TypeDescriptor} produces keyed output. */
+  @Test
+  @Category(RunnableOnService.class)
+  public void withLambdaAndTypeDescriptorShouldSucceed() {
+    TestPipeline pipeline = TestPipeline.create();
+
+    PCollection<String> numerals = pipeline.apply(Create.of("1234", "3210", "0", "-12"));
+    PCollection<KV<Integer, String>> keyedByValue = numerals.apply(
+        WithKeys.of((String numeral) -> Integer.valueOf(numeral))
+            .withKeyType(TypeDescriptor.of(Integer.class)));
+
+    DataflowAssert.that(keyedByValue).containsInAnyOrder(
+        KV.of(1234, "1234"), KV.of(3210, "3210"), KV.of(0, "0"), KV.of(-12, "-12"));
+
+    pipeline.run();
+  }
+
+  /**
+   * The same lambda without a key type descriptor is expected to fail with a coder-inference
+   * error once the pipeline is run.
+   */
+  @Test
+  public void withLambdaAndNoTypeDescriptorShouldThrow() {
+    TestPipeline pipeline = TestPipeline.create();
+
+    PCollection<String> numerals = pipeline.apply(Create.of("1234", "3210", "0", "-12"));
+
+    numerals.apply(
+        "ApplyKeysWithWithKeys", WithKeys.of((String numeral) -> Integer.valueOf(numeral)));
+
+    // Registered after apply() and before run(): the failure is expected from run().
+    thrown.expect(PipelineExecutionException.class);
+    thrown.expectMessage("Unable to return a default Coder for ApplyKeysWithWithKeys");
+    thrown.expectMessage("Cannot provide a coder for type variable K");
+    thrown.expectMessage("the actual type is unknown due to erasure.");
+
+    pipeline.run();
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/d4233aa0/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/WithTimestampsJava8Test.java
----------------------------------------------------------------------
diff --git a/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/WithTimestampsJava8Test.java b/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/WithTimestampsJava8Test.java
new file mode 100644
index 0000000..b2b6dbc
--- /dev/null
+++ b/sdks/java/java8tests/src/test/java/com/google/cloud/dataflow/sdk/transforms/WithTimestampsJava8Test.java
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.cloud.dataflow.sdk.transforms;
+
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import org.joda.time.Instant;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.io.Serializable;
+
+/**
+ * Java 8 tests for {@link WithTimestamps}.
+ */
+@RunWith(JUnit4.class)
+public class WithTimestampsJava8Test implements Serializable {
+  /**
+   * Stamps every element with the instant whose epoch-millisecond value is the element's own
+   * numeric text, then checks both the elements and the timestamps seen downstream.
+   */
+  @Test
+  @Category(RunnableOnService.class)
+  public void withTimestampsLambdaShouldApplyTimestamps() {
+    TestPipeline p = TestPipeline.create();
+
+    String yearTwoThousand = "946684800000";
+    PCollection<String> timestamped =
+        p.apply(Create.of("1234", "0", Integer.toString(Integer.MAX_VALUE), yearTwoThousand))
+         // The timestamp must come from the element itself; a constant would stamp every
+         // element identically and contradict the per-element expectations below.
+         .apply(WithTimestamps.of((String input) -> new Instant(Long.parseLong(input))));
+
+    PCollection<KV<String, Instant>> timestampedVals =
+        timestamped.apply(ParDo.of(new DoFn<String, KV<String, Instant>>() {
+          @Override
+          public void processElement(DoFn<String, KV<String, Instant>>.ProcessContext c)
+              throws Exception {
+            c.output(KV.of(c.element(), c.timestamp()));
+          }
+        }));
+
+    DataflowAssert.that(timestamped)
+        .containsInAnyOrder(yearTwoThousand, "0", "1234", Integer.toString(Integer.MAX_VALUE));
+    DataflowAssert.that(timestampedVals)
+        .containsInAnyOrder(
+            KV.of("0", new Instant(0L)),
+            // Epoch millis, not new Instant("1234"): a String argument is parsed as a date.
+            KV.of("1234", new Instant(1234L)),
+            KV.of(Integer.toString(Integer.MAX_VALUE), new Instant((long) Integer.MAX_VALUE)),
+            KV.of(yearTwoThousand, new Instant(Long.parseLong(yearTwoThousand))));
+
+    // The assertions above are only checked when the pipeline is actually executed.
+    p.run();
+  }
+}
[08/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/StringUtils.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/StringUtils.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/StringUtils.java
deleted file mode 100644
index 3a18336..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/StringUtils.java
+++ /dev/null
@@ -1,242 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.common.base.Joiner;
-import com.google.common.base.Preconditions;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * Utilities for working with JSON and other human-readable string formats.
- */
-public class StringUtils {
- /**
-  * Encodes {@code bytes} as a string that is legal inside a JSON document.
-  *
-  * <p>Printable ASCII bytes (32 through 126) are emitted as the matching character, except
-  * for '%', '\' and '"'. Those three, and every other byte, are emitted as three chars in
-  * '%xx' format, where 'xx' is the two-digit lowercase hexadecimal value of the byte.
-  */
- public static String byteArrayToJsonString(byte[] bytes) {
-   StringBuilder encoded = new StringBuilder(bytes.length * 2);
-   for (byte value : bytes) {
-     boolean printableAscii = value >= 32 && value < 127;
-     boolean reserved = value == '%' || value == '\\' || value == '\"';
-     if (printableAscii && !reserved) {
-       // Safe to pass through as a single character.
-       encoded.append((char) value);
-     } else {
-       // Escape prefix, special character, or non-printable byte: use '%xx'.
-       encoded.append(String.format("%%%02x", value));
-     }
-   }
-   return encoded.toString();
- }
-
- /**
-  * Converts the given string, encoded using {@link #byteArrayToJsonString},
-  * into a byte array.
-  *
-  * <p>Each '%xx' escape becomes the byte with hexadecimal value 'xx'; every other char is
-  * narrowed to a byte unchanged.
-  *
-  * @param string the encoded text
-  * @return the decoded bytes, one per unescaped char or '%xx' escape
-  * @throws IllegalArgumentException if a '%' is not followed by exactly two hexadecimal
-  *     digits
-  */
- public static byte[] jsonStringToByteArray(String string) {
-   // Every char or escape yields exactly one byte, so the input length bounds the output.
-   byte[] buffer = new byte[string.length()];
-   int count = 0;
-   for (int i = 0; i < string.length(); ) {
-     char c = string.charAt(i);
-     byte b;
-     if (c == '%') {
-       // Escaped. Expect '%xx' format.
-       try {
-         String hex = string.substring(i + 1, i + 3);
-         // Integer.parseInt tolerates a leading sign, which would let "%+1" or "%-1"
-         // through; insist that the first char is itself a hex digit.
-         if (Character.digit(hex.charAt(0), 16) < 0) {
-           throw new NumberFormatException("For input string: \"" + hex + "\"");
-         }
-         b = (byte) Integer.parseInt(hex, 16);
-       } catch (IndexOutOfBoundsException | NumberFormatException exn) {
-         throw new IllegalArgumentException(
-             "not in legal encoded format; " +
-             "substring [" + i + ".." + (i + 2) + "] not in format \"%xx\"",
-             exn);
-       }
-       i += 3;
-     } else {
-       // Send through unchanged.
-       b = (byte) c;
-       i++;
-     }
-     buffer[count++] = b;
-   }
-   byte[] byteArray = new byte[count];
-   System.arraycopy(buffer, 0, byteArray, 0, count);
-   return byteArray;
- }
-
- // Suffixes that simplifyNameComponent strips from a class-name component. Only the first
- // matching suffix is removed, so "DoFn" has to be listed before "Fn".
- private static final String[] STANDARD_NAME_SUFFIXES =
- new String[]{"DoFn", "Fn"};
-
- /**
- * Pattern to match a non-anonymous inner class.
- * Eg, matches "Foo$Bar", or even "Foo$1$Bar", but not "Foo$1" or "Foo$1$2".
- */
- private static final Pattern NAMED_INNER_CLASS =
- Pattern.compile(".+\\$(?<INNER>[^0-9].*)");
-
- // Matches a numeric segment enclosed by '$' on both sides, e.g. the "$1$" in "Foo$1$Bar".
- private static final String ANONYMOUS_CLASS_REGEX = "\\$[0-9]+\\$";
-
- /**
-  * Produces a short, human-readable name for {@code clazz}.
-  *
-  * <p>The package and any outer class names are dropped and common suffixes are removed.
-  * The mapping is lossy: the result cannot in general be mapped back to the class.
-  *
-  * <p>Examples:
-  * <ul>
-  *   <li>{@code some.package.Word.SummaryDoFn} -> "Summary"
-  *   <li>{@code another.package.PairingFn} -> "Pairing"
-  * </ul>
-  *
-  * @throws IllegalArgumentException if the class is anonymous
-  */
- public static String approximateSimpleName(Class<?> clazz) {
-   final boolean dropOuterClassNames = true;
-   return approximateSimpleName(clazz, dropOuterClassNames);
- }
-
- /**
-  * Produces a human-readable name for a {@code PTransform} class.
-  *
-  * <p>Unlike {@link #approximateSimpleName(Class)}, outer class names are kept (joined with
-  * '.') and a trailing {@code .Bound} inner-class component is removed. The package and
-  * common suffixes are still dropped.
-  *
-  * <p>Examples:
-  * <ul>
-  *   <li>{@code some.package.Word.Summary} -> "Word.Summary"
-  *   <li>{@code another.package.Pairing.Bound} -> "Pairing"
-  * </ul>
-  *
-  * @throws IllegalArgumentException if {@code clazz} is not a {@code PTransform} type
-  */
- public static String approximatePTransformName(Class<?> clazz) {
-   Preconditions.checkArgument(PTransform.class.isAssignableFrom(clazz));
-   String withOuterClasses = approximateSimpleName(clazz, /* dropOuterClassNames */ false);
-   return withOuterClasses.replaceFirst("\\.Bound$", "");
- }
-
- /**
-  * Computes the Levenshtein distance between {@code s} and {@code t}.
-  *
-  * <p>The distance is the minimum number of single-character insertions, deletions or
-  * substitutions needed to turn one string into the other.
-  */
- public static int getLevenshteinDistance(final String s, final String t) {
-   Preconditions.checkNotNull(s);
-   Preconditions.checkNotNull(t);
-
-   // Trivial cases: identical strings, or one side empty.
-   if (s.equals(t)) {
-     return 0;
-   }
-   if (s.length() == 0) {
-     return t.length();
-   }
-   if (t.length() == 0) {
-     return s.length();
-   }
-
-   // Two rolling rows of the distance table; previous[j] is the distance between the
-   // prefix of s handled so far and the first j chars of t.
-   int[] previous = new int[t.length() + 1];
-   int[] current = new int[t.length() + 1];
-
-   // Row for the empty prefix of s: j edits are needed to reach the first j chars of t.
-   for (int col = 0; col < previous.length; col++) {
-     previous[col] = col;
-   }
-
-   for (int row = 1; row <= s.length(); row++) {
-     // Matching the first `row` chars of s against an empty t costs `row` deletions.
-     current[0] = row;
-     char fromS = s.charAt(row - 1);
-
-     for (int col = 1; col <= t.length(); col++) {
-       int substitution = previous[col - 1] + (fromS == t.charAt(col - 1) ? 0 : 1);
-       int viaLeft = current[col - 1] + 1;
-       int viaAbove = previous[col] + 1;
-       current[col] = Math.min(Math.min(viaLeft, viaAbove), substitution);
-     }
-
-     // The row just filled becomes the previous row of the next iteration.
-     int[] recycled = previous;
-     previous = current;
-     current = recycled;
-   }
-
-   return previous[t.length()];
- }
-
- /**
-  * Shared implementation of the name-approximation methods.
-  *
-  * <p>Strips the package, simplifies every '$'-separated component, and then either keeps
-  * only the innermost named class ({@code dropOuterClassNames}) or joins all components
-  * with '.'.
-  *
-  * @throws IllegalArgumentException if the class is anonymous
-  */
- private static String approximateSimpleName(Class<?> clazz, boolean dropOuterClassNames) {
-   Preconditions.checkArgument(!clazz.isAnonymousClass(),
-       "Attempted to get simple name of anonymous class");
-
-   String binaryName = clazz.getName();
-   String unqualified = binaryName.substring(binaryName.lastIndexOf('.') + 1);
-
-   // Drop common suffixes for each named component.
-   String[] components = unqualified.split("\\$");
-   for (int index = 0; index < components.length; index++) {
-     components[index] = simplifyNameComponent(components[index]);
-   }
-   String simplified = Joiner.on('$').join(components);
-
-   if (!dropOuterClassNames) {
-     // Collapse anonymous outer classes first, then turn the remaining separators into dots.
-     return simplified.replaceAll(ANONYMOUS_CLASS_REGEX, ".").replaceAll("\\$", ".");
-   }
-
-   // Keep only the innermost named class when there is one.
-   Matcher namedInner = NAMED_INNER_CLASS.matcher(simplified);
-   return namedInner.matches() ? namedInner.group("INNER") : simplified;
- }
-
- /**
-  * Removes the first standard suffix that {@code name} ends with, unless that suffix is the
-  * entire name; returns {@code name} unchanged when nothing applies.
-  */
- private static String simplifyNameComponent(String name) {
-   for (String suffix : STANDARD_NAME_SUFFIXES) {
-     int stemLength = name.length() - suffix.length();
-     if (stemLength > 0 && name.endsWith(suffix)) {
-       return name.substring(0, stemLength);
-     }
-   }
-   return name;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Structs.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Structs.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Structs.java
deleted file mode 100644
index c621c55..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Structs.java
+++ /dev/null
@@ -1,384 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.client.util.Data;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-import javax.annotation.Nullable;
-
-/**
- * A collection of static methods for manipulating datastructure representations
- * transferred via the Dataflow API.
- */
-public final class Structs {
- private Structs() {} // Non-instantiable
-
- /**
-  * Returns the string stored in {@code map} under {@code name}.
-  *
-  * @throws Exception if there is no entry, the entry is null, or it is not a string
-  */
- public static String getString(Map<String, Object> map, String name) throws Exception {
-   String found = getValue(map, name, String.class, "a string");
-   return found;
- }
-
- /**
-  * Returns the string stored in {@code map} under {@code name}, or {@code defaultValue} when
-  * the map has no entry for {@code name}.
-  *
-  * @throws Exception if an entry exists but is null or is not a string
-  */
- public static String getString(
-     Map<String, Object> map, String name, @Nullable String defaultValue) throws Exception {
-   String found = getValue(map, name, String.class, "a string", defaultValue);
-   return found;
- }
-
- /**
-  * Returns the bytes stored in {@code map} under {@code name}.
-  *
-  * @throws Exception if no value is found or the stored string cannot be decoded
-  */
- public static byte[] getBytes(Map<String, Object> map, String name) throws Exception {
-   @Nullable byte[] decoded = getBytes(map, name, null);
-   if (decoded != null) {
-     return decoded;
-   }
-   throw new ParameterNotFoundException(name, map);
- }
-
- /**
-  * Returns the bytes stored in {@code map} under {@code name}, or {@code defaultValue} when
-  * the map has no entry for {@code name}. The entry is read as a string and decoded with
-  * {@link StringUtils#jsonStringToByteArray}.
-  */
- @Nullable
- public static byte[] getBytes(Map<String, Object> map, String name, @Nullable byte[] defaultValue)
-     throws Exception {
-   @Nullable String encoded = getString(map, name, null);
-   // TODO: Need to agree on a format for encoding bytes in
-   // a string that can be sent to the backend, over the cloud
-   // map task work API. base64 encoding seems pretty common. Switch to it?
-   return encoded == null ? defaultValue : StringUtils.jsonStringToByteArray(encoded);
- }
-
- /**
-  * Returns the boolean stored in {@code map} under {@code name}.
-  *
-  * @throws Exception if there is no entry, the entry is null, or it is not a boolean
-  */
- public static Boolean getBoolean(Map<String, Object> map, String name) throws Exception {
-   Boolean found = getValue(map, name, Boolean.class, "a boolean");
-   return found;
- }
-
- /**
-  * Returns the boolean stored in {@code map} under {@code name}, or {@code defaultValue}
-  * when the map has no entry for {@code name}.
-  *
-  * @throws Exception if an entry exists but is null or is not a boolean
-  */
- @Nullable
- public static Boolean getBoolean(
-     Map<String, Object> map, String name, @Nullable Boolean defaultValue) throws Exception {
-   Boolean found = getValue(map, name, Boolean.class, "a boolean", defaultValue);
-   return found;
- }
-
- /**
-  * Returns the long stored in {@code map} under {@code name}.
-  *
-  * @throws Exception if there is no entry, the entry is null, or it is not a long
-  */
- public static Long getLong(Map<String, Object> map, String name) throws Exception {
-   Long found = getValue(map, name, Long.class, "a long");
-   return found;
- }
-
- /**
-  * Returns the long stored in {@code map} under {@code name}, or {@code defaultValue} when
-  * the map has no entry for {@code name}.
-  *
-  * @throws Exception if an entry exists but is null or is not a long
-  */
- @Nullable
- public static Long getLong(Map<String, Object> map, String name, @Nullable Long defaultValue)
-     throws Exception {
-   Long found = getValue(map, name, Long.class, "a long", defaultValue);
-   return found;
- }
-
- /**
-  * Returns the int stored in {@code map} under {@code name}.
-  *
-  * @throws Exception if there is no entry, the entry is null, or it is not an int
-  */
- public static Integer getInt(Map<String, Object> map, String name) throws Exception {
-   Integer found = getValue(map, name, Integer.class, "an int");
-   return found;
- }
-
- /**
-  * Returns the int stored in {@code map} under {@code name}, or {@code defaultValue} when
-  * the map has no entry for {@code name}.
-  *
-  * @throws Exception if an entry exists but is null or is not an int
-  */
- @Nullable
- public static Integer getInt(Map<String, Object> map, String name, @Nullable Integer defaultValue)
-     throws Exception {
-   Integer found = getValue(map, name, Integer.class, "an int", defaultValue);
-   return found;
- }
-
- /**
-  * Returns the list of strings stored in {@code map} under {@code name}.
-  *
-  * <p>A missing entry yields {@code defaultValue}, a JSON null yields an empty list, and a
-  * single string yields a one-element list. Otherwise the entry must be a list whose
-  * elements all decode as strings.
-  *
-  * @throws Exception if the entry is a Java null or has any other shape
-  */
- @Nullable
- public static List<String> getStrings(
-     Map<String, Object> map, String name, @Nullable List<String> defaultValue)
-     throws Exception {
-   @Nullable Object raw = map.get(name);
-   if (raw == null) {
-     if (!map.containsKey(name)) {
-       return defaultValue;
-     }
-     throw new IncorrectTypeException(name, map, "a string or a list");
-   }
-   if (Data.isNull(raw)) {
-     // A JSON literal null stands for the empty list.
-     return Collections.<String>emptyList();
-   }
-   @Nullable String single = decodeValue(raw, String.class);
-   if (single != null) {
-     return Collections.singletonList(single);
-   }
-   if (!(raw instanceof List)) {
-     throw new IncorrectTypeException(name, map, "a string or a list");
-   }
-   List<?> items = (List<?>) raw;
-   List<String> strings = new ArrayList<>(items.size());
-   for (Object item : items) {
-     @Nullable String decoded = decodeValue(item, String.class);
-     if (decoded == null) {
-       throw new IncorrectTypeException(name, map, "a list of strings");
-     }
-     strings.add(decoded);
-   }
-   return strings;
- }
-
- /**
-  * Returns the object stored in {@code map} under {@code name}.
-  *
-  * @throws Exception if no value is found or the entry is not an object
-  */
- public static Map<String, Object> getObject(Map<String, Object> map, String name)
-     throws Exception {
-   @Nullable Map<String, Object> found = getObject(map, name, null);
-   if (found != null) {
-     return found;
-   }
-   throw new ParameterNotFoundException(name, map);
- }
-
- /**
-  * Returns the object stored in {@code map} under {@code name}, or {@code defaultValue}
-  * when the map has no entry for {@code name}.
-  *
-  * @throws Exception if an entry exists but is a Java null or is not an object
-  */
- @Nullable
- public static Map<String, Object> getObject(
-     Map<String, Object> map, String name, @Nullable Map<String, Object> defaultValue)
-     throws Exception {
-   @Nullable Object raw = map.get(name);
-   if (raw != null) {
-     return checkObject(raw, map, name);
-   }
-   if (map.containsKey(name)) {
-     throw new IncorrectTypeException(name, map, "an object");
-   }
-   return defaultValue;
- }
-
- /**
-  * Validates that {@code value} is an object: either a JSON null, which is returned as an
-  * empty map, or a map that carries the object-type-name field.
-  *
-  * @throws Exception if {@code value} is not a map or lacks the type-name field
-  */
- private static Map<String, Object> checkObject(
-     Object value, Map<String, Object> map, String name) throws Exception {
-   if (Data.isNull(value)) {
-     // A JSON literal null stands for the empty object.
-     return Collections.<String, Object>emptyMap();
-   }
-   if (!(value instanceof Map)) {
-     throw new IncorrectTypeException(name, map, "an object (not a map)");
-   }
-   @SuppressWarnings("unchecked")
-   Map<String, Object> candidate = (Map<String, Object>) value;
-   if (candidate.containsKey(PropertyNames.OBJECT_TYPE_NAME)) {
-     return candidate;
-   }
-   throw new IncorrectTypeException(name, map,
-       "an object (no \"" + PropertyNames.OBJECT_TYPE_NAME + "\" field)");
- }
-
- /**
-  * Returns the list of maps stored in {@code map} under {@code name}.
-  *
-  * <p>A missing entry yields {@code defaultValue} and a JSON null yields an empty list.
-  *
-  * @throws Exception if the entry is a Java null, is not a list, or holds a non-map element
-  */
- @Nullable
- public static List<Map<String, Object>> getListOfMaps(Map<String, Object> map, String name,
-     @Nullable List<Map<String, Object>> defaultValue) throws Exception {
-   @Nullable Object raw = map.get(name);
-   if (raw == null) {
-     if (!map.containsKey(name)) {
-       return defaultValue;
-     }
-     throw new IncorrectTypeException(name, map, "a list");
-   }
-   if (Data.isNull(raw)) {
-     // A JSON literal null stands for the empty list.
-     return Collections.<Map<String, Object>>emptyList();
-   }
-   if (!(raw instanceof List)) {
-     throw new IncorrectTypeException(name, map, "a list");
-   }
-
-   List<?> candidates = (List<?>) raw;
-   for (Object candidate : candidates) {
-     if (!(candidate instanceof Map)) {
-       throw new IncorrectTypeException(name, map, "a list of Map objects");
-     }
-   }
-
-   // Every element was checked to be a Map in the loop above.
-   @SuppressWarnings("unchecked")
-   List<Map<String, Object>> maps = (List<Map<String, Object>>) candidates;
-   return maps;
- }
-
- /**
-  * Returns the dictionary stored in {@code map} under {@code name}; a JSON null is returned
-  * as an empty map.
-  *
-  * @throws Exception if no value is found or the entry is not a map
-  */
- public static Map<String, Object> getDictionary(
-     Map<String, Object> map, String name) throws Exception {
-   @Nullable Object raw = map.get(name);
-   if (raw == null) {
-     throw new ParameterNotFoundException(name, map);
-   }
-   if (Data.isNull(raw)) {
-     // A JSON literal null stands for the empty dictionary.
-     return Collections.<String, Object>emptyMap();
-   }
-   if (raw instanceof Map) {
-     @SuppressWarnings("unchecked")
-     Map<String, Object> dictionary = (Map<String, Object>) raw;
-     return dictionary;
-   }
-   throw new IncorrectTypeException(name, map, "a dictionary");
- }
-
- /**
-  * Returns the dictionary stored in {@code map} under {@code name}, or {@code defaultValue}
-  * when the map has no entry for {@code name}; a JSON null is returned as an empty map.
-  *
-  * @throws Exception if an entry exists but is a Java null or is not a map
-  */
- @Nullable
- public static Map<String, Object> getDictionary(
-     Map<String, Object> map, String name, @Nullable Map<String, Object> defaultValue)
-     throws Exception {
-   @Nullable Object raw = map.get(name);
-   if (raw == null) {
-     if (!map.containsKey(name)) {
-       return defaultValue;
-     }
-     throw new IncorrectTypeException(name, map, "a dictionary");
-   }
-   if (Data.isNull(raw)) {
-     // A JSON literal null stands for the empty dictionary.
-     return Collections.<String, Object>emptyMap();
-   }
-   if (raw instanceof Map) {
-     @SuppressWarnings("unchecked")
-     Map<String, Object> dictionary = (Map<String, Object>) raw;
-     return dictionary;
-   }
-   throw new IncorrectTypeException(name, map, "a dictionary");
- }
-
- // Builder operations.
-
- /** Stores {@code value} under {@code name}, wrapped by {@code CloudObject.forString}. */
- public static void addString(Map<String, Object> map, String name, String value) {
-   CloudObject wrapped = CloudObject.forString(value);
-   addObject(map, name, wrapped);
- }
-
- /** Stores {@code value} under {@code name}, wrapped by {@code CloudObject.forBoolean}. */
- public static void addBoolean(Map<String, Object> map, String name, boolean value) {
-   CloudObject wrapped = CloudObject.forBoolean(value);
-   addObject(map, name, wrapped);
- }
-
- /** Stores {@code value} under {@code name}, wrapped by {@code CloudObject.forInteger}. */
- public static void addLong(Map<String, Object> map, String name, long value) {
-   CloudObject wrapped = CloudObject.forInteger(value);
-   addObject(map, name, wrapped);
- }
-
- /**
- * Stores {@code value} in {@code map} under {@code name}.
- */
- public static void addObject(
- Map<String, Object> map, String name, Map<String, Object> value) {
- map.put(name, value);
- }
-
- /**
-  * Stores the {@code Data.nullOf(Object.class)} null marker under {@code name}, so the key
-  * is present without mapping to a Java {@code null}.
-  */
- public static void addNull(Map<String, Object> map, String name) {
-   Object nullMarker = Data.nullOf(Object.class);
-   map.put(name, nullMarker);
- }
-
- /**
-  * Stores {@code longs} under {@code name} as a list with one
-  * {@code CloudObject.forInteger} element per value, in argument order.
-  */
- public static void addLongs(Map<String, Object> map, String name, long... longs) {
-   List<Map<String, Object>> wrapped = new ArrayList<>(longs.length);
-   for (int index = 0; index < longs.length; index++) {
-     wrapped.add(CloudObject.forInteger(longs[index]));
-   }
-   map.put(name, wrapped);
- }
-
- /**
- * Stores {@code elements} in {@code map} under {@code name}. The list is stored as given,
- * not copied.
- */
- public static void addList(
- Map<String, Object> map, String name, List<? extends Map<String, Object>> elements) {
- map.put(name, elements);
- }
-
- /**
-  * Stores {@code elements} under {@code name} as a list with one
-  * {@code CloudObject.forString} element per string, in iteration order.
-  */
- public static void addStringList(Map<String, Object> map, String name, List<String> elements) {
-   ArrayList<CloudObject> wrapped = new ArrayList<>(elements.size());
-   for (String text : elements) {
-     CloudObject asObject = CloudObject.forString(text);
-     wrapped.add(asObject);
-   }
-   addList(map, name, wrapped);
- }
-
- /**
-  * Stores {@code elements} under {@code name} as a fixed-size list view of the array; the
-  * array is not copied.
-  */
- public static <T extends Map<String, Object>> void addList(
-     Map<String, Object> map, String name, T[] elements) {
-   List<T> arrayView = Arrays.asList(elements);
-   map.put(name, arrayView);
- }
-
- /**
- * Stores {@code value} in {@code map} under {@code name}. The dictionary is stored as
- * given, not copied.
- */
- public static void addDictionary(
- Map<String, Object> map, String name, Map<String, Object> value) {
- map.put(name, value);
- }
-
- /** Stores {@code value} under {@code name}, wrapped by {@code CloudObject.forFloat}. */
- public static void addDouble(Map<String, Object> map, String name, Double value) {
-   CloudObject wrapped = CloudObject.forFloat(value);
-   addObject(map, name, wrapped);
- }
-
- // Helper methods for a few of the accessor methods.
-
- /**
-  * Returns the required value of type {@code clazz} stored under {@code name}.
-  *
-  * @param type phrase describing the expected type, used in error messages
-  * @throws Exception if there is no entry, the entry is null, or it cannot be decoded
-  */
- private static <T> T getValue(Map<String, Object> map, String name, Class<T> clazz, String type)
-     throws Exception {
-   @Nullable T decoded = getValue(map, name, clazz, type, null);
-   if (decoded != null) {
-     return decoded;
-   }
-   throw new ParameterNotFoundException(name, map);
- }
-
- /**
-  * Returns the value of type {@code clazz} stored under {@code name}, or
-  * {@code defaultValue} when the map has no entry for {@code name}.
-  *
-  * @param type phrase describing the expected type, used in error messages
-  * @throws Exception if an entry exists but is null or cannot be decoded
-  */
- @Nullable
- private static <T> T getValue(
-     Map<String, Object> map, String name, Class<T> clazz, String type, @Nullable T defaultValue)
-     throws Exception {
-   @Nullable Object raw = map.get(name);
-   if (raw == null && !map.containsKey(name)) {
-     return defaultValue;
-   }
-   // An explicit null entry and an undecodable entry are both reported as a type error.
-   @Nullable T decoded = (raw == null) ? null : decodeValue(raw, clazz);
-   if (decoded == null) {
-     throw new IncorrectTypeException(name, map, type);
-   }
-   return decoded;
- }
-
- /**
-  * Decodes {@code value} as an instance of {@code clazz}, or returns {@code null} when it
-  * cannot be decoded.
-  *
-  * <p>A value whose class is exactly {@code clazz} is returned as is. Otherwise the value
-  * must be a map naming a known cloud type and carrying a scalar field, which that type
-  * then parses.
-  */
- @Nullable
- private static <T> T decodeValue(Object value, Class<T> clazz) {
-   try {
-     if (clazz == value.getClass()) {
-       // decodeValue() is only called for final classes; if the class matches,
-       // it's safe to just return the value, and if it doesn't match, decoding
-       // is needed.
-       return clazz.cast(value);
-     }
-     if (value instanceof Map) {
-       @SuppressWarnings("unchecked")
-       Map<String, Object> encoded = (Map<String, Object>) value;
-       @Nullable String typeUri = (String) encoded.get(PropertyNames.OBJECT_TYPE_NAME);
-       @Nullable CloudKnownType knownType =
-           (typeUri == null) ? null : CloudKnownType.forUri(typeUri);
-       @Nullable Object scalar =
-           (knownType == null) ? null : encoded.get(PropertyNames.SCALAR_FIELD_NAME);
-       if (scalar != null) {
-         return knownType.parse(scalar, clazz);
-       }
-     }
-     return null;
-   } catch (ClassCastException notDecodable) {
-     // If any class cast fails during decoding, the value's not decodable.
-     return null;
-   }
- }
-
- /**
- * Thrown when a required parameter yields no value. The message includes the parameter
- * name and the map that was searched.
- */
- private static final class ParameterNotFoundException extends Exception {
- public ParameterNotFoundException(String name, Map<String, Object> map) {
- super("didn't find required parameter " + name + " in " + map);
- }
- }
-
- /**
- * Thrown when a parameter is present but does not have the expected type. The message
- * includes the parameter name, the map, and the {@code type} phrase describing what was
- * expected.
- */
- private static final class IncorrectTypeException extends Exception {
- public IncorrectTypeException(String name, Map<String, Object> map, String type) {
- super("required parameter " + name + " in " + map + " not " + type);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SystemDoFnInternal.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SystemDoFnInternal.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SystemDoFnInternal.java
deleted file mode 100644
index 3255ede..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SystemDoFnInternal.java
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-
-import java.lang.annotation.Documented;
-import java.lang.annotation.ElementType;
-import java.lang.annotation.Retention;
-import java.lang.annotation.RetentionPolicy;
-import java.lang.annotation.Target;
-
-/**
- * Annotation to mark {@link DoFn DoFns} as an internal component of the Dataflow SDK.
- *
- * <p>Currently, the only effect of this is to mark any aggregators reported by an annotated
- * {@code DoFn} as a system counter (as opposed to a user counter).
- *
- * <p>The annotation can only be placed on type declarations, is retained at runtime, and is
- * included in the generated documentation of annotated types.
- *
- * <p>This is internal to the Dataflow SDK.
- */
-@Documented
-@Retention(RetentionPolicy.RUNTIME)
-@Target(ElementType.TYPE)
-public @interface SystemDoFnInternal {}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SystemReduceFn.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SystemReduceFn.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SystemReduceFn.java
deleted file mode 100644
index 1665792..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SystemReduceFn.java
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.transforms.Combine.KeyedCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.KeyedCombineFnWithContext;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.state.AccumulatorCombiningState;
-import com.google.cloud.dataflow.sdk.util.state.BagState;
-import com.google.cloud.dataflow.sdk.util.state.CombiningState;
-import com.google.cloud.dataflow.sdk.util.state.MergingStateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.ReadableState;
-import com.google.cloud.dataflow.sdk.util.state.StateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.StateMerging;
-import com.google.cloud.dataflow.sdk.util.state.StateTag;
-import com.google.cloud.dataflow.sdk.util.state.StateTags;
-
-/**
- * {@link ReduceFn} implementing the default reduction behaviors of {@link GroupByKey}.
- * @param <K> The type of key being processed.
- * @param <InputT> The type of values associated with the key.
- * @param <AccumT> The accumulator type of the underlying state, as chosen by the factory method.
- * @param <OutputT> The output type that will be produced for each key.
- * @param <W> The type of windows this operates on.
- */
-public abstract class SystemReduceFn<K, InputT, AccumT, OutputT, W extends BoundedWindow>
- extends ReduceFn<K, InputT, OutputT, W> {
- private static final String BUFFER_NAME = "buf";
-
- /**
- * Create a {@link SystemReduceFn} that buffers all of the input values in persistent state
- * and produces an {@code Iterable<T>}.
- */
- public static <K, T, W extends BoundedWindow> SystemReduceFn<K, T, Iterable<T>, Iterable<T>, W>
- buffering(final Coder<T> inputCoder) {
- final StateTag<Object, BagState<T>> bufferTag =
- StateTags.makeSystemTagInternal(StateTags.bag(BUFFER_NAME, inputCoder));
- return new SystemReduceFn<K, T, Iterable<T>, Iterable<T>, W>(bufferTag) {
- @Override
- public void prefetchOnMerge(MergingStateAccessor<K, W> state) throws Exception {
- StateMerging.prefetchBags(state, bufferTag);
- }
-
- @Override
- public void onMerge(OnMergeContext c) throws Exception {
- StateMerging.mergeBags(c.state(), bufferTag);
- }
- };
- }
-
- /**
- * Create a {@link SystemReduceFn} that combines all of the input values using the keyed
- * {@link CombineFn} (with or without context) wrapped by the given {@code combineFn}.
- */
- public static <K, InputT, AccumT, OutputT, W extends BoundedWindow> SystemReduceFn<K, InputT,
- AccumT, OutputT, W>
- combining(
- final Coder<K> keyCoder, final AppliedCombineFn<K, InputT, AccumT, OutputT> combineFn) {
- final StateTag<K, AccumulatorCombiningState<InputT, AccumT, OutputT>> bufferTag;
- if (combineFn.getFn() instanceof KeyedCombineFnWithContext) {
- bufferTag = StateTags.makeSystemTagInternal(
- StateTags.<K, InputT, AccumT, OutputT>keyedCombiningValueWithContext(
- BUFFER_NAME, combineFn.getAccumulatorCoder(),
- (KeyedCombineFnWithContext<K, InputT, AccumT, OutputT>) combineFn.getFn()));
-
- } else {
- bufferTag = StateTags.makeSystemTagInternal(
- StateTags.<K, InputT, AccumT, OutputT>keyedCombiningValue(
- BUFFER_NAME, combineFn.getAccumulatorCoder(),
- (KeyedCombineFn<K, InputT, AccumT, OutputT>) combineFn.getFn()));
- }
- return new SystemReduceFn<K, InputT, AccumT, OutputT, W>(bufferTag) {
- @Override
- public void prefetchOnMerge(MergingStateAccessor<K, W> state) throws Exception {
- StateMerging.prefetchCombiningValues(state, bufferTag);
- }
-
- @Override
- public void onMerge(OnMergeContext c) throws Exception {
- StateMerging.mergeCombiningValues(c.state(), bufferTag);
- }
- };
- }
-
- private final StateTag<? super K, ? extends CombiningState<InputT, OutputT>> bufferTag;
-
- public SystemReduceFn(
- StateTag<? super K, ? extends CombiningState<InputT, OutputT>> bufferTag) {
- this.bufferTag = bufferTag; // Assigned once here; every instance method reads this tag.
- }
-
- @Override
- public void processValue(ProcessValueContext c) throws Exception {
- c.state().access(bufferTag).add(c.value()); // Adds the incoming value to the tagged state.
- }
-
- @Override
- public void prefetchOnTrigger(StateAccessor<K> state) {
- state.access(bufferTag).readLater(); // Requests the state ahead of the read in onTrigger.
- }
-
- @Override
- public void onTrigger(OnTriggerContext c) throws Exception {
- c.output(c.state().access(bufferTag).read()); // Outputs whatever the tagged state reads as.
- }
-
- @Override
- public void clearState(Context c) throws Exception {
- c.state().access(bufferTag).clear();
- }
-
- @Override
- public ReadableState<Boolean> isEmpty(StateAccessor<K> state) {
- return state.access(bufferTag).isEmpty();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TestCredential.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TestCredential.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TestCredential.java
deleted file mode 100644
index 359e157..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TestCredential.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.client.auth.oauth2.BearerToken;
-import com.google.api.client.auth.oauth2.Credential;
-import com.google.api.client.auth.oauth2.TokenResponse;
-import com.google.api.client.testing.http.MockHttpTransport;
-
-import java.io.IOException;
-
-/**
- * Fake {@link Credential} for tests: a refresh returns the configured token, expiring in 300s.
- */
-public class TestCredential extends Credential {
-
- private final String token;
-
- public TestCredential() {
- this("NULL"); // Placeholder token text; this is a literal string, not a null reference.
- }
-
- public TestCredential(String token) {
- super(new Builder(
- BearerToken.authorizationHeaderAccessMethod())
- .setTransport(new MockHttpTransport())); // Mock transport from the client testing package.
- this.token = token;
- }
-
- @Override
- protected TokenResponse executeRefreshToken() throws IOException {
- TokenResponse response = new TokenResponse();
- response.setExpiresInSeconds(5L * 60); // Five minutes.
- response.setAccessToken(token);
- return response;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TimeDomain.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TimeDomain.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TimeDomain.java
deleted file mode 100644
index 4ff36f7..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TimeDomain.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-/**
- * {@code TimeDomain} specifies whether an operation is based on
- * timestamps of elements or current "real-world" time as reported while processing.
- */
-public enum TimeDomain {
- /**
- * The {@code EVENT_TIME} domain corresponds to the timestamps on the elements. Time advances
- * as the system watermark advances.
- */
- EVENT_TIME,
-
- /**
- * The {@code PROCESSING_TIME} domain corresponds to the current (system) time.
- * This is advanced during execution of the Dataflow pipeline.
- */
- PROCESSING_TIME,
-
- /**
- * Same as the {@code PROCESSING_TIME} domain, except it won't fire a timer set for time
- * {@code T} until all timers from earlier stages set for a time earlier than {@code T} have
- * fired.
- */
- SYNCHRONIZED_PROCESSING_TIME;
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TimeUtil.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TimeUtil.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TimeUtil.java
deleted file mode 100644
index 93195a7..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TimeUtil.java
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import org.joda.time.DateTime;
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-import org.joda.time.ReadableDuration;
-import org.joda.time.ReadableInstant;
-import org.joda.time.chrono.ISOChronology;
-
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import javax.annotation.Nullable;
-
-/**
- * A helper class for converting between Dataflow API and SDK time
- * representations.
- *
- * <p>Dataflow API times are strings of the form
- * {@code YYYY-MM-dd'T'HH:mm:ss[.nnnn]'Z'}: that is, RFC 3339
- * strings with optional fractional seconds and a 'Z' offset.
- *
- * <p>Dataflow API durations are strings of the form {@code ['-']sssss[.nnnn]'s'}:
- * that is, optionally negated seconds with optional fractional seconds and a literal 's'.
- *
- * <p>In both formats, fractional seconds are either three digits (millisecond
- * resolution), six digits (microsecond resolution), or nine digits (nanosecond
- * resolution).
- */
-public final class TimeUtil {
- private TimeUtil() {} // Non-instantiable.
-
- private static final Pattern DURATION_PATTERN = Pattern.compile("(-)?(\\d+)(?:\\.(\\d+))?s");
- private static final Pattern TIME_PATTERN =
- Pattern.compile("(\\d{4})-(\\d{2})-(\\d{2})T(\\d{2}):(\\d{2}):(\\d{2})(?:\\.(\\d+))?Z");
-
- /**
- * Converts a {@link ReadableInstant} into a Dataflow API time value, always expressed in UTC.
- */
- public static String toCloudTime(ReadableInstant instant) {
- // Note that since Joda objects use millisecond resolution, we always
- // produce either no fractional seconds or fractional seconds with
- // millisecond resolution.
-
- // Re-express the instant in the UTC ISO chronology so the fields agree with the 'Z' suffix.
- DateTime time = new DateTime(instant).toDateTime(ISOChronology.getInstanceUTC());
-
- int millis = time.getMillisOfSecond();
- if (millis == 0) {
- return String.format("%04d-%02d-%02dT%02d:%02d:%02dZ",
- time.getYear(),
- time.getMonthOfYear(),
- time.getDayOfMonth(),
- time.getHourOfDay(),
- time.getMinuteOfHour(),
- time.getSecondOfMinute());
- } else {
- return String.format("%04d-%02d-%02dT%02d:%02d:%02d.%03dZ",
- time.getYear(),
- time.getMonthOfYear(),
- time.getDayOfMonth(),
- time.getHourOfDay(),
- time.getMinuteOfHour(),
- time.getSecondOfMinute(),
- millis);
- }
- }
-
- /**
- * Converts a time value received via the Dataflow API into the corresponding
- * {@link Instant}. Sub-millisecond digits of the fraction are truncated.
- * @return the parsed time, or null if a parse error occurs
- */
- @Nullable
- public static Instant fromCloudTime(String time) {
- Matcher matcher = TIME_PATTERN.matcher(time);
- if (!matcher.matches()) {
- return null;
- }
- int year = Integer.parseInt(matcher.group(1));
- int month = Integer.parseInt(matcher.group(2));
- int day = Integer.parseInt(matcher.group(3));
- int hour = Integer.parseInt(matcher.group(4));
- int minute = Integer.parseInt(matcher.group(5));
- int second = Integer.parseInt(matcher.group(6));
- int millis = 0;
-
- String frac = matcher.group(7);
- if (frac != null) {
- // Check the digit count before parsing so an over-long fraction yields null, not an exception.
- if (frac.length() == 3) { // millisecond resolution
- millis = Integer.parseInt(frac);
- } else if (frac.length() == 6) { // microsecond resolution
- millis = Integer.parseInt(frac) / 1000;
- } else if (frac.length() == 9) { // nanosecond resolution
- millis = Integer.parseInt(frac) / 1000000;
- } else {
- return null;
- }
- }
-
- return new DateTime(year, month, day, hour, minute, second, millis,
- ISOChronology.getInstanceUTC()).toInstant();
- }
-
- /**
- * Converts a {@link ReadableDuration} into a Dataflow API duration string.
- */
- public static String toCloudDuration(ReadableDuration duration) {
- // Joda durations have millisecond resolution, so the output has either no fraction or
- // exactly three fractional digits. The sign is emitted once, ahead of the magnitude.
- long millis = duration.getMillis();
- String sign = millis < 0 ? "-" : "";
- long seconds = Math.abs(millis / 1000);
- millis = Math.abs(millis % 1000);
- if (millis == 0) {
- return String.format("%s%ds", sign, seconds);
- } else {
- return String.format("%s%d.%03ds", sign, seconds, millis);
- }
- }
-
- /**
- * Converts a Dataflow API duration string, optionally negative, into a {@link Duration}.
- * @return the parsed duration, or null if a parse error occurs
- */
- @Nullable
- public static Duration fromCloudDuration(String duration) {
- Matcher matcher = DURATION_PATTERN.matcher(duration);
- if (!matcher.matches()) {
- return null;
- }
- long millis = Long.parseLong(matcher.group(2)) * 1000; // Still throws if seconds overflow long.
- String frac = matcher.group(3);
- if (frac != null) {
- // Check the digit count before parsing so an over-long fraction yields null, not an exception.
- if (frac.length() == 3) { // millisecond resolution
- millis += Long.parseLong(frac);
- } else if (frac.length() == 6) { // microsecond resolution
- millis += Long.parseLong(frac) / 1000;
- } else if (frac.length() == 9) { // nanosecond resolution
- millis += Long.parseLong(frac) / 1000000;
- } else {
- return null;
- }
- }
- return Duration.millis(matcher.group(1) == null ? millis : -millis);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TimerInternals.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TimerInternals.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TimerInternals.java
deleted file mode 100644
index c823ed3..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TimerInternals.java
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.InstantCoder;
-import com.google.cloud.dataflow.sdk.coders.StandardCoder;
-import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.state.StateNamespace;
-import com.google.cloud.dataflow.sdk.util.state.StateNamespaces;
-import com.google.common.base.MoreObjects;
-import com.google.common.base.Preconditions;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import org.joda.time.Instant;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Objects;
-
-import javax.annotation.Nullable;
-
-/**
- * Encapsulate interaction with time within the execution environment.
- *
- * <p>This class allows setting and deleting timers, and also retrieving an
- * estimate of the current time.
- */
-public interface TimerInternals {
-
- /**
- * Writes out a timer to be fired when the watermark reaches the given
- * timestamp.
- *
- * <p>The combination of {@code namespace}, {@code timestamp} and {@code domain} uniquely
- * identify a timer. Multiple timers set for the same parameters can be safely deduplicated.
- */
- void setTimer(TimerData timerKey);
-
- /**
- * Deletes the given timer.
- */
- void deleteTimer(TimerData timerKey);
-
- /**
- * Returns the current timestamp in the {@link TimeDomain#PROCESSING_TIME} time domain.
- */
- Instant currentProcessingTime();
-
- /**
- * Returns the current timestamp in the {@link TimeDomain#SYNCHRONIZED_PROCESSING_TIME} time
- * domain or {@code null} if unknown.
- */
- @Nullable
- Instant currentSynchronizedProcessingTime();
-
- /**
- * Return the current, local input watermark timestamp for this computation
- * in the {@link TimeDomain#EVENT_TIME} time domain. Return {@code null} if unknown.
- *
- * <p>This value:
- * <ol>
- * <li>Is monotonically increasing.
- * <li>May differ between workers due to network and other delays.
- * <li>Will never be ahead of the global input watermark for this computation. But it
- * may be arbitrarily behind the global input watermark.
- * <li>Any element with a timestamp before the local input watermark can be considered
- * 'locally late' and be subject to special processing or be dropped entirely.
- * </ol>
- *
- * <p>Note that because the local input watermark can be behind the global input watermark,
- * it is possible for an element to be considered locally on-time even though it is
- * globally late.
- */
- @Nullable
- Instant currentInputWatermarkTime();
-
- /**
- * Return the current, local output watermark timestamp for this computation
- * in the {@link TimeDomain#EVENT_TIME} time domain. Return {@code null} if unknown.
- *
- * <p>This value:
- * <ol>
- * <li>Is monotonically increasing.
- * <li>Will never be ahead of {@link #currentInputWatermarkTime} as returned above.
- * <li>May differ between workers due to network and other delays.
- * <li>However will never be behind the global input watermark for any following computation.
- * </ol>
- *
- * <p> In pictures:
- * <pre>
- * | | | | |
- * | | D | C | B | A
- * | | | | |
- * GIWM <= GOWM <= LOWM <= LIWM <= GIWM
- * (next stage)
- * -------------------------------------------------> event time
- * </pre>
- * where
- * <ul>
- * <li> LOWM = local output water mark.
- * <li> GOWM = global output water mark.
- * <li> GIWM = global input water mark.
- * <li> LIWM = local input water mark.
- * <li> A = A globally on-time element.
- * <li> B = A globally late, but locally on-time element.
- * <li> C = A locally late element which may still contribute to the timestamp of a pane.
- * <li> D = A locally late element which cannot contribute to the timestamp of a pane.
- * </ul>
- *
- * <p>Note that if a computation emits an element which is not before the current output watermark
- * then that element will always appear locally on-time in all following computations. However,
- * it is possible for an element emitted before the current output watermark to appear locally
- * on-time in a following computation. Thus we must be careful to never assume locally late data
- * viewed on the output of a computation remains locally late on the input of a following
- * computation.
- */
- @Nullable
- Instant currentOutputWatermarkTime();
-
- /**
- * Data about a timer: its namespace, timestamp and domain, none of which may be null.
- */
- public static class TimerData implements Comparable<TimerData> {
- private final StateNamespace namespace;
- private final Instant timestamp;
- private final TimeDomain domain;
-
- private TimerData(StateNamespace namespace, Instant timestamp, TimeDomain domain) {
- this.namespace = checkNotNull(namespace);
- this.timestamp = checkNotNull(timestamp);
- this.domain = checkNotNull(domain);
- }
-
- public StateNamespace getNamespace() {
- return namespace;
- }
-
- public Instant getTimestamp() {
- return timestamp;
- }
-
- public TimeDomain getDomain() {
- return domain;
- }
-
- /**
- * Construct the {@code TimerData} for the given parameters.
- */
- public static TimerData of(StateNamespace namespace, Instant timestamp, TimeDomain domain) {
- return new TimerData(namespace, timestamp, domain);
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj) {
- return true;
- }
-
- if (!(obj instanceof TimerData)) {
- return false;
- }
-
- TimerData that = (TimerData) obj;
- return Objects.equals(this.domain, that.domain)
- && this.timestamp.isEqual(that.timestamp)
- && Objects.equals(this.namespace, that.namespace);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(domain, timestamp, namespace);
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(getClass())
- .add("namespace", namespace)
- .add("timestamp", timestamp)
- .add("domain", domain)
- .toString();
- }
-
- @Override
- public int compareTo(TimerData o) { // Orders by timestamp only.
- // NOTE(review): returns 0 for timers that equals() treats as different (other domain/namespace).
- return Long.compare(timestamp.getMillis(), o.getTimestamp().getMillis());
- }
- }
-
- /**
- * A {@link Coder} for {@link TimerData}: writes namespace key, timestamp, then domain name.
- */
- public class TimerDataCoder extends StandardCoder<TimerData> {
- private static final StringUtf8Coder STRING_CODER = StringUtf8Coder.of();
- private static final InstantCoder INSTANT_CODER = InstantCoder.of();
- private final Coder<? extends BoundedWindow> windowCoder; // Read by decode, not by encode.
-
- public static TimerDataCoder of(Coder<? extends BoundedWindow> windowCoder) {
- return new TimerDataCoder(windowCoder);
- }
-
- @SuppressWarnings("unchecked")
- @JsonCreator
- public static TimerDataCoder of(
- @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
- List<Coder<?>> components) {
- Preconditions.checkArgument(components.size() == 1,
- "Expecting 1 components, got " + components.size());
- return of((Coder<? extends BoundedWindow>) components.get(0)); // Sole component: window coder.
- }
-
- private TimerDataCoder(Coder<? extends BoundedWindow> windowCoder) {
- this.windowCoder = windowCoder;
- }
-
- @Override
- public void encode(TimerData timer, OutputStream outStream, Context context)
- throws CoderException, IOException {
- Context nestedContext = context.nested(); // All three components use the nested context.
- STRING_CODER.encode(timer.namespace.stringKey(), outStream, nestedContext);
- INSTANT_CODER.encode(timer.timestamp, outStream, nestedContext);
- STRING_CODER.encode(timer.domain.name(), outStream, nestedContext);
- }
-
- @Override
- public TimerData decode(InputStream inStream, Context context)
- throws CoderException, IOException {
- Context nestedContext = context.nested(); // Fields are read back in the order encode wrote them.
- StateNamespace namespace =
- StateNamespaces.fromString(STRING_CODER.decode(inStream, nestedContext), windowCoder);
- Instant timestamp = INSTANT_CODER.decode(inStream, nestedContext);
- TimeDomain domain = TimeDomain.valueOf(STRING_CODER.decode(inStream, nestedContext));
- return TimerData.of(namespace, timestamp, domain);
- }
-
- @Override
- public List<? extends Coder<?>> getCoderArguments() {
- return Arrays.asList(windowCoder);
- }
-
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- verifyDeterministic("window coder must be deterministic", windowCoder);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Timers.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Timers.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Timers.java
deleted file mode 100644
index 7d4b4f2..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Timers.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-
-import org.joda.time.Instant;
-
-import javax.annotation.Nullable;
-
-/**
- * Interface for interacting with time.
- */
-@Experimental(Experimental.Kind.TIMERS)
-public interface Timers {
- /**
- * Sets a timer to fire when the event time watermark, the current processing time, or
- * the synchronized processing time watermark surpasses a given timestamp.
- *
- * <p>See {@link TimeDomain} for details on the time domains available.
- *
- * <p>Timers are not guaranteed to fire immediately, but will be delivered at some time
- * afterwards.
- *
- * <p>An implementation of {@link Timers} implicitly scopes timers that are set - they may
- * be scoped to a key and window, or a key, window, and trigger, etc.
- *
- * @param timestamp the time at which the timer should be delivered
- * @param timeDomain the domain that the {@code timestamp} applies to
- */
- public abstract void setTimer(Instant timestamp, TimeDomain timeDomain);
-
- /** Removes the timer set in this context for the {@code timestamp} and {@code timeDomain}. */
- public abstract void deleteTimer(Instant timestamp, TimeDomain timeDomain);
-
- /** Returns the current processing time. */
- public abstract Instant currentProcessingTime();
-
- /** Returns the current synchronized processing time or {@code null} if unknown. */
- @Nullable
- public abstract Instant currentSynchronizedProcessingTime();
-
- /** Returns the current event time or {@code null} if unknown. */
- @Nullable
- public abstract Instant currentEventTime();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Transport.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Transport.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Transport.java
deleted file mode 100644
index 15fe286..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Transport.java
+++ /dev/null
@@ -1,205 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.client.auth.oauth2.Credential;
-import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport;
-import com.google.api.client.http.HttpRequestInitializer;
-import com.google.api.client.http.HttpTransport;
-import com.google.api.client.json.JsonFactory;
-import com.google.api.client.json.jackson2.JacksonFactory;
-import com.google.api.services.bigquery.Bigquery;
-import com.google.api.services.clouddebugger.v2.Clouddebugger;
-import com.google.api.services.dataflow.Dataflow;
-import com.google.api.services.pubsub.Pubsub;
-import com.google.api.services.storage.Storage;
-import com.google.cloud.dataflow.sdk.options.BigQueryOptions;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineDebugOptions;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.GcsOptions;
-import com.google.cloud.hadoop.util.ChainingHttpRequestInitializer;
-import com.google.common.collect.ImmutableList;
-
-import java.io.IOException;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.security.GeneralSecurityException;
-
-/**
- * Helpers for cloud communication.
- */
-public class Transport {
-
- private static class SingletonHelper {
- /** Global instance of the JSON factory. */
- private static final JsonFactory JSON_FACTORY;
-
- /** Global instance of the HTTP transport. */
- private static final HttpTransport HTTP_TRANSPORT;
-
- static {
- try {
- JSON_FACTORY = JacksonFactory.getDefaultInstance();
- HTTP_TRANSPORT = GoogleNetHttpTransport.newTrustedTransport();
- } catch (GeneralSecurityException | IOException e) {
- throw new RuntimeException(e);
- }
- }
- }
-
- public static HttpTransport getTransport() {
- return SingletonHelper.HTTP_TRANSPORT;
- }
-
- public static JsonFactory getJsonFactory() {
- return SingletonHelper.JSON_FACTORY;
- }
-
- private static class ApiComponents {
- public String rootUrl;
- public String servicePath;
-
- public ApiComponents(String root, String path) {
- this.rootUrl = root;
- this.servicePath = path;
- }
- }
-
- private static ApiComponents apiComponentsFromUrl(String urlString) {
- try { // Splits a full URL into its root (protocol, host, optional port) and its path.
- URL url = new URL(urlString);
- String rootUrl = url.getProtocol() + "://" + url.getHost() +
- (url.getPort() > 0 ? ":" + url.getPort() : ""); // Port is omitted when none is given.
- return new ApiComponents(rootUrl, url.getPath());
- } catch (MalformedURLException e) {
- throw new RuntimeException("Invalid URL: " + urlString, e); // Keep the cause for diagnosis.
- }
- }
-
- /**
- * Returns a BigQuery client builder.
- *
- * <p>Note: this client's endpoint is <b>not</b> modified by the
- * {@link DataflowPipelineDebugOptions#getApiRootUrl()} option.
- */
- public static Bigquery.Builder
- newBigQueryClient(BigQueryOptions options) {
- return new Bigquery.Builder(getTransport(), getJsonFactory(),
- chainHttpRequestInitializer(
- options.getGcpCredential(),
- // Do not log 404. It clutters the output and is possibly even required by the caller.
- new RetryHttpRequestInitializer(ImmutableList.of(404))))
- .setApplicationName(options.getAppName())
- .setGoogleClientRequestInitializer(options.getGoogleApiTrace());
- }
-
- /**
- * Returns a Pubsub client builder.
- *
- * <p>Note: this client's endpoint is <b>not</b> modified by the
- * {@link DataflowPipelineDebugOptions#getApiRootUrl()} option.
- */
- public static Pubsub.Builder
- newPubsubClient(DataflowPipelineOptions options) {
- return new Pubsub.Builder(getTransport(), getJsonFactory(),
- chainHttpRequestInitializer(
- options.getGcpCredential(),
- // Do not log 404. It clutters the output and is possibly even required by the caller.
- new RetryHttpRequestInitializer(ImmutableList.of(404))))
- .setRootUrl(options.getPubsubRootUrl())
- .setApplicationName(options.getAppName())
- .setGoogleClientRequestInitializer(options.getGoogleApiTrace());
- }
-
- /**
-  * Creates a builder for a Google Cloud Dataflow client.
-  *
-  * <p>When the configured Dataflow endpoint contains {@code "://"} it is treated as an
-  * absolute URL and split into root URL and service path; otherwise it is used as the
-  * service path below the API root URL from the options.
-  */
- public static Dataflow.Builder newDataflowClient(DataflowPipelineOptions options) {
-   String endpoint = options.getDataflowEndpoint();
-   ApiComponents components = endpoint.contains("://")
-       ? apiComponentsFromUrl(endpoint)
-       : new ApiComponents(options.getApiRootUrl(), endpoint);
-
-   HttpTransport transport = getTransport();
-   JsonFactory jsonFactory = getJsonFactory();
-   HttpRequestInitializer requestInitializer = chainHttpRequestInitializer(
-       options.getGcpCredential(),
-       // Do not log 404. It clutters the output and is possibly even required by the caller.
-       new RetryHttpRequestInitializer(ImmutableList.of(404)));
-   Dataflow.Builder builder = new Dataflow.Builder(transport, jsonFactory, requestInitializer);
-   return builder
-       .setApplicationName(options.getAppName())
-       .setRootUrl(components.rootUrl)
-       .setServicePath(components.servicePath)
-       .setGoogleClientRequestInitializer(options.getGoogleApiTrace());
- }
-
- public static Clouddebugger.Builder newClouddebuggerClient(DataflowPipelineOptions options) {
- return new Clouddebugger.Builder(getTransport(),
- getJsonFactory(),
- chainHttpRequestInitializer(options.getGcpCredential(), new RetryHttpRequestInitializer()))
- .setApplicationName(options.getAppName())
- .setGoogleClientRequestInitializer(options.getGoogleApiTrace());
- }
-
- /**
- * Returns a Dataflow client that does not automatically retry failed
- * requests.
- */
- public static Dataflow.Builder
- newRawDataflowClient(DataflowPipelineOptions options) {
- return newDataflowClient(options)
- .setHttpRequestInitializer(options.getGcpCredential())
- .setGoogleClientRequestInitializer(options.getGoogleApiTrace());
- }
-
- /**
- * Returns a Cloud Storage client builder.
- *
- * <p>Note: this client's endpoint is <b>not</b> modified by the
- * {@link DataflowPipelineDebugOptions#getApiRootUrl()} option.
- */
- public static Storage.Builder
- newStorageClient(GcsOptions options) {
- String servicePath = options.getGcsEndpoint();
- Storage.Builder storageBuilder = new Storage.Builder(getTransport(), getJsonFactory(),
- chainHttpRequestInitializer(
- options.getGcpCredential(),
- // Do not log the code 404. Code up the stack will deal with 404's if needed, and
- // logging it by default clutters the output during file staging.
- new RetryHttpRequestInitializer(
- ImmutableList.of(404), new UploadIdResponseInterceptor())))
- .setApplicationName(options.getAppName())
- .setGoogleClientRequestInitializer(options.getGoogleApiTrace());
- if (servicePath != null) {
- ApiComponents components = apiComponentsFromUrl(servicePath);
- storageBuilder.setRootUrl(components.rootUrl);
- storageBuilder.setServicePath(components.servicePath);
- }
- return storageBuilder;
- }
-
- /**
-  * Combines the credential with the given initializer, or returns the initializer unchanged
-  * when no credential is available.
-  */
- private static HttpRequestInitializer chainHttpRequestInitializer(
-     Credential credential, HttpRequestInitializer httpRequestInitializer) {
-   if (credential != null) {
-     return new ChainingHttpRequestInitializer(credential, httpRequestInitializer);
-   }
-   return httpRequestInitializer;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TriggerContextFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TriggerContextFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TriggerContextFactory.java
deleted file mode 100644
index 64ff402..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TriggerContextFactory.java
+++ /dev/null
@@ -1,522 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Trigger;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Trigger.MergingTriggerInfo;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Trigger.TriggerInfo;
-import com.google.cloud.dataflow.sdk.util.state.MergingStateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.State;
-import com.google.cloud.dataflow.sdk.util.state.StateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.StateInternals;
-import com.google.cloud.dataflow.sdk.util.state.StateNamespace;
-import com.google.cloud.dataflow.sdk.util.state.StateNamespaces;
-import com.google.cloud.dataflow.sdk.util.state.StateTag;
-import com.google.common.base.Predicate;
-import com.google.common.collect.FluentIterable;
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Maps;
-
-import org.joda.time.Instant;
-
-import java.util.Collection;
-import java.util.Map;
-
-import javax.annotation.Nullable;
-
-/**
- * Factory for creating instances of the various {@link Trigger} contexts.
- *
- * <p>These contexts are highly interdependent and share many fields; it is inadvisable
- * to create them via any means other than this factory class.
- */
-public class TriggerContextFactory<W extends BoundedWindow> {
-
- private final WindowingStrategy<?, W> windowingStrategy;
- private final StateInternals<?> stateInternals;
- // Future triggers may be able to exploit the active window to state address window mapping.
- @SuppressWarnings("unused")
- private final ActiveWindowSet<W> activeWindows;
- private final Coder<W> windowCoder;
-
- /**
-  * Creates a factory whose contexts share the given windowing strategy, state internals and
-  * active window set. The window coder is taken from the strategy's window function.
-  *
-  * @param windowingStrategy strategy supplying the window function (and thus the window coder)
-  * @param stateInternals backing store used by the state accessors this factory creates
-  * @param activeWindows the active window set; currently stored but not read
-  */
- public TriggerContextFactory(WindowingStrategy<?, W> windowingStrategy,
-     StateInternals<?> stateInternals, ActiveWindowSet<W> activeWindows) {
-   this.windowingStrategy = windowingStrategy;
-   this.stateInternals = stateInternals;
-   this.activeWindows = activeWindows;
-   this.windowCoder = windowingStrategy.getWindowFn().windowCoder();
- }
-
- /** Creates the basic trigger context for {@code rootTrigger} in {@code window}. */
- public Trigger<W>.TriggerContext base(W window, Timers timers,
-     ExecutableTrigger<W> rootTrigger, FinishedTriggers finishedSet) {
-   Trigger<W>.TriggerContext baseContext =
-       new TriggerContextImpl(window, timers, rootTrigger, finishedSet);
-   return baseContext;
- }
-
- /**
-  * Creates the context handed to a trigger when an element with timestamp
-  * {@code elementTimestamp} is processed in {@code window}.
-  */
- public Trigger<W>.OnElementContext createOnElementContext(
-     W window, Timers timers, Instant elementTimestamp,
-     ExecutableTrigger<W> rootTrigger, FinishedTriggers finishedSet) {
-   Trigger<W>.OnElementContext elementContext =
-       new OnElementContextImpl(window, timers, rootTrigger, finishedSet, elementTimestamp);
-   return elementContext;
- }
-
- /**
-  * Creates the context handed to a trigger when windows are merged into {@code window}.
-  * {@code finishedSets} maps each merging window to its finished-trigger set.
-  */
- public Trigger<W>.OnMergeContext createOnMergeContext(W window, Timers timers,
-     ExecutableTrigger<W> rootTrigger, FinishedTriggers finishedSet,
-     Map<W, FinishedTriggers> finishedSets) {
-   Trigger<W>.OnMergeContext mergeContext =
-       new OnMergeContextImpl(window, timers, rootTrigger, finishedSet, finishedSets);
-   return mergeContext;
- }
-
- public StateAccessor<?> createStateAccessor(W window, ExecutableTrigger<W> trigger) {
- return new StateAccessorImpl(window, trigger);
- }
-
- /**
-  * Creates a state accessor for a merge: plain access targets {@code mergeResult}, while
-  * per-window access covers each of {@code mergingWindows}.
-  */
- public MergingStateAccessor<?, W> createMergingStateAccessor(
-     W mergeResult, Collection<W> mergingWindows, ExecutableTrigger<W> trigger) {
-   MergingStateAccessor<?, W> accessor =
-       new MergingStateAccessorImpl(trigger, mergingWindows, mergeResult);
-   return accessor;
- }
-
- private class TriggerInfoImpl implements Trigger.TriggerInfo<W> {
-
- protected final ExecutableTrigger<W> trigger;
- protected final FinishedTriggers finishedSet;
- private final Trigger<W>.TriggerContext context;
-
- public TriggerInfoImpl(ExecutableTrigger<W> trigger, FinishedTriggers finishedSet,
- Trigger<W>.TriggerContext context) {
- this.trigger = trigger;
- this.finishedSet = finishedSet;
- this.context = context;
- }
-
- @Override
- public boolean isMerging() {
- return !windowingStrategy.getWindowFn().isNonMerging();
- }
-
- @Override
- public Iterable<ExecutableTrigger<W>> subTriggers() {
- return trigger.subTriggers();
- }
-
- @Override
- public ExecutableTrigger<W> subTrigger(int subtriggerIndex) {
- return trigger.subTriggers().get(subtriggerIndex);
- }
-
- @Override
- public boolean isFinished() {
- return finishedSet.isFinished(trigger);
- }
-
- @Override
- public boolean isFinished(int subtriggerIndex) {
- return finishedSet.isFinished(subTrigger(subtriggerIndex));
- }
-
- @Override
- public boolean areAllSubtriggersFinished() {
- return Iterables.isEmpty(unfinishedSubTriggers());
- }
-
- @Override
- public Iterable<ExecutableTrigger<W>> unfinishedSubTriggers() {
- return FluentIterable
- .from(trigger.subTriggers())
- .filter(new Predicate<ExecutableTrigger<W>>() {
- @Override
- public boolean apply(ExecutableTrigger<W> trigger) {
- return !finishedSet.isFinished(trigger);
- }
- });
- }
-
- @Override
- public ExecutableTrigger<W> firstUnfinishedSubTrigger() {
- for (ExecutableTrigger<W> subTrigger : trigger.subTriggers()) {
- if (!finishedSet.isFinished(subTrigger)) {
- return subTrigger;
- }
- }
- return null;
- }
-
- @Override
- public void resetTree() throws Exception {
- finishedSet.clearRecursively(trigger);
- trigger.invokeClear(context);
- }
-
- @Override
- public void setFinished(boolean finished) {
- finishedSet.setFinished(trigger, finished);
- }
-
- @Override
- public void setFinished(boolean finished, int subTriggerIndex) {
- finishedSet.setFinished(subTrigger(subTriggerIndex), finished);
- }
- }
-
- private class TriggerTimers implements Timers {
-
- private final Timers timers;
- private final W window;
-
- public TriggerTimers(W window, Timers timers) {
- this.timers = timers;
- this.window = window;
- }
-
- @Override
- public void setTimer(Instant timestamp, TimeDomain timeDomain) {
- timers.setTimer(timestamp, timeDomain);
- }
-
- @Override
- public void deleteTimer(Instant timestamp, TimeDomain timeDomain) {
- if (timeDomain == TimeDomain.EVENT_TIME
- && timestamp.equals(window.maxTimestamp())) {
- // Don't allow triggers to unset the at-max-timestamp timer. This is necessary for on-time
- // state transitions.
- return;
- }
- timers.deleteTimer(timestamp, timeDomain);
- }
-
- @Override
- public Instant currentProcessingTime() {
- return timers.currentProcessingTime();
- }
-
- @Override
- @Nullable
- public Instant currentSynchronizedProcessingTime() {
- return timers.currentSynchronizedProcessingTime();
- }
-
- @Override
- @Nullable
- public Instant currentEventTime() {
- return timers.currentEventTime();
- }
- }
-
- private class MergingTriggerInfoImpl
- extends TriggerInfoImpl implements Trigger.MergingTriggerInfo<W> {
-
- private final Map<W, FinishedTriggers> finishedSets;
-
- public MergingTriggerInfoImpl(
- ExecutableTrigger<W> trigger,
- FinishedTriggers finishedSet,
- Trigger<W>.TriggerContext context,
- Map<W, FinishedTriggers> finishedSets) {
- super(trigger, finishedSet, context);
- this.finishedSets = finishedSets;
- }
-
- @Override
- public boolean finishedInAnyMergingWindow() {
- for (FinishedTriggers finishedSet : finishedSets.values()) {
- if (finishedSet.isFinished(trigger)) {
- return true;
- }
- }
- return false;
- }
-
- @Override
- public boolean finishedInAllMergingWindows() {
- for (FinishedTriggers finishedSet : finishedSets.values()) {
- if (!finishedSet.isFinished(trigger)) {
- return false;
- }
- }
- return true;
- }
-
- @Override
- public Iterable<W> getFinishedMergingWindows() {
- return Maps.filterValues(finishedSets, new Predicate<FinishedTriggers>() {
- @Override
- public boolean apply(FinishedTriggers finishedSet) {
- return finishedSet.isFinished(trigger);
- }
- }).keySet();
- }
- }
-
- private class StateAccessorImpl implements StateAccessor<Object> {
- protected final int triggerIndex;
- protected final StateNamespace windowNamespace;
-
- public StateAccessorImpl(
- W window,
- ExecutableTrigger<W> trigger) {
- this.triggerIndex = trigger.getTriggerIndex();
- this.windowNamespace = namespaceFor(window);
- }
-
- protected StateNamespace namespaceFor(W window) {
- return StateNamespaces.windowAndTrigger(windowCoder, window, triggerIndex);
- }
-
- @Override
- public <StateT extends State> StateT access(StateTag<? super Object, StateT> address) {
- return stateInternals.state(windowNamespace, address);
- }
- }
-
- /**
-  * State accessor used while merging: plain access targets the merge-result window, and
-  * {@link #accessInEachMergingWindow} exposes the state of each window being merged.
-  */
- private class MergingStateAccessorImpl extends StateAccessorImpl
-     implements MergingStateAccessor<Object, W> {
-   private final Collection<W> activeToBeMerged;
-
-   public MergingStateAccessorImpl(ExecutableTrigger<W> trigger, Collection<W> activeToBeMerged,
-       W mergeResult) {
-     super(mergeResult, trigger);
-     this.activeToBeMerged = activeToBeMerged;
-   }
-
-   @Override
-   public <StateT extends State> StateT access(
-       StateTag<? super Object, StateT> address) {
-     // Same lookup as the superclass: state in the merge-result window's namespace.
-     return super.access(address);
-   }
-
-   @Override
-   public <StateT extends State> Map<W, StateT> accessInEachMergingWindow(
-       StateTag<? super Object, StateT> address) {
-     ImmutableMap.Builder<W, StateT> perWindow = ImmutableMap.builder();
-     for (W source : activeToBeMerged) {
-       StateNamespace sourceNamespace = namespaceFor(source);
-       perWindow.put(source, stateInternals.state(sourceNamespace, address));
-     }
-     return perWindow.build();
-   }
- }
-
- /**
-  * Basic {@code Trigger.TriggerContext} implementation for one window and one trigger.
-  * Exposes the window, its state accessor, trigger info, timer deletion and the current
-  * times reported by the timers.
-  */
- private class TriggerContextImpl extends Trigger<W>.TriggerContext {
-
- private final W window;
- private final StateAccessorImpl state;
- private final Timers timers;
- private final TriggerInfoImpl triggerInfo;
-
- private TriggerContextImpl(
- W window,
- Timers timers,
- ExecutableTrigger<W> trigger,
- FinishedTriggers finishedSet) {
- // TriggerContext is an inner class of Trigger, so the enclosing Trigger instance is
- // supplied through this qualified superclass constructor call.
- trigger.getSpec().super();
- this.window = window;
- this.state = new StateAccessorImpl(window, trigger);
- // Wrap the timers so that the at-max-timestamp event-time timer cannot be deleted.
- this.timers = new TriggerTimers(window, timers);
- this.triggerInfo = new TriggerInfoImpl(trigger, finishedSet, this);
- }
-
- // Returns a context for another trigger sharing this window and finished set.
- // NOTE(review): the already-wrapped timers field is passed on, so the new context wraps
- // it in a further TriggerTimers layer; confirm the nesting is acceptable.
- @Override
- public Trigger<W>.TriggerContext forTrigger(ExecutableTrigger<W> trigger) {
- return new TriggerContextImpl(window, timers, trigger, triggerInfo.finishedSet);
- }
-
- @Override
- public TriggerInfo<W> trigger() {
- return triggerInfo;
- }
-
- @Override
- public StateAccessor state() {
- return state;
- }
-
- @Override
- public W window() {
- return window;
- }
-
- @Override
- public void deleteTimer(Instant timestamp, TimeDomain domain) {
- timers.deleteTimer(timestamp, domain);
- }
-
- @Override
- public Instant currentProcessingTime() {
- return timers.currentProcessingTime();
- }
-
- @Override
- @Nullable
- public Instant currentSynchronizedProcessingTime() {
- return timers.currentSynchronizedProcessingTime();
- }
-
- @Override
- @Nullable
- public Instant currentEventTime() {
- return timers.currentEventTime();
- }
- }
-
- /**
-  * {@code Trigger.OnElementContext} implementation for one window and one trigger. In
-  * addition to the window, state, trigger info and timers it carries the timestamp of the
-  * element being processed.
-  */
- private class OnElementContextImpl extends Trigger<W>.OnElementContext {
-
- private final W window;
- private final StateAccessorImpl state;
- private final Timers timers;
- private final TriggerInfoImpl triggerInfo;
- private final Instant eventTimestamp;
-
- private OnElementContextImpl(
- W window,
- Timers timers,
- ExecutableTrigger<W> trigger,
- FinishedTriggers finishedSet,
- Instant eventTimestamp) {
- // OnElementContext is an inner class of Trigger, so the enclosing Trigger instance is
- // supplied through this qualified superclass constructor call.
- trigger.getSpec().super();
- this.window = window;
- this.state = new StateAccessorImpl(window, trigger);
- // Wrap the timers so that the at-max-timestamp event-time timer cannot be deleted.
- this.timers = new TriggerTimers(window, timers);
- this.triggerInfo = new TriggerInfoImpl(trigger, finishedSet, this);
- this.eventTimestamp = eventTimestamp;
- }
-
-
- @Override
- public Instant eventTimestamp() {
- return eventTimestamp;
- }
-
- // Returns a context for another trigger sharing this window, finished set and timestamp.
- // NOTE(review): the already-wrapped timers field is passed on, so the new context wraps
- // it in a further TriggerTimers layer; confirm the nesting is acceptable.
- @Override
- public Trigger<W>.OnElementContext forTrigger(ExecutableTrigger<W> trigger) {
- return new OnElementContextImpl(
- window, timers, trigger, triggerInfo.finishedSet, eventTimestamp);
- }
-
- @Override
- public TriggerInfo<W> trigger() {
- return triggerInfo;
- }
-
- @Override
- public StateAccessor state() {
- return state;
- }
-
- @Override
- public W window() {
- return window;
- }
-
- @Override
- public void setTimer(Instant timestamp, TimeDomain domain) {
- timers.setTimer(timestamp, domain);
- }
-
-
- @Override
- public void deleteTimer(Instant timestamp, TimeDomain domain) {
- timers.deleteTimer(timestamp, domain);
- }
-
- @Override
- public Instant currentProcessingTime() {
- return timers.currentProcessingTime();
- }
-
- @Override
- @Nullable
- public Instant currentSynchronizedProcessingTime() {
- return timers.currentSynchronizedProcessingTime();
- }
-
- @Override
- @Nullable
- public Instant currentEventTime() {
- return timers.currentEventTime();
- }
- }
-
- /**
-  * {@code Trigger.OnMergeContext} implementation for merging a set of windows into one
-  * result window. The merging windows are the keys of the per-window finished sets.
-  */
- private class OnMergeContextImpl extends Trigger<W>.OnMergeContext {
-   private final MergingStateAccessor<?, W> state;
-   private final W window;
-   private final Collection<W> mergingWindows;
-   private final Timers timers;
-   private final MergingTriggerInfoImpl triggerInfo;
-
-   /**
-    * @param window the merge-result window
-    * @param timers the timers to delegate to
-    * @param trigger the trigger this context is for
-    * @param finishedSet finished-trigger set of the merge-result window
-    * @param finishedSets finished-trigger set of each window being merged
-    */
-   private OnMergeContextImpl(
-       W window,
-       Timers timers,
-       ExecutableTrigger<W> trigger,
-       FinishedTriggers finishedSet,
-       Map<W, FinishedTriggers> finishedSets) {
-     // OnMergeContext is an inner class of Trigger, so the enclosing Trigger instance is
-     // supplied through this qualified superclass constructor call.
-     trigger.getSpec().super();
-     this.mergingWindows = finishedSets.keySet();
-     this.window = window;
-     this.state = new MergingStateAccessorImpl(trigger, mergingWindows, window);
-     // Wrap the timers so that the at-max-timestamp event-time timer cannot be deleted.
-     this.timers = new TriggerTimers(window, timers);
-     this.triggerInfo = new MergingTriggerInfoImpl(trigger, finishedSet, this, finishedSets);
-   }
-
-   @Override
-   public Trigger<W>.OnMergeContext forTrigger(ExecutableTrigger<W> trigger) {
-     return new OnMergeContextImpl(
-         window, timers, trigger, triggerInfo.finishedSet, triggerInfo.finishedSets);
-   }
-
-   @Override
-   public MergingStateAccessor<?, W> state() {
-     return state;
-   }
-
-   @Override
-   public MergingTriggerInfo<W> trigger() {
-     return triggerInfo;
-   }
-
-   @Override
-   public W window() {
-     return window;
-   }
-
-   @Override
-   public void setTimer(Instant timestamp, TimeDomain domain) {
-     timers.setTimer(timestamp, domain);
-   }
-
-   @Override
-   public void deleteTimer(Instant timestamp, TimeDomain domain) {
-     // Must delete, not set: this previously called setTimer, so deleting a timer during a
-     // merge registered it instead of removing it.
-     timers.deleteTimer(timestamp, domain);
-   }
-
-   @Override
-   public Instant currentProcessingTime() {
-     return timers.currentProcessingTime();
-   }
-
-   @Override
-   @Nullable
-   public Instant currentSynchronizedProcessingTime() {
-     return timers.currentSynchronizedProcessingTime();
-   }
-
-   @Override
-   @Nullable
-   public Instant currentEventTime() {
-     return timers.currentEventTime();
-   }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TriggerRunner.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TriggerRunner.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TriggerRunner.java
deleted file mode 100644
index dcfd035..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/TriggerRunner.java
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.DefaultTrigger;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Trigger;
-import com.google.cloud.dataflow.sdk.util.state.MergingStateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.StateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.StateTag;
-import com.google.cloud.dataflow.sdk.util.state.StateTags;
-import com.google.cloud.dataflow.sdk.util.state.ValueState;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.ImmutableMap;
-
-import org.joda.time.Instant;
-
-import java.util.BitSet;
-import java.util.Collection;
-import java.util.Map;
-
-/**
- * Executes a trigger while managing persistence of information about which subtriggers are
- * finished. Subtriggers include all recursive trigger expressions as well as the entire trigger.
- *
- * <p>Specifically, the responsibilities are:
- *
- * <ul>
- * <li>Invoking the trigger's methods via its {@link ExecutableTrigger} wrapper by
- * constructing the appropriate trigger contexts.</li>
- * <li>Committing a record of which subtriggers are finished to persistent state.</li>
- * <li>Restoring the record of which subtriggers are finished from persistent state.</li>
- * <li>Clearing out the persisted finished set when a caller indicates
- * (via {@link #clearFinished}) that it is no longer needed.</li>
- * </ul>
- *
- * <p>These responsibilities are intertwined: trigger contexts include mutable information about
- * which subtriggers are finished. This class provides the information when building the contexts
- * and commits the information when the method of the {@link ExecutableTrigger} returns.
- *
- * @param <W> The kind of windows being processed.
- */
-public class TriggerRunner<W extends BoundedWindow> {
- @VisibleForTesting
- static final StateTag<Object, ValueState<BitSet>> FINISHED_BITS_TAG =
- StateTags.makeSystemTagInternal(StateTags.value("closed", BitSetCoder.of()));
-
- private final ExecutableTrigger<W> rootTrigger;
- private final TriggerContextFactory<W> contextFactory;
-
- /**
-  * Creates a runner for {@code rootTrigger}, whose trigger index must be 0, using
-  * {@code contextFactory} to build the trigger contexts.
-  */
- public TriggerRunner(ExecutableTrigger<W> rootTrigger, TriggerContextFactory<W> contextFactory) {
-   int rootIndex = rootTrigger.getTriggerIndex();
-   Preconditions.checkState(rootIndex == 0);
-   this.contextFactory = contextFactory;
-   this.rootTrigger = rootTrigger;
- }
-
- /**
-  * Reads the finished bits from {@code state}, falling back to an all-unfinished set sized
-  * for the whole trigger tree when nothing is stored or no lookup is needed.
-  */
- private FinishedTriggersBitSet readFinishedBits(ValueState<BitSet> state) {
-   // If no trigger in the tree will ever have finished bits, then we don't need to read them.
-   // So that the code can be agnostic to that fact, we use a BitSet that is all 0 (not
-   // finished) for each trigger in the tree.
-   BitSet storedBits = isFinishedSetNeeded() ? state.read() : null;
-   if (storedBits != null) {
-     return FinishedTriggersBitSet.fromBitSet(storedBits);
-   }
-   return FinishedTriggersBitSet.emptyWithCapacity(rootTrigger.getFirstIndexAfterSubtree());
- }
-
- /** Return true if the trigger is closed in the window corresponding to the specified state. */
- public boolean isClosed(StateAccessor<?> state) {
- return readFinishedBits(state.access(FINISHED_BITS_TAG)).isFinished(rootTrigger);
- }
-
- public void prefetchForValue(W window, StateAccessor<?> state) {
- if (isFinishedSetNeeded()) {
- state.access(FINISHED_BITS_TAG).readLater();
- }
- rootTrigger.getSpec().prefetchOnElement(
- contextFactory.createStateAccessor(window, rootTrigger));
- }
-
- public void prefetchOnFire(W window, StateAccessor<?> state) {
- if (isFinishedSetNeeded()) {
- state.access(FINISHED_BITS_TAG).readLater();
- }
- rootTrigger.getSpec().prefetchOnFire(contextFactory.createStateAccessor(window, rootTrigger));
- }
-
- public void prefetchShouldFire(W window, StateAccessor<?> state) {
- if (isFinishedSetNeeded()) {
- state.access(FINISHED_BITS_TAG).readLater();
- }
- rootTrigger.getSpec().prefetchShouldFire(
- contextFactory.createStateAccessor(window, rootTrigger));
- }
-
- /**
-  * Runs the trigger's on-element logic for a new value with the given timestamp in
-  * {@code window}, then persists any change to the finished set.
-  */
- public void processValue(W window, Instant timestamp, Timers timers, StateAccessor<?> state)
-     throws Exception {
-   // Work on a copy so that changes can be detected and so that they don't pollute merging.
-   FinishedTriggersBitSet stored = readFinishedBits(state.access(FINISHED_BITS_TAG));
-   FinishedTriggersBitSet workingCopy = stored.copy();
-
-   Trigger<W>.OnElementContext elementContext = contextFactory.createOnElementContext(
-       window, timers, timestamp, rootTrigger, workingCopy);
-   rootTrigger.invokeOnElement(elementContext);
-
-   persistFinishedSet(state, workingCopy);
- }
-
- /**
-  * Requests the state needed to merge {@code mergingWindows} into {@code window}: the
-  * finished bits of every merging window (when they are needed) and whatever the root
-  * trigger prefetches for a merge.
-  */
- public void prefetchForMerge(
-     W window, Collection<W> mergingWindows, MergingStateAccessor<?, W> state) {
-   if (isFinishedSetNeeded()) {
-     Map<W, ValueState<BitSet>> bitsPerWindow =
-         state.accessInEachMergingWindow(FINISHED_BITS_TAG);
-     for (ValueState<?> finishedBits : bitsPerWindow.values()) {
-       finishedBits.readLater();
-     }
-   }
-   Trigger<W> spec = rootTrigger.getSpec();
-   spec.prefetchOnMerge(
-       contextFactory.createMergingStateAccessor(window, mergingWindows, rootTrigger));
- }
-
- /**
-  * Run the trigger merging logic as part of executing the specified merge.
-  *
-  * <p>Reads the finished bits for the accessor's own window and for each merging window,
-  * invokes the root trigger's merge callback with a merge context built from them, persists
-  * the resulting finished set, and finally clears the finished bits through {@code state}.
-  */
- public void onMerge(W window, Timers timers, MergingStateAccessor<?, W> state) throws Exception {
- // Clone so that we can detect changes and so that changes here don't pollute merging.
- FinishedTriggersBitSet finishedSet =
- readFinishedBits(state.access(FINISHED_BITS_TAG)).copy();
-
- // And read the finished bits in each merging window.
- ImmutableMap.Builder<W, FinishedTriggers> builder = ImmutableMap.builder();
- for (Map.Entry<W, ValueState<BitSet>> entry :
- state.accessInEachMergingWindow(FINISHED_BITS_TAG).entrySet()) {
- // Don't need to clone these, since the trigger context doesn't allow modification
- builder.put(entry.getKey(), readFinishedBits(entry.getValue()));
- }
- ImmutableMap<W, FinishedTriggers> mergingFinishedSets = builder.build();
-
- Trigger<W>.OnMergeContext mergeContext = contextFactory.createOnMergeContext(
- window, timers, rootTrigger, finishedSet, mergingFinishedSets);
-
- // Run the merge from the trigger
- rootTrigger.invokeOnMerge(mergeContext);
-
- persistFinishedSet(state, finishedSet);
-
- // Clear the finished bits.
- // NOTE(review): persistFinishedSet and clearFinished both go through state.access on the
- // same accessor, so this appears to clear what was just persisted. Presumably the intent
- // was to clear the bits of the windows being merged away -- TODO confirm with the caller.
- clearFinished(state);
- }
-
- /**
-  * Asks the root trigger whether it should fire in {@code window}. Works on a copy of the
-  * finished bits and does not persist anything.
-  */
- public boolean shouldFire(W window, Timers timers, StateAccessor<?> state) throws Exception {
-   FinishedTriggersBitSet stored = readFinishedBits(state.access(FINISHED_BITS_TAG));
-   FinishedTriggers workingCopy = stored.copy();
-   Trigger<W>.TriggerContext fireContext =
-       contextFactory.base(window, timers, rootTrigger, workingCopy);
-   return rootTrigger.invokeShouldFire(fireContext);
- }
-
- /**
-  * Runs the root trigger's on-fire logic in {@code window}, then persists any change to the
-  * finished set.
-  */
- public void onFire(W window, Timers timers, StateAccessor<?> state) throws Exception {
-   FinishedTriggersBitSet stored = readFinishedBits(state.access(FINISHED_BITS_TAG));
-   FinishedTriggersBitSet workingCopy = stored.copy();
-   Trigger<W>.TriggerContext fireContext =
-       contextFactory.base(window, timers, rootTrigger, workingCopy);
-   rootTrigger.invokeOnFire(fireContext);
-   persistFinishedSet(state, workingCopy);
- }
-
- /**
-  * Stores {@code modifiedFinishedSet} if it differs from what is currently stored. An empty
-  * set is stored by clearing the state rather than writing it.
-  */
- private void persistFinishedSet(
-     StateAccessor<?> state, FinishedTriggersBitSet modifiedFinishedSet) {
-   if (!isFinishedSetNeeded()) {
-     return;
-   }
-
-   ValueState<BitSet> finishedSetState = state.access(FINISHED_BITS_TAG);
-   boolean unchanged = readFinishedBits(finishedSetState).equals(modifiedFinishedSet);
-   if (unchanged) {
-     return;
-   }
-
-   if (modifiedFinishedSet.getBitSet().isEmpty()) {
-     finishedSetState.clear();
-     return;
-   }
-   finishedSetState.write(modifiedFinishedSet.getBitSet());
- }
-
- /**
-  * Clears the persisted finished bits reachable through {@code state}; does nothing when
-  * the finished set is not needed.
-  */
- public void clearFinished(StateAccessor<?> state) {
-   if (!isFinishedSetNeeded()) {
-     return;
-   }
-   state.access(FINISHED_BITS_TAG).clear();
- }
-
- /**
- * Clear the state used for executing triggers, but leave the finished set to indicate
- * the window is closed.
- */
- public void clearState(W window, Timers timers, StateAccessor<?> state) throws Exception {
- // Don't need to clone, because we'll be clearing the finished bits anyways.
- FinishedTriggers finishedSet = readFinishedBits(state.access(FINISHED_BITS_TAG));
- rootTrigger.invokeClear(contextFactory.base(window, timers, rootTrigger, finishedSet));
- }
-
- /** Returns whether finished bits have to be read and written for the root trigger. */
- private boolean isFinishedSetNeeded() {
-   // TODO: If we know that no trigger in the tree will ever finish, we don't need to do the
-   // lookup. Right now, we special case this for the DefaultTrigger.
-   boolean rootIsDefaultTrigger = rootTrigger.getSpec() instanceof DefaultTrigger;
-   return !rootIsDefaultTrigger;
- }
-}
[43/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/DatastoreIO.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/DatastoreIO.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/DatastoreIO.java
deleted file mode 100644
index f618bc9..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/DatastoreIO.java
+++ /dev/null
@@ -1,957 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import static com.google.api.services.datastore.DatastoreV1.PropertyFilter.Operator.EQUAL;
-import static com.google.api.services.datastore.DatastoreV1.PropertyOrder.Direction.DESCENDING;
-import static com.google.api.services.datastore.DatastoreV1.QueryResultBatch.MoreResultsType.NOT_FINISHED;
-import static com.google.api.services.datastore.client.DatastoreHelper.getPropertyMap;
-import static com.google.api.services.datastore.client.DatastoreHelper.makeFilter;
-import static com.google.api.services.datastore.client.DatastoreHelper.makeOrder;
-import static com.google.api.services.datastore.client.DatastoreHelper.makeValue;
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkNotNull;
-import static com.google.common.base.Verify.verify;
-
-import com.google.api.client.auth.oauth2.Credential;
-import com.google.api.client.util.BackOff;
-import com.google.api.client.util.BackOffUtils;
-import com.google.api.client.util.Sleeper;
-import com.google.api.services.datastore.DatastoreV1.CommitRequest;
-import com.google.api.services.datastore.DatastoreV1.Entity;
-import com.google.api.services.datastore.DatastoreV1.EntityResult;
-import com.google.api.services.datastore.DatastoreV1.Key;
-import com.google.api.services.datastore.DatastoreV1.Key.PathElement;
-import com.google.api.services.datastore.DatastoreV1.PartitionId;
-import com.google.api.services.datastore.DatastoreV1.Query;
-import com.google.api.services.datastore.DatastoreV1.QueryResultBatch;
-import com.google.api.services.datastore.DatastoreV1.RunQueryRequest;
-import com.google.api.services.datastore.DatastoreV1.RunQueryResponse;
-import com.google.api.services.datastore.client.Datastore;
-import com.google.api.services.datastore.client.DatastoreException;
-import com.google.api.services.datastore.client.DatastoreFactory;
-import com.google.api.services.datastore.client.DatastoreHelper;
-import com.google.api.services.datastore.client.DatastoreOptions;
-import com.google.api.services.datastore.client.QuerySplitter;
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.EntityCoder;
-import com.google.cloud.dataflow.sdk.coders.SerializableCoder;
-import com.google.cloud.dataflow.sdk.io.Sink.WriteOperation;
-import com.google.cloud.dataflow.sdk.io.Sink.Writer;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineWorkerPoolOptions;
-import com.google.cloud.dataflow.sdk.options.GcpOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.util.AttemptBoundedExponentialBackOff;
-import com.google.cloud.dataflow.sdk.util.RetryHttpRequestInitializer;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.common.base.MoreObjects;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.ImmutableList;
-import com.google.common.primitives.Ints;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.NoSuchElementException;
-
-import javax.annotation.Nullable;
-
-/**
- * <p>{@link DatastoreIO} provides an API to Read and Write {@link PCollection PCollections} of
- * <a href="https://developers.google.com/datastore/">Google Cloud Datastore</a>
- * {@link Entity} objects.
- *
- * <p>Google Cloud Datastore is a fully managed NoSQL data storage service.
- * An {@code Entity} is an object in Datastore, analogous to a row in a traditional
- * database table.
- *
- * <p>This API currently requires an authentication workaround. To use {@link DatastoreIO}, users
- * must use the {@code gcloud} command line tool to get credentials for Datastore:
- * <pre>
- * $ gcloud auth login
- * </pre>
- *
- * <p>To read a {@link PCollection} from a query to Datastore, use {@link DatastoreIO#source} and
- * its methods {@link DatastoreIO.Source#withDataset} and {@link DatastoreIO.Source#withQuery} to
- * specify the dataset to query and the query to read from. You can optionally provide a namespace
- * to query within using {@link DatastoreIO.Source#withNamespace} or a Datastore host using
- * {@link DatastoreIO.Source#withHost}.
- *
- * <p>For example:
- *
- * <pre> {@code
- * // Read a query from Datastore
- * PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
- * Query query = ...;
- * String datasetId = "...";
- * String host = "...";
- *
- * Pipeline p = Pipeline.create(options);
- * PCollection<Entity> entities = p.apply(
- * Read.from(DatastoreIO.source()
- * .withDataset(datasetId)
- * .withQuery(query)
- * .withHost(host)));
- * } </pre>
- *
- * <p>or:
- *
- * <pre> {@code
- * // Read a query from Datastore using the default namespace and host
- * PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
- * Query query = ...;
- * String datasetId = "...";
- *
- * Pipeline p = Pipeline.create(options);
- * PCollection<Entity> entities = p.apply(DatastoreIO.readFrom(datasetId, query));
- * p.run();
- * } </pre>
- *
- * <p><b>Note:</b> Normally, a Cloud Dataflow job will read from Cloud Datastore in parallel across
- * many workers. However, when the {@link Query} is configured with a limit using
- * {@link com.google.api.services.datastore.DatastoreV1.Query.Builder#setLimit(int)}, then
- * all returned results will be read by a single Dataflow worker in order to ensure correct data.
- *
- * <p>To write a {@link PCollection} to a Datastore, use {@link DatastoreIO#writeTo},
- * specifying the datastore to write to:
- *
- * <pre> {@code
- * PCollection<Entity> entities = ...;
- * entities.apply(DatastoreIO.writeTo(dataset));
- * p.run();
- * } </pre>
- *
- * <p>To optionally change the host that is used to write to the Datastore, use {@link
- * DatastoreIO#sink} to build a {@link DatastoreIO.Sink} and write to it using the {@link Write}
- * transform:
- *
- * <pre> {@code
- * PCollection<Entity> entities = ...;
- * entities.apply(Write.to(DatastoreIO.sink().withDataset(dataset).withHost(host)));
- * } </pre>
- *
- * <p>{@link Entity Entities} in the {@code PCollection} to be written must have complete
- * {@link Key Keys}. Complete {@code Keys} specify the {@code name} or {@code id} of the
- * {@code Entity}, where incomplete {@code Keys} do not. A {@code namespace} other than the
- * project default may be written to by specifying it in the {@code Entity} {@code Keys}.
- *
- * <pre>{@code
- * Key.Builder keyBuilder = DatastoreHelper.makeKey(...);
- * keyBuilder.getPartitionIdBuilder().setNamespace(namespace);
- * }</pre>
- *
- * <p>{@code Entities} will be committed as upsert (update or insert) mutations. Please read
- * <a href="https://cloud.google.com/datastore/docs/concepts/entities">Entities, Properties, and
- * Keys</a> for more information about {@code Entity} keys.
- *
- * <p><h3>Permissions</h3>
- * Permission requirements depend on the {@code PipelineRunner} that is used to execute the
- * Dataflow job. Please refer to the documentation of corresponding {@code PipelineRunner}s for
- * more details.
- *
- * <p>Please see <a href="https://cloud.google.com/datastore/docs/activate">Cloud Datastore Sign Up
- * </a>for security and permission related information specific to Datastore.
- *
- * @see com.google.cloud.dataflow.sdk.runners.PipelineRunner
- */
-@Experimental(Experimental.Kind.SOURCE_SINK)
-public class DatastoreIO {
- /** Host passed to newly created sources and sinks until {@code withHost} overrides it. */
- public static final String DEFAULT_HOST = "https://www.googleapis.com";
-
- /**
- * Datastore has a limit of 500 mutations per batch operation, so we flush
- * changes to Datastore every 500 entities.
- */
- public static final int DATASTORE_BATCH_UPDATE_LIMIT = 500;
-
-  /**
-   * Returns an empty {@link DatastoreIO.Source} builder with the default {@code host}; this
-   * simply delegates to {@link #source()}.
-   *
-   * <p>Configure the {@code dataset}, {@code query}, and {@code namespace} using
-   * {@link DatastoreIO.Source#withDataset}, {@link DatastoreIO.Source#withQuery},
-   * and {@link DatastoreIO.Source#withNamespace}.
-   *
-   * @deprecated the name and return type do not match. Use {@link #source()}.
-   */
-  @Deprecated
-  public static Source read() {
-    return source();
-  }
-
-  /**
-   * Returns an empty {@link DatastoreIO.Source} builder with the default {@code host} and no
-   * dataset, query, or namespace set.
-   *
-   * <p>Configure the {@code dataset}, {@code query}, and {@code namespace} using
-   * {@link DatastoreIO.Source#withDataset}, {@link DatastoreIO.Source#withQuery},
-   * and {@link DatastoreIO.Source#withNamespace}.
-   *
-   * <p>The resulting {@link Source} object can be passed to {@link Read} to create a
-   * {@code PTransform} that will read from Datastore.
-   */
-  public static Source source() {
-    Source emptySource = new Source(DEFAULT_HOST, null, null, null);
-    return emptySource;
-  }
-
-  /**
-   * Returns a {@code PTransform} that reads Datastore entities from the query
-   * against the given dataset, using the default host and no namespace.
-   */
-  public static Read.Bounded<Entity> readFrom(String datasetId, Query query) {
-    Source querySource = new Source(DEFAULT_HOST, datasetId, query, null);
-    return Read.from(querySource);
-  }
-
-  /**
-   * Returns a {@code PTransform} that reads Datastore entities from the query
-   * against the given dataset and host, with no namespace.
-   *
-   * @deprecated prefer {@link #source()} with {@link Source#withHost}, {@link Source#withDataset},
-   *    and {@link Source#withQuery}.
-   */
-  @Deprecated
-  public static Read.Bounded<Entity> readFrom(String host, String datasetId, Query query) {
-    Source querySource = new Source(host, datasetId, query, null);
-    return Read.from(querySource);
-  }
-
-  /**
-   * A {@link Source} that reads the result rows of a Datastore query as {@code Entity} objects.
-   *
-   * <p>Every {@code with*} method returns a new {@code Source} and does not modify the receiver.
-   */
-  public static class Source extends BoundedSource<Entity> {
-    private static final Logger LOG = LoggerFactory.getLogger(Source.class);
-
-    /** Number of splits requested when neither a size estimate nor a worker count is known. */
-    private static final int DEFAULT_NUM_SPLITS = 12;
-
-    private final String host;
-    /** Not really nullable, but it may be {@code null} for in-progress {@code Source}s. */
-    @Nullable
-    private final String datasetId;
-    /** Not really nullable, but it may be {@code null} for in-progress {@code Source}s. */
-    @Nullable
-    private final Query query;
-    @Nullable
-    private final String namespace;
-
-    /** For testing only. TODO: This could be much cleaner with dependency injection. */
-    @Nullable
-    private QuerySplitter mockSplitter;
-    /** For testing only: when set, {@link #getEstimatedSizeBytes} returns this value. */
-    @Nullable
-    private Long mockEstimateSizeBytes;
-
-    /**
-     * Note that only {@code namespace} is really {@code @Nullable}. The other parameters may be
-     * {@code null} while a {@code Source} is still being built; {@link #validate} rejects a
-     * {@code Source} whose {@code datasetId} or {@code query} is still {@code null}.
-     */
-    private Source(
-        String host, @Nullable String datasetId, @Nullable Query query,
-        @Nullable String namespace) {
-      this.host = checkNotNull(host, "host");
-      this.datasetId = datasetId;
-      this.query = query;
-      this.namespace = namespace;
-    }
-
-    /** Returns the Datastore host this source talks to. */
-    public String getHost() {
-      return host;
-    }
-
-    /** Returns the dataset id, or {@code null} if it has not been set yet. */
-    public String getDataset() {
-      return datasetId;
-    }
-
-    /** Returns the query to read, or {@code null} if it has not been set yet. */
-    public Query getQuery() {
-      return query;
-    }
-
-    /** Returns the namespace to query within, or {@code null} if none was set. */
-    @Nullable
-    public String getNamespace() {
-      return namespace;
-    }
-
-    /**
-     * Returns a new {@link Source} that reads from the given dataset.
-     *
-     * <p>Does not modify this object.
-     */
-    public Source withDataset(String datasetId) {
-      checkNotNull(datasetId, "datasetId");
-      return new Source(host, datasetId, query, namespace);
-    }
-
-    /**
-     * Returns a new {@link Source} that reads the results of the specified query.
-     *
-     * <p>Does not modify this object.
-     *
-     * <p><b>Note:</b> Normally, a Cloud Dataflow job will read from Cloud Datastore in parallel
-     * across many workers. However, when the {@link Query} is configured with a limit using
-     * {@link com.google.api.services.datastore.DatastoreV1.Query.Builder#setLimit(int)}, then all
-     * returned results will be read by a single Dataflow worker in order to ensure correct data.
-     *
-     * @throws IllegalArgumentException if the query has a limit that is not positive
-     */
-    public Source withQuery(Query query) {
-      checkNotNull(query, "query");
-      checkArgument(!query.hasLimit() || query.getLimit() > 0,
-          "Invalid query limit %s: must be positive", query.getLimit());
-      return new Source(host, datasetId, query, namespace);
-    }
-
-    /**
-     * Returns a new {@link Source} that uses the given Datastore host.
-     *
-     * <p>Does not modify this object.
-     */
-    public Source withHost(String host) {
-      checkNotNull(host, "host");
-      return new Source(host, datasetId, query, namespace);
-    }
-
-    /**
-     * Returns a new {@link Source} that queries within the given namespace; {@code null} means
-     * that no namespace is set on requests.
-     *
-     * <p>Does not modify this object.
-     */
-    public Source withNamespace(@Nullable String namespace) {
-      return new Source(host, datasetId, query, namespace);
-    }
-
-    @Override
-    public Coder<Entity> getDefaultOutputCoder() {
-      return EntityCoder.of();
-    }
-
-    @Override
-    public boolean producesSortedKeys(PipelineOptions options) {
-      // TODO: Perhaps this can be implemented by inspecting the query.
-      return false;
-    }
-
-    /**
-     * Splits this source into sources over sub-queries so that they can be read in parallel.
-     *
-     * <p>Returns just this source when the query has a limit, when at most one split is wanted,
-     * or when the query cannot be split.
-     */
-    @Override
-    public List<Source> splitIntoBundles(long desiredBundleSizeBytes, PipelineOptions options)
-        throws Exception {
-      // Users may request a limit on the number of results. We can currently support this by
-      // simply disabling parallel reads and using only a single split.
-      if (query.hasLimit()) {
-        return ImmutableList.of(this);
-      }
-
-      // If the desiredBundleSize or number of workers results in 1 split, simply return
-      // a source that reads from the original query.
-      long numSplits = estimateNumSplits(desiredBundleSizeBytes, options);
-      if (numSplits <= 1) {
-        return ImmutableList.of(this);
-      }
-
-      List<Query> datastoreSplits;
-      try {
-        datastoreSplits = getSplitQueries(Ints.checkedCast(numSplits), options);
-      } catch (IllegalArgumentException | DatastoreException e) {
-        LOG.warn("Unable to parallelize the given query: {}", query, e);
-        return ImmutableList.of(this);
-      }
-
-      ImmutableList.Builder<Source> splits = ImmutableList.builder();
-      for (Query splitQuery : datastoreSplits) {
-        splits.add(new Source(host, datasetId, splitQuery, namespace));
-      }
-      return splits.build();
-    }
-
-    /**
-     * Picks the number of splits: estimated size divided by the desired bundle size, or, when
-     * the estimate cannot be obtained, the configured number of workers, or finally
-     * {@link #DEFAULT_NUM_SPLITS}.
-     */
-    private long estimateNumSplits(long desiredBundleSizeBytes, PipelineOptions options) {
-      try {
-        return Math.round(((double) getEstimatedSizeBytes(options)) / desiredBundleSizeBytes);
-      } catch (Exception e) {
-        // Fallback in case estimated size is unavailable. TODO: fix this, it's horrible.
-
-        // 1. Try Dataflow's numWorkers, which will be 0 for other workers.
-        int numWorkers = options.as(DataflowPipelineWorkerPoolOptions.class).getNumWorkers();
-        if (numWorkers > 0) {
-          LOG.warn("Estimated size unavailable, using the number of workers {}", numWorkers, e);
-          return numWorkers;
-        }
-
-        // 2. Default to a fixed number of splits in the unknown case.
-        LOG.warn("Estimated size unavailable, using the default of {} splits",
-            DEFAULT_NUM_SPLITS, e);
-        return DEFAULT_NUM_SPLITS;
-      }
-    }
-
-    @Override
-    public BoundedReader<Entity> createReader(PipelineOptions pipelineOptions) throws IOException {
-      return new DatastoreReader(this, getDatastore(pipelineOptions));
-    }
-
-    /** Ensures the host, query, and dataset are set. */
-    @Override
-    public void validate() {
-      checkNotNull(host, "host");
-      checkNotNull(query, "query");
-      checkNotNull(datasetId, "datasetId");
-    }
-
-    /**
-     * Estimates the size of the query result as the total size of the queried entity kind.
-     *
-     * @throws UnsupportedOperationException if the query does not specify exactly one kind
-     * @throws NoSuchElementException if Datastore has no statistics for the kind or dataset
-     */
-    @Override
-    public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
-      // Datastore provides no way to get a good estimate of how large the result of a query
-      // will be. As a rough approximation, we attempt to fetch the statistics of the whole
-      // entity kind being queried, using the per-kind statistics system table, assuming exactly
-      // 1 kind is specified in the query.
-      //
-      // See https://cloud.google.com/datastore/docs/concepts/stats
-      if (mockEstimateSizeBytes != null) {
-        return mockEstimateSizeBytes;
-      }
-
-      Datastore datastore = getDatastore(options);
-      if (query.getKindCount() != 1) {
-        throw new UnsupportedOperationException(
-            "Can only estimate size for queries specifying exactly 1 kind.");
-      }
-      String ourKind = query.getKind(0).getName();
-      long latestTimestamp = queryLatestStatisticsTimestamp(datastore);
-
-      // Deliberately not named "query", so the user query held in the field is not shadowed.
-      Query.Builder statsQuery = Query.newBuilder();
-      // Requests carrying a namespace must read the namespace-specific statistics kind.
-      statsQuery.addKindBuilder().setName(
-          namespace == null ? "__Stat_Kind__" : "__Stat_Ns_Kind__");
-      statsQuery.setFilter(makeFilter(
-          makeFilter("kind_name", EQUAL, makeValue(ourKind)).build(),
-          makeFilter("timestamp", EQUAL, makeValue(latestTimestamp)).build()));
-      RunQueryRequest request = makeRequest(statsQuery.build());
-
-      long startMillis = System.currentTimeMillis();
-      RunQueryResponse response = datastore.runQuery(request);
-      LOG.info("Query for per-kind statistics took {}ms",
-          System.currentTimeMillis() - startMillis);
-
-      QueryResultBatch batch = response.getBatch();
-      if (batch.getEntityResultCount() == 0) {
-        throw new NoSuchElementException(
-            "Datastore statistics for kind " + ourKind + " unavailable");
-      }
-      Entity entity = batch.getEntityResult(0).getEntity();
-      return getPropertyMap(entity).get("entity_bytes").getIntegerValue();
-    }
-
-    @Override
-    public String toString() {
-      return MoreObjects.toStringHelper(getClass())
-          .add("host", host)
-          .add("dataset", datasetId)
-          .add("query", query)
-          .add("namespace", namespace)
-          .toString();
-    }
-
-    ///////////////////////////////////////////////////////////////////////////////////////////
-
-    /**
-     * A helper function to get the split queries, taking into account the optional
-     * {@code namespace} and whether there is a mock splitter.
-     */
-    private List<Query> getSplitQueries(int numSplits, PipelineOptions options)
-        throws DatastoreException {
-      // If namespace is set, include it in the split request so splits are calculated accordingly.
-      PartitionId.Builder partitionBuilder = PartitionId.newBuilder();
-      if (namespace != null) {
-        partitionBuilder.setNamespace(namespace);
-      }
-      PartitionId partition = partitionBuilder.build();
-
-      if (mockSplitter != null) {
-        // For testing.
-        return mockSplitter.getSplits(query, partition, numSplits, null);
-      }
-
-      return DatastoreHelper.getQuerySplitter().getSplits(
-          query, partition, numSplits, getDatastore(options));
-    }
-
-    /**
-     * Builds a {@link RunQueryRequest} from the {@code query}, using the properties set on this
-     * {@code Source}. For example, sets the {@code namespace} for the request.
-     */
-    private RunQueryRequest makeRequest(Query query) {
-      RunQueryRequest.Builder requestBuilder = RunQueryRequest.newBuilder().setQuery(query);
-      if (namespace != null) {
-        requestBuilder.getPartitionIdBuilder().setNamespace(namespace);
-      }
-      return requestBuilder.build();
-    }
-
-    /**
-     * Datastore system tables with statistics are periodically updated. This method fetches
-     * the latest timestamp of statistics update using the {@code __Stat_Total__} table, or the
-     * {@code __Stat_Ns_Total__} table when a namespace is set.
-     *
-     * @throws NoSuchElementException if no total statistics are available
-     */
-    private long queryLatestStatisticsTimestamp(Datastore datastore) throws DatastoreException {
-      Query.Builder totalsQuery = Query.newBuilder();
-      // Requests carrying a namespace must read the namespace-specific statistics kind.
-      totalsQuery.addKindBuilder().setName(
-          namespace == null ? "__Stat_Total__" : "__Stat_Ns_Total__");
-      totalsQuery.addOrder(makeOrder("timestamp", DESCENDING));
-      totalsQuery.setLimit(1);
-      RunQueryRequest request = makeRequest(totalsQuery.build());
-
-      long startMillis = System.currentTimeMillis();
-      RunQueryResponse response = datastore.runQuery(request);
-      LOG.info("Query for latest stats timestamp of dataset {} took {}ms", datasetId,
-          System.currentTimeMillis() - startMillis);
-      QueryResultBatch batch = response.getBatch();
-      if (batch.getEntityResultCount() == 0) {
-        throw new NoSuchElementException(
-            "Datastore total statistics for dataset " + datasetId + " unavailable");
-      }
-      Entity entity = batch.getEntityResult(0).getEntity();
-      return getPropertyMap(entity).get("timestamp").getTimestampMicrosecondsValue();
-    }
-
-    /**
-     * Creates a Datastore client for this source's host and dataset, attaching the GCP credential
-     * from the pipeline options when one is present.
-     */
-    private Datastore getDatastore(PipelineOptions pipelineOptions) {
-      DatastoreOptions.Builder builder =
-          new DatastoreOptions.Builder().host(host).dataset(datasetId).initializer(
-              new RetryHttpRequestInitializer());
-
-      Credential credential = pipelineOptions.as(GcpOptions.class).getGcpCredential();
-      if (credential != null) {
-        builder.credential(credential);
-      }
-      return DatastoreFactory.get().create(builder.build());
-    }
-
-    /** For testing only. */
-    Source withMockSplitter(QuerySplitter splitter) {
-      Source res = new Source(host, datasetId, query, namespace);
-      res.mockSplitter = splitter;
-      res.mockEstimateSizeBytes = mockEstimateSizeBytes;
-      return res;
-    }
-
-    /** For testing only. */
-    Source withMockEstimateSizeBytes(Long estimateSizeBytes) {
-      Source res = new Source(host, datasetId, query, namespace);
-      res.mockSplitter = mockSplitter;
-      res.mockEstimateSizeBytes = estimateSizeBytes;
-      return res;
-    }
-  }
-
- ///////////////////// Write Class /////////////////////////////////
-
-  /**
-   * Returns a new {@link DatastoreIO.Sink} builder that uses the default host and has no dataset
-   * set. You need to further configure it using {@link DatastoreIO.Sink#withDataset}, and
-   * optionally {@link DatastoreIO.Sink#withHost} before using it in a {@link Write} transform.
-   *
-   * <p>For example: {@code p.apply(Write.to(DatastoreIO.sink().withDataset(dataset)));}
-   */
-  public static Sink sink() {
-    Sink emptySink = new Sink(DEFAULT_HOST, null);
-    return emptySink;
-  }
-
-  /**
-   * Returns a new {@link Write} transform that will write to a {@link Sink} for the given
-   * dataset on the default host.
-   *
-   * <p>For example: {@code p.apply(DatastoreIO.writeTo(dataset));}
-   */
-  public static Write.Bound<Entity> writeTo(String datasetId) {
-    Sink datasetSink = sink().withDataset(datasetId);
-    return Write.to(datasetSink);
-  }
-
-  /**
-   * A {@link Sink} that writes a {@link PCollection} containing
-   * {@link Entity Entities} to Datastore.
-   *
-   * <p>Every {@code with*} method returns a new {@code Sink} and does not modify the receiver.
-   */
-  public static class Sink extends com.google.cloud.dataflow.sdk.io.Sink<Entity> {
-    final String host;
-    /** May be {@code null} until {@link #withDataset} is called; checked by {@link #validate}. */
-    @Nullable
-    final String datasetId;
-
-    /**
-     * Returns a {@link Sink} that is like this one, but will write to the specified dataset.
-     */
-    public Sink withDataset(String datasetId) {
-      checkNotNull(datasetId, "datasetId");
-      return new Sink(host, datasetId);
-    }
-
-    /**
-     * Returns a {@link Sink} that is like this one, but will use the given host. If not specified,
-     * the {@link DatastoreIO#DEFAULT_HOST default host} will be used.
-     */
-    public Sink withHost(String host) {
-      checkNotNull(host, "host");
-      return new Sink(host, datasetId);
-    }
-
-    /**
-     * Constructs a Sink with given host and dataset.
-     *
-     * @param host the Datastore host; must not be {@code null}
-     * @param datasetId the dataset to write to; may be {@code null} while the sink is being built
-     */
-    protected Sink(String host, @Nullable String datasetId) {
-      this.host = checkNotNull(host, "host");
-      this.datasetId = datasetId;
-    }
-
-    /**
-     * Ensures the host and dataset are set.
-     */
-    @Override
-    public void validate(PipelineOptions options) {
-      checkNotNull(
-          host, "Host is a required parameter. Please use withHost to set the host.");
-      checkNotNull(
-          datasetId,
-          "Dataset ID is a required parameter. Please use withDataset to set the datasetId.");
-    }
-
-    @Override
-    public DatastoreWriteOperation createWriteOperation(PipelineOptions options) {
-      return new DatastoreWriteOperation(this);
-    }
-  }
-
-  /**
-   * A {@link WriteOperation} that will manage a parallel write to a Datastore sink.
-   */
-  private static class DatastoreWriteOperation
-      extends WriteOperation<Entity, DatastoreWriteResult> {
-    private static final Logger LOG = LoggerFactory.getLogger(DatastoreWriteOperation.class);
-
-    private final DatastoreIO.Sink sink;
-
-    public DatastoreWriteOperation(DatastoreIO.Sink sink) {
-      this.sink = sink;
-    }
-
-    @Override
-    public Coder<DatastoreWriteResult> getWriterResultCoder() {
-      return SerializableCoder.of(DatastoreWriteResult.class);
-    }
-
-    /** Nothing needs to be set up before writing starts. */
-    @Override
-    public void initialize(PipelineOptions options) throws Exception {}
-
-    /**
-     * Finalizes the write by logging the total number of entities reported by all writers.
-     */
-    @Override
-    public void finalize(Iterable<DatastoreWriteResult> writerResults, PipelineOptions options)
-        throws Exception {
-      long entityCount = 0;
-      for (DatastoreWriteResult writerResult : writerResults) {
-        entityCount += writerResult.entitiesWritten;
-      }
-      LOG.info("Wrote {} elements.", entityCount);
-    }
-
-    /**
-     * Creates a writer backed by a Datastore client for the sink's host and dataset. The GCP
-     * credential from the pipeline options is attached to the client when one is present.
-     */
-    @Override
-    public DatastoreWriter createWriter(PipelineOptions options) throws Exception {
-      DatastoreOptions.Builder clientOptions = new DatastoreOptions.Builder();
-      clientOptions.host(sink.host);
-      clientOptions.dataset(sink.datasetId);
-      clientOptions.initializer(new RetryHttpRequestInitializer());
-
-      Credential credential = options.as(GcpOptions.class).getGcpCredential();
-      if (credential != null) {
-        clientOptions.credential(credential);
-      }
-
-      Datastore datastore = DatastoreFactory.get().create(clientOptions.build());
-      return new DatastoreWriter(this, datastore);
-    }
-
-    @Override
-    public DatastoreIO.Sink getSink() {
-      return sink;
-    }
-  }
-
-  /**
-   * {@link Writer} that writes entities to a Datastore Sink. Entities are written in batches,
-   * where the maximum batch size is {@link DatastoreIO#DATASTORE_BATCH_UPDATE_LIMIT}. Entities
-   * are committed as upsert mutations (either update if the key already exists, or insert if it is
-   * a new key). If an entity does not have a complete key (i.e., it has no name or id), the bundle
-   * will fail.
-   *
-   * <p>See <a
-   * href="https://cloud.google.com/datastore/docs/concepts/entities#Datastore_Creating_an_entity">
-   * Datastore: Entities, Properties, and Keys</a> for information about entity keys and upsert
-   * mutations.
-   *
-   * <p>Commits are non-transactional. If a commit fails with a {@link DatastoreException}, the
-   * commit will be retried (up to {@link DatastoreWriter#MAX_RETRIES} times).
-   *
-   * <p>Visible for testing purposes.
-   */
-  static class DatastoreWriter extends Writer<Entity, DatastoreWriteResult> {
-    private static final Logger LOG = LoggerFactory.getLogger(DatastoreWriter.class);
-
-    /**
-     * Since a bundle is written in batches, we should retry the commit of a batch in order to
-     * prevent transient errors from causing the bundle to fail.
-     */
-    private static final int MAX_RETRIES = 5;
-
-    /**
-     * Initial backoff time for exponential backoff for retry attempts.
-     */
-    private static final int INITIAL_BACKOFF_MILLIS = 5000;
-
-    private final DatastoreWriteOperation writeOp;
-    private final Datastore datastore;
-    /** Number of entities committed so far by this writer. */
-    private long totalWritten = 0;
-
-    // Visible for testing.
-    final List<Entity> entities = new ArrayList<>();
-
-    /**
-     * Returns true if a Datastore key is complete. A key is complete if its last element
-     * has either an id or a name.
-     */
-    static boolean isValidKey(Key key) {
-      List<PathElement> path = key.getPathElementList();
-      if (path.isEmpty()) {
-        return false;
-      }
-      PathElement lastElement = path.get(path.size() - 1);
-      return lastElement.hasId() || lastElement.hasName();
-    }
-
-    // Visible for testing
-    DatastoreWriter(DatastoreWriteOperation writeOp, Datastore datastore) {
-      this.writeOp = writeOp;
-      this.datastore = datastore;
-    }
-
-    /** Nothing needs to be opened; the Datastore client is supplied at construction. */
-    @Override
-    public void open(String uId) throws Exception {}
-
-    /**
-     * Writes an entity to the Datastore. Writes are batched, up to {@link
-     * DatastoreIO#DATASTORE_BATCH_UPDATE_LIMIT}. If an entity does not have a complete key, an
-     * {@link IllegalArgumentException} will be thrown.
-     */
-    @Override
-    public void write(Entity value) throws Exception {
-      // Verify that the entity to write has a complete key.
-      if (!isValidKey(value.getKey())) {
-        throw new IllegalArgumentException(
-            "Entities to be written to the Datastore must have complete keys");
-      }
-
-      entities.add(value);
-
-      if (entities.size() >= DatastoreIO.DATASTORE_BATCH_UPDATE_LIMIT) {
-        flushBatch();
-      }
-    }
-
-    /**
-     * Flushes any pending batch writes and returns a DatastoreWriteResult.
-     */
-    @Override
-    public DatastoreWriteResult close() throws Exception {
-      if (!entities.isEmpty()) {
-        flushBatch();
-      }
-      return new DatastoreWriteResult(totalWritten);
-    }
-
-    @Override
-    public DatastoreWriteOperation getWriteOperation() {
-      return writeOp;
-    }
-
-    /**
-     * Writes a batch of entities to the Datastore.
-     *
-     * <p>If a commit fails, it will be retried (up to {@link DatastoreWriter#MAX_RETRIES}
-     * times). All entities in the batch will be committed again, even if the commit was partially
-     * successful. If the retry limit is exceeded, the last exception from the Datastore will be
-     * thrown.
-     *
-     * @throws DatastoreException if the commit fails or IOException or InterruptedException if
-     *   backing off between retries fails.
-     */
-    private void flushBatch() throws DatastoreException, IOException, InterruptedException {
-      int batchSize = entities.size();
-      LOG.debug("Writing batch of {} entities", batchSize);
-      Sleeper sleeper = Sleeper.DEFAULT;
-      BackOff backoff = new AttemptBoundedExponentialBackOff(MAX_RETRIES, INITIAL_BACKOFF_MILLIS);
-
-      // The request is the same for every attempt, so build the batch upsert only once.
-      CommitRequest.Builder commitRequest = CommitRequest.newBuilder();
-      commitRequest.getMutationBuilder().addAllUpsert(entities);
-      commitRequest.setMode(CommitRequest.Mode.NON_TRANSACTIONAL);
-      CommitRequest request = commitRequest.build();
-
-      while (true) {
-        try {
-          datastore.commit(request);
-          // Break if the commit threw no exception.
-          break;
-        } catch (DatastoreException exception) {
-          // Only log the code and message for potentially-transient errors. The entire exception
-          // will be propagated upon the last retry.
-          LOG.error("Error writing to the Datastore ({}): {}", exception.getCode(),
-              exception.getMessage());
-          if (!BackOffUtils.next(sleeper, backoff)) {
-            LOG.error("Aborting after {} retries.", MAX_RETRIES);
-            throw exception;
-          }
-        }
-      }
-
-      totalWritten += batchSize;
-      LOG.debug("Successfully wrote {} entities", batchSize);
-      entities.clear();
-    }
-  }
-
-  /** Serializable result of a single writer: the number of entities that writer reported. */
-  private static class DatastoreWriteResult implements Serializable {
-    /** Number of entities the writer reported as written. */
-    final long entitiesWritten;
-
-    public DatastoreWriteResult(long recordsWritten) {
-      entitiesWritten = recordsWritten;
-    }
-  }
-
-  /**
-   * A {@link Source.Reader} over the records from a query of the datastore.
-   *
-   * <p>Results are fetched in batches of at most {@link #QUERY_BATCH_LIMIT} entities.
-   *
-   * <p>Timestamped records are currently not supported.
-   * All records implicitly have the timestamp of {@code BoundedWindow.TIMESTAMP_MIN_VALUE}.
-   */
-  public static class DatastoreReader extends BoundedSource.BoundedReader<Entity> {
-    /**
-     * Maximum number of results to request per query.
-     *
-     * <p>Must be set, or it may result in an I/O error when querying
-     * Cloud Datastore.
-     */
-    private static final int QUERY_BATCH_LIMIT = 500;
-
-    private final Source source;
-
-    /**
-     * Datastore to read from.
-     */
-    private final Datastore datastore;
-
-    /**
-     * True if more results may be available.
-     */
-    private boolean moreResults;
-
-    /**
-     * Iterator over the entities of the current batch; {@code null} before the first fetch and
-     * after a fetch that returned no entities.
-     */
-    private Iterator<EntityResult> entities;
-
-    /**
-     * Current batch of query results.
-     */
-    private QueryResultBatch currentBatch;
-
-    /**
-     * Remaining user-requested limit on the number of entities to return. If the user did not set
-     * a limit, then this variable will always have the value {@link Integer#MAX_VALUE} and will
-     * never be decremented.
-     */
-    private int userLimit;
-
-    /** The entity returned by {@link #getCurrent}, or {@code null} when there is none. */
-    private Entity currentEntity;
-
-    /**
-     * Returns a DatastoreReader with Source and Datastore object set.
-     *
-     * @param source the source whose query is read
-     * @param datastore a datastore connection to use.
-     */
-    public DatastoreReader(Source source, Datastore datastore) {
-      this.source = source;
-      this.datastore = datastore;
-      // If the user set a limit on the query, remember it. Otherwise pin to MAX_VALUE.
-      userLimit = source.query.hasLimit() ? source.query.getLimit() : Integer.MAX_VALUE;
-    }
-
-    /**
-     * Returns the entity read by the last successful {@link #start} or {@link #advance}.
-     *
-     * @throws NoSuchElementException if there is no current entity, i.e. reading has not started
-     *     or the last {@link #start} or {@link #advance} returned {@code false}
-     */
-    @Override
-    public Entity getCurrent() {
-      if (currentEntity == null) {
-        throw new NoSuchElementException();
-      }
-      return currentEntity;
-    }
-
-    @Override
-    public boolean start() throws IOException {
-      return advance();
-    }
-
-    /**
-     * Moves to the next entity, fetching another batch from Datastore when the current one is
-     * used up and more results may exist.
-     *
-     * @return {@code true} if an entity is now available from {@link #getCurrent}
-     * @throws IOException wrapping any {@link DatastoreException} raised by the query
-     */
-    @Override
-    public boolean advance() throws IOException {
-      boolean batchExhausted = entities != null && !entities.hasNext();
-      if (entities == null || (batchExhausted && moreResults)) {
-        try {
-          entities = getIteratorAndMoveCursor();
-        } catch (DatastoreException e) {
-          throw new IOException(e);
-        }
-      }
-
-      // NOTE(review): an empty batch ends the read even if moreResults is set; confirm that
-      // Datastore never returns an empty batch while more results remain.
-      if (entities == null || !entities.hasNext()) {
-        currentEntity = null;
-        return false;
-      }
-
-      currentEntity = entities.next().getEntity();
-      return true;
-    }
-
-    @Override
-    public void close() throws IOException {
-      // Nothing
-    }
-
-    @Override
-    public DatastoreIO.Source getCurrentSource() {
-      return source;
-    }
-
-    @Override
-    public DatastoreIO.Source splitAtFraction(double fraction) {
-      // Not supported.
-      return null;
-    }
-
-    @Override
-    public Double getFractionConsumed() {
-      // Not supported.
-      return null;
-    }
-
-    /**
-     * Returns an iterator over the next batch of records for the query
-     * and updates the cursor to get the next batch as needed.
-     *
-     * <p>The request limit is the smaller of the remaining user limit and
-     * {@link #QUERY_BATCH_LIMIT}, and the request starts at the end cursor of the previous batch
-     * when there is one. Also updates {@code userLimit} and {@code moreResults}.
-     *
-     * @return an iterator over the fetched entities, or {@code null} if the batch was empty
-     */
-    private Iterator<EntityResult> getIteratorAndMoveCursor() throws DatastoreException {
-      // toBuilder() already returns a fresh builder, so no additional clone is needed.
-      Query.Builder query = source.query.toBuilder();
-      query.setLimit(Math.min(userLimit, QUERY_BATCH_LIMIT));
-      if (currentBatch != null && currentBatch.hasEndCursor()) {
-        query.setStartCursor(currentBatch.getEndCursor());
-      }
-
-      RunQueryRequest request = source.makeRequest(query.build());
-      RunQueryResponse response = datastore.runQuery(request);
-
-      currentBatch = response.getBatch();
-
-      // MORE_RESULTS_AFTER_LIMIT is not implemented yet:
-      // https://groups.google.com/forum/#!topic/gcd-discuss/iNs6M1jA2Vw, so
-      // use result count to determine if more results might exist.
-      int numFetch = currentBatch.getEntityResultCount();
-      if (source.query.hasLimit()) {
-        verify(userLimit >= numFetch,
-            "Expected userLimit %s >= numFetch %s, because query limit %s should be <= userLimit",
-            userLimit, numFetch, query.getLimit());
-        userLimit -= numFetch;
-      }
-
-      // All indications from the API are that there are/may be more results.
-      boolean apiIndicatesMore =
-          (numFetch == QUERY_BATCH_LIMIT) || (currentBatch.getMoreResults() == NOT_FINISHED);
-      // User-limit does not exist (so userLimit == MAX_VALUE) and/or has not been satisfied.
-      moreResults = (userLimit > 0) && apiIndicatesMore;
-
-      // May receive a batch of 0 results if the number of records is a multiple
-      // of the request limit.
-      if (numFetch == 0) {
-        return null;
-      }
-
-      return currentBatch.getEntityResultList().iterator();
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/FileBasedSink.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/FileBasedSink.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/FileBasedSink.java
deleted file mode 100644
index dda500c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/FileBasedSink.java
+++ /dev/null
@@ -1,864 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import com.google.api.client.googleapis.batch.BatchRequest;
-import com.google.api.client.googleapis.batch.json.JsonBatchCallback;
-import com.google.api.client.googleapis.json.GoogleJsonError;
-import com.google.api.client.http.HttpHeaders;
-import com.google.api.client.http.HttpRequestInitializer;
-import com.google.api.services.storage.Storage;
-import com.google.api.services.storage.StorageRequest;
-import com.google.api.services.storage.model.StorageObject;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.SerializableCoder;
-import com.google.cloud.dataflow.sdk.options.GcsOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.windowing.DefaultTrigger;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.util.FileIOChannelFactory;
-import com.google.cloud.dataflow.sdk.util.GcsIOChannelFactory;
-import com.google.cloud.dataflow.sdk.util.IOChannelFactory;
-import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
-import com.google.cloud.dataflow.sdk.util.MimeTypes;
-import com.google.cloud.dataflow.sdk.util.Transport;
-import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.hadoop.util.ApiErrorExtractor;
-import com.google.common.base.Preconditions;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.nio.channels.WritableByteChannel;
-import java.nio.file.Files;
-import java.nio.file.NoSuchFileException;
-import java.nio.file.Paths;
-import java.nio.file.StandardCopyOption;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.LinkedList;
-import java.util.List;
-
-import javax.annotation.concurrent.NotThreadSafe;
-
-/**
- * Abstract {@link Sink} for file-based output. An implementation of FileBasedSink writes file-based
- * output and defines the format of output files (how values are written, headers/footers, MIME
- * type, etc.).
- *
- * <p>At pipeline construction time, the methods of FileBasedSink are called to validate the sink
- * and to create a {@link Sink.WriteOperation} that manages the process of writing to the sink.
- *
- * <p>The process of writing to file-based sink is as follows:
- * <ol>
- * <li>An optional subclass-defined initialization,
- * <li>a parallel write of bundles to temporary files, and finally,
- * <li>these temporary files are renamed with final output filenames.
- * </ol>
- *
- * <p>Supported file systems are those registered with {@link IOChannelUtils}.
- *
- * @param <T> the type of values written to the sink.
- */
-public abstract class FileBasedSink<T> extends Sink<T> {
- /**
- * Base filename for final output files.
- */
- protected final String baseOutputFilename;
-
- /**
- * The extension to be used for the final output files.
- */
- protected final String extension;
-
- /**
- * Naming template for output files. See {@link ShardNameTemplate} for a description of
- * possible naming templates. Default is {@link ShardNameTemplate#INDEX_OF_MAX}.
- */
- protected final String fileNamingTemplate;
-
- /**
- * Construct a FileBasedSink with the given base output filename and extension.
- */
- public FileBasedSink(String baseOutputFilename, String extension) {
- this(baseOutputFilename, extension, ShardNameTemplate.INDEX_OF_MAX);
- }
-
- /**
- * Construct a FileBasedSink with the given base output filename, extension, and file naming
- * template.
- *
- * <p>See {@link ShardNameTemplate} for a description of file naming templates.
- */
- public FileBasedSink(String baseOutputFilename, String extension, String fileNamingTemplate) {
- this.baseOutputFilename = baseOutputFilename;
- this.extension = extension;
- this.fileNamingTemplate = fileNamingTemplate;
- }
-
- /**
- * Returns the base output filename for this file based sink.
- */
- public String getBaseOutputFilename() {
- return baseOutputFilename;
- }
-
- /**
- * Perform pipeline-construction-time validation. The default implementation is a no-op.
- * Subclasses should override to ensure the sink is valid and can be written to. It is recommended
- * to use {@link Preconditions} in the implementation of this method.
- */
- @Override
- public void validate(PipelineOptions options) {}
-
- /**
- * Return a subclass of {@link FileBasedSink.FileBasedWriteOperation} that will manage the write
- * to the sink.
- */
- @Override
- public abstract FileBasedWriteOperation<T> createWriteOperation(PipelineOptions options);
-
- /**
- * Abstract {@link Sink.WriteOperation} that manages the process of writing to a
- * {@link FileBasedSink}.
- *
- * <p>The primary responsibilities of the FileBasedWriteOperation is the management of output
- * files. During a write, {@link FileBasedSink.FileBasedWriter}s write bundles to temporary file
- * locations. After the bundles have been written,
- * <ol>
- * <li>{@link FileBasedSink.FileBasedWriteOperation#finalize} is given a list of the temporary
- * files containing the output bundles.
- * <li>During finalize, these temporary files are copied to final output locations and named
- * according to a file naming template.
- * <li>Finally, any temporary files that were created during the write are removed.
- * </ol>
- *
- * <p>Subclass implementations of FileBasedWriteOperation must implement
- * {@link FileBasedSink.FileBasedWriteOperation#createWriter} to return a concrete
- * FileBasedSinkWriter.
- *
- * <h2>Temporary and Output File Naming:</h2> During the write, bundles are written to temporary
- * files using the baseTemporaryFilename that can be provided via the constructor of
- * FileBasedWriteOperation. These temporary files will be named
- * {@code {baseTemporaryFilename}-temp-{bundleId}}, where bundleId is the unique id of the bundle.
- * For example, if baseTemporaryFilename is "gs://my-bucket/my_temp_output", the output for a
- * bundle with bundle id 15723 will be "gs://my-bucket/my_temp_output-temp-15723".
- *
- * <p>Final output files are written to baseOutputFilename with the format
- * {@code {baseOutputFilename}-0000i-of-0000n.{extension}} where n is the total number of bundles
- * written and extension is the file extension. Both baseOutputFilename and extension are required
- * constructor arguments.
- *
- * <p>Subclass implementations can change the file naming template by supplying a value for
- * {@link FileBasedSink#fileNamingTemplate}.
- *
- * <h2>Temporary Bundle File Handling:</h2>
- * <p>{@link FileBasedSink.FileBasedWriteOperation#temporaryFileRetention} controls the behavior
- * for managing temporary files. By default, temporary files will be removed. Subclasses can
- * provide a different value to the constructor.
- *
- * <p>Note that in the case of permanent failure of a bundle's write, no clean up of temporary
- * files will occur.
- *
- * <p>If there are no elements in the PCollection being written, no output will be generated.
- *
- * @param <T> the type of values written to the sink.
- */
- public abstract static class FileBasedWriteOperation<T> extends WriteOperation<T, FileResult> {
- private static final Logger LOG = LoggerFactory.getLogger(FileBasedWriteOperation.class);
-
- /**
- * Options for handling of temporary output files.
- */
- public enum TemporaryFileRetention {
- KEEP,
- REMOVE;
- }
-
- /**
- * The Sink that this WriteOperation will write to.
- */
- protected final FileBasedSink<T> sink;
-
- /**
- * Option to keep or remove temporary output files.
- */
- protected final TemporaryFileRetention temporaryFileRetention;
-
- /**
- * Base filename used for temporary output files. Default is the baseOutputFilename.
- */
- protected final String baseTemporaryFilename;
-
- /**
- * Name separator for temporary files. Temporary files will be named
- * {@code {baseTemporaryFilename}-temp-{bundleId}}.
- */
- protected static final String TEMPORARY_FILENAME_SEPARATOR = "-temp-";
-
- /**
- * Build a temporary filename using the temporary filename separator with the given prefix and
- * suffix.
- */
- protected static final String buildTemporaryFilename(String prefix, String suffix) {
- return prefix + FileBasedWriteOperation.TEMPORARY_FILENAME_SEPARATOR + suffix;
- }
-
- /**
- * Construct a FileBasedWriteOperation using the same base filename for both temporary and
- * output files.
- *
- * @param sink the FileBasedSink that will be used to configure this write operation.
- */
- public FileBasedWriteOperation(FileBasedSink<T> sink) {
- this(sink, sink.baseOutputFilename);
- }
-
- /**
- * Construct a FileBasedWriteOperation.
- *
- * @param sink the FileBasedSink that will be used to configure this write operation.
- * @param baseTemporaryFilename the base filename to be used for temporary output files.
- */
- public FileBasedWriteOperation(FileBasedSink<T> sink, String baseTemporaryFilename) {
- this(sink, baseTemporaryFilename, TemporaryFileRetention.REMOVE);
- }
-
- /**
- * Create a new FileBasedWriteOperation.
- *
- * @param sink the FileBasedSink that will be used to configure this write operation.
- * @param baseTemporaryFilename the base filename to be used for temporary output files.
- * @param temporaryFileRetention defines how temporary files are handled.
- */
- public FileBasedWriteOperation(FileBasedSink<T> sink, String baseTemporaryFilename,
- TemporaryFileRetention temporaryFileRetention) {
- this.sink = sink;
- this.baseTemporaryFilename = baseTemporaryFilename;
- this.temporaryFileRetention = temporaryFileRetention;
- }
-
- /**
- * Clients must implement to return a subclass of {@link FileBasedSink.FileBasedWriter}. This
- * method must satisfy the restrictions placed on implementations of
- * {@link Sink.WriteOperation#createWriter}. Namely, it must not mutate the state of the object.
- */
- @Override
- public abstract FileBasedWriter<T> createWriter(PipelineOptions options) throws Exception;
-
- /**
- * Initialization of the sink. Default implementation is a no-op. May be overridden by subclass
- * implementations to perform initialization of the sink at pipeline runtime. This method must
- * be idempotent and is subject to the same implementation restrictions as
- * {@link Sink.WriteOperation#initialize}.
- */
- @Override
- public void initialize(PipelineOptions options) throws Exception {}
-
- /**
- * Finalizes writing by copying temporary output files to their final location and optionally
- * removing temporary files.
- *
- * <p>Finalization may be overridden by subclass implementations to perform customized
- * finalization (e.g., initiating some operation on output bundles, merging them, etc.).
- * {@code writerResults} contains the filenames of written bundles.
- *
- * <p>If subclasses override this method, they must guarantee that its implementation is
- * idempotent, as it may be executed multiple times in the case of failure or for redundancy. It
- * is a best practice to attempt to try to make this method atomic.
- *
- * @param writerResults the results of writes (FileResult).
- */
- @Override
- public void finalize(Iterable<FileResult> writerResults, PipelineOptions options)
- throws Exception {
- // Collect names of temporary files and rename them.
- List<String> files = new ArrayList<>();
- for (FileResult result : writerResults) {
- LOG.debug("Temporary bundle output file {} will be copied.", result.getFilename());
- files.add(result.getFilename());
- }
- copyToOutputFiles(files, options);
-
- // Optionally remove temporary files.
- if (temporaryFileRetention == TemporaryFileRetention.REMOVE) {
- removeTemporaryFiles(options);
- }
- }
-
- /**
- * Copy temporary files to final output filenames using the file naming template.
- *
- * <p>Can be called from subclasses that override {@link FileBasedWriteOperation#finalize}.
- *
- * <p>Files will be named according to the file naming template. The order of the output files
- * will be the same as the sorted order of the input filenames. In other words, if the input
- * filenames are ["C", "A", "B"], baseOutputFilename is "file", the extension is ".txt", and
- * the fileNamingTemplate is "-SSS-of-NNN", the contents of A will be copied to
- * file-000-of-003.txt, the contents of B will be copied to file-001-of-003.txt, etc.
- *
- * @param filenames the filenames of temporary files.
- * @return a list containing the names of final output files.
- */
- protected final List<String> copyToOutputFiles(List<String> filenames, PipelineOptions options)
- throws IOException {
- int numFiles = filenames.size();
- List<String> srcFilenames = new ArrayList<>();
- List<String> destFilenames = generateDestinationFilenames(numFiles);
-
- // Sort files for copying.
- srcFilenames.addAll(filenames);
- Collections.sort(srcFilenames);
-
- if (numFiles > 0) {
- LOG.debug("Copying {} files.", numFiles);
- FileOperations fileOperations =
- FileOperationsFactory.getFileOperations(destFilenames.get(0), options);
- fileOperations.copy(srcFilenames, destFilenames);
- } else {
- LOG.info("No output files to write.");
- }
-
- return destFilenames;
- }
-
- /**
- * Generate output bundle filenames.
- */
- protected final List<String> generateDestinationFilenames(int numFiles) {
- List<String> destFilenames = new ArrayList<>();
- String extension = getSink().extension;
- String baseOutputFilename = getSink().baseOutputFilename;
- String fileNamingTemplate = getSink().fileNamingTemplate;
-
- String suffix = getFileExtension(extension);
- for (int i = 0; i < numFiles; i++) {
- destFilenames.add(IOChannelUtils.constructName(
- baseOutputFilename, fileNamingTemplate, suffix, i, numFiles));
- }
- return destFilenames;
- }
-
- /**
- * Returns the file extension to be used. If the user did not request a file
- * extension then this method returns the empty string. Otherwise this method
- * adds a {@code "."} to the beginning of the users extension if one is not present.
- */
- private String getFileExtension(String usersExtension) {
- if (usersExtension == null || usersExtension.isEmpty()) {
- return "";
- }
- if (usersExtension.startsWith(".")) {
- return usersExtension;
- }
- return "." + usersExtension;
- }
-
- /**
- * Removes temporary output files. Uses the temporary filename to find files to remove.
- *
- * <p>Can be called from subclasses that override {@link FileBasedWriteOperation#finalize}.
- * <b>Note:</b>If finalize is overridden and does <b>not</b> rename or otherwise finalize
- * temporary files, this method will remove them.
- */
- protected final void removeTemporaryFiles(PipelineOptions options) throws IOException {
- String pattern = buildTemporaryFilename(baseTemporaryFilename, "*");
- LOG.debug("Finding temporary bundle output files matching {}.", pattern);
- FileOperations fileOperations = FileOperationsFactory.getFileOperations(pattern, options);
- IOChannelFactory factory = IOChannelUtils.getFactory(pattern);
- Collection<String> matches = factory.match(pattern);
- LOG.debug("{} temporary files matched {}", matches.size(), pattern);
- LOG.debug("Removing {} files.", matches.size());
- fileOperations.remove(matches);
- }
-
- /**
- * Provides a coder for {@link FileBasedSink.FileResult}.
- */
- @Override
- public Coder<FileResult> getWriterResultCoder() {
- return SerializableCoder.of(FileResult.class);
- }
-
- /**
- * Returns the FileBasedSink for this write operation.
- */
- @Override
- public FileBasedSink<T> getSink() {
- return sink;
- }
- }
-
- /**
- * Abstract {@link Sink.Writer} that writes a bundle to a {@link FileBasedSink}. Subclass
- * implementations provide a method that can write a single value to a {@link WritableByteChannel}
- * ({@link Sink.Writer#write}).
- *
- * <p>Subclass implementations may also override methods that write headers and footers before and
- * after the values in a bundle, respectively, as well as provide a MIME type for the output
- * channel.
- *
- * <p>Multiple FileBasedWriter instances may be created on the same worker, and therefore any
- * access to static members or methods should be thread safe.
- *
- * @param <T> the type of values to write.
- */
- public abstract static class FileBasedWriter<T> extends Writer<T, FileResult> {
- private static final Logger LOG = LoggerFactory.getLogger(FileBasedWriter.class);
-
- final FileBasedWriteOperation<T> writeOperation;
-
- /**
- * Unique id for this output bundle.
- */
- private String id;
-
- /**
- * The filename of the output bundle. Equal to the
- * {@link FileBasedSink.FileBasedWriteOperation#TEMPORARY_FILENAME_SEPARATOR} and id appended to
- * the baseName.
- */
- private String filename;
-
- /**
- * The channel to write to.
- */
- private WritableByteChannel channel;
-
- /**
- * The MIME type used in the creation of the output channel (if the file system supports it).
- *
- * <p>GCS, for example, supports writing files with Content-Type metadata.
- *
- * <p>May be overridden. Default is {@link MimeTypes#TEXT}. See {@link MimeTypes} for other
- * options.
- */
- protected String mimeType = MimeTypes.TEXT;
-
- /**
- * Construct a new FileBasedWriter with a base filename.
- */
- public FileBasedWriter(FileBasedWriteOperation<T> writeOperation) {
- Preconditions.checkNotNull(writeOperation);
- this.writeOperation = writeOperation;
- }
-
- /**
- * Called with the channel that a subclass will write its header, footer, and values to.
- * Subclasses should either keep a reference to the channel provided or create and keep a
- * reference to an appropriate object that they will use to write to it.
- *
- * <p>Called before any subsequent calls to writeHeader, writeFooter, and write.
- */
- protected abstract void prepareWrite(WritableByteChannel channel) throws Exception;
-
- /**
- * Writes header at the beginning of output files. Nothing by default; subclasses may override.
- */
- protected void writeHeader() throws Exception {}
-
- /**
- * Writes footer at the end of output files. Nothing by default; subclasses may override.
- */
- protected void writeFooter() throws Exception {}
-
- /**
- * Opens the channel.
- */
- @Override
- public final void open(String uId) throws Exception {
- this.id = uId;
- filename = FileBasedWriteOperation.buildTemporaryFilename(
- getWriteOperation().baseTemporaryFilename, uId);
- LOG.debug("Opening {}.", filename);
- channel = IOChannelUtils.create(filename, mimeType);
- try {
- prepareWrite(channel);
- LOG.debug("Writing header to {}.", filename);
- writeHeader();
- } catch (Exception e) {
- // The caller shouldn't have to close() this Writer if it fails to open(), so close the
- // channel if prepareWrite() or writeHeader() fails.
- try {
- LOG.error("Writing header to {} failed, closing channel.", filename);
- channel.close();
- } catch (IOException closeException) {
- // Log exception and mask it.
- LOG.error("Closing channel for {} failed: {}", filename, closeException.getMessage());
- }
- // Throw the exception that caused the write to fail.
- throw e;
- }
- LOG.debug("Starting write of bundle {} to {}.", this.id, filename);
- }
-
- /**
- * Closes the channel and return the bundle result.
- */
- @Override
- public final FileResult close() throws Exception {
- try (WritableByteChannel theChannel = channel) {
- LOG.debug("Writing footer to {}.", filename);
- writeFooter();
- }
- FileResult result = new FileResult(filename);
- LOG.debug("Result for bundle {}: {}", this.id, filename);
- return result;
- }
-
- /**
- * Return the FileBasedWriteOperation that this Writer belongs to.
- */
- @Override
- public FileBasedWriteOperation<T> getWriteOperation() {
- return writeOperation;
- }
- }
-
- /**
- * Result of a single bundle write. Contains the filename of the bundle.
- */
- public static final class FileResult implements Serializable {
- private final String filename;
-
- public FileResult(String filename) {
- this.filename = filename;
- }
-
- public String getFilename() {
- return filename;
- }
- }
-
- // File system operations
- // Warning: These class are purposefully private and will be replaced by more robust file I/O
- // utilities. Not for use outside FileBasedSink.
-
- /**
- * Factory for FileOperations.
- */
- private static class FileOperationsFactory {
- /**
- * Return a FileOperations implementation based on which IOChannel would be used to write to a
- * location specification (not necessarily a filename, as it may contain wildcards).
- *
- * <p>Only supports File and GCS locations (currently, the only factories registered with
- * IOChannelUtils). For other locations, an exception is thrown.
- */
- public static FileOperations getFileOperations(String spec, PipelineOptions options)
- throws IOException {
- IOChannelFactory factory = IOChannelUtils.getFactory(spec);
- if (factory instanceof GcsIOChannelFactory) {
- return new GcsOperations(options);
- } else if (factory instanceof FileIOChannelFactory) {
- return new LocalFileOperations();
- } else {
- throw new IOException("Unrecognized file system.");
- }
- }
- }
-
- /**
- * Copy and Remove operations for files. Operations behave like remove-if-existing and
- * copy-if-existing and do not throw exceptions on file not found to enable retries of these
- * operations in the case of transient error.
- */
- private static interface FileOperations {
- /**
- * Copy a collection of files from one location to another.
- *
- * <p>The number of source filenames must equal the number of destination filenames.
- *
- * @param srcFilenames the source filenames.
- * @param destFilenames the destination filenames.
- */
- public void copy(List<String> srcFilenames, List<String> destFilenames) throws IOException;
-
- /**
- * Remove a collection of files.
- */
- public void remove(Collection<String> filenames) throws IOException;
- }
-
- /**
- * GCS file system operations.
- */
- private static class GcsOperations implements FileOperations {
- private static final Logger LOG = LoggerFactory.getLogger(GcsOperations.class);
-
- /**
- * Maximum number of requests permitted in a GCS batch request.
- */
- private static final int MAX_REQUESTS_PER_BATCH = 1000;
-
- private ApiErrorExtractor errorExtractor = new ApiErrorExtractor();
- private GcsOptions gcsOptions;
- private Storage gcs;
- private BatchHelper batchHelper;
-
- public GcsOperations(PipelineOptions options) {
- gcsOptions = options.as(GcsOptions.class);
- gcs = Transport.newStorageClient(gcsOptions).build();
- batchHelper =
- new BatchHelper(gcs.getRequestFactory().getInitializer(), gcs, MAX_REQUESTS_PER_BATCH);
- }
-
- @Override
- public void copy(List<String> srcFilenames, List<String> destFilenames) throws IOException {
- Preconditions.checkArgument(
- srcFilenames.size() == destFilenames.size(),
- String.format("Number of source files {} must equal number of destination files {}",
- srcFilenames.size(), destFilenames.size()));
- for (int i = 0; i < srcFilenames.size(); i++) {
- final GcsPath sourcePath = GcsPath.fromUri(srcFilenames.get(i));
- final GcsPath destPath = GcsPath.fromUri(destFilenames.get(i));
- LOG.debug("Copying {} to {}", sourcePath, destPath);
- Storage.Objects.Copy copyObject = gcs.objects().copy(sourcePath.getBucket(),
- sourcePath.getObject(), destPath.getBucket(), destPath.getObject(), null);
- batchHelper.queue(copyObject, new JsonBatchCallback<StorageObject>() {
- @Override
- public void onSuccess(StorageObject obj, HttpHeaders responseHeaders) {
- LOG.debug("Successfully copied {} to {}", sourcePath, destPath);
- }
-
- @Override
- public void onFailure(GoogleJsonError e, HttpHeaders responseHeaders) throws IOException {
- // Do nothing on item not found.
- if (!errorExtractor.itemNotFound(e)) {
- throw new IOException(e.toString());
- }
- LOG.debug("{} does not exist.", sourcePath);
- }
- });
- }
- batchHelper.flush();
- }
-
- @Override
- public void remove(Collection<String> filenames) throws IOException {
- for (String filename : filenames) {
- final GcsPath path = GcsPath.fromUri(filename);
- LOG.debug("Removing: " + path);
- Storage.Objects.Delete deleteObject =
- gcs.objects().delete(path.getBucket(), path.getObject());
- batchHelper.queue(deleteObject, new JsonBatchCallback<Void>() {
- @Override
- public void onSuccess(Void obj, HttpHeaders responseHeaders) throws IOException {
- LOG.debug("Successfully removed {}", path);
- }
-
- @Override
- public void onFailure(GoogleJsonError e, HttpHeaders responseHeaders) throws IOException {
- // Do nothing on item not found.
- if (!errorExtractor.itemNotFound(e)) {
- throw new IOException(e.toString());
- }
- LOG.debug("{} does not exist.", path);
- }
- });
- }
- batchHelper.flush();
- }
- }
-
- /**
- * File systems supported by {@link Files}.
- */
- private static class LocalFileOperations implements FileOperations {
- private static final Logger LOG = LoggerFactory.getLogger(LocalFileOperations.class);
-
- @Override
- public void copy(List<String> srcFilenames, List<String> destFilenames) throws IOException {
- Preconditions.checkArgument(
- srcFilenames.size() == destFilenames.size(),
- String.format("Number of source files {} must equal number of destination files {}",
- srcFilenames.size(), destFilenames.size()));
- int numFiles = srcFilenames.size();
- for (int i = 0; i < numFiles; i++) {
- String src = srcFilenames.get(i);
- String dst = destFilenames.get(i);
- LOG.debug("Copying {} to {}", src, dst);
- copyOne(src, dst);
- }
- }
-
- private void copyOne(String source, String destination) throws IOException {
- try {
- // Copy the source file, replacing the existing destination.
- Files.copy(Paths.get(source), Paths.get(destination), StandardCopyOption.REPLACE_EXISTING);
- } catch (NoSuchFileException e) {
- LOG.debug("{} does not exist.", source);
- // Suppress exception if file does not exist.
- }
- }
-
- @Override
- public void remove(Collection<String> filenames) throws IOException {
- for (String filename : filenames) {
- LOG.debug("Removing file {}", filename);
- removeOne(filename);
- }
- }
-
- private void removeOne(String filename) throws IOException {
- // Delete the file if it exists.
- boolean exists = Files.deleteIfExists(Paths.get(filename));
- if (!exists) {
- LOG.debug("{} does not exist.", filename);
- }
- }
- }
-
- /**
- * BatchHelper abstracts out the logic for the maximum requests per batch for GCS.
- *
- * <p>Copy of
- * https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/gcs/src/main/java/com/google/cloud/hadoop/gcsio/BatchHelper.java
- *
- * <p>Copied to prevent Dataflow from depending on the Hadoop-related dependencies that are not
- * used in Dataflow. Hadoop-related dependencies will be removed from the Google Cloud Storage
- * Connector (https://cloud.google.com/hadoop/google-cloud-storage-connector) so that this project
- * and others may use the connector without introducing unnecessary dependencies.
- *
- * <p>This class is not thread-safe; create a new BatchHelper instance per single-threaded logical
- * grouping of requests.
- */
- @NotThreadSafe
- private static class BatchHelper {
- /**
- * Callback that causes a single StorageRequest to be added to the BatchRequest.
- */
- protected static interface QueueRequestCallback {
- void enqueue() throws IOException;
- }
-
- private final List<QueueRequestCallback> pendingBatchEntries;
- private final BatchRequest batch;
-
- // Number of requests that can be queued into a single actual HTTP request
- // before a sub-batch is sent.
- private final long maxRequestsPerBatch;
-
- // Flag that indicates whether there is an in-progress flush.
- private boolean flushing = false;
-
- /**
- * Primary constructor, generally accessed only via the inner Factory class.
- */
- public BatchHelper(
- HttpRequestInitializer requestInitializer, Storage gcs, long maxRequestsPerBatch) {
- this.pendingBatchEntries = new LinkedList<>();
- this.batch = gcs.batch(requestInitializer);
- this.maxRequestsPerBatch = maxRequestsPerBatch;
- }
-
- /**
- * Adds an additional request to the batch, and possibly flushes the current contents of the
- * batch if {@code maxRequestsPerBatch} has been reached.
- */
- public <T> void queue(final StorageRequest<T> req, final JsonBatchCallback<T> callback)
- throws IOException {
- QueueRequestCallback queueCallback = new QueueRequestCallback() {
- @Override
- public void enqueue() throws IOException {
- req.queue(batch, callback);
- }
- };
- pendingBatchEntries.add(queueCallback);
-
- flushIfPossibleAndRequired();
- }
-
- // Flush our buffer if we have more pending entries than maxRequestsPerBatch
- private void flushIfPossibleAndRequired() throws IOException {
- if (pendingBatchEntries.size() > maxRequestsPerBatch) {
- flushIfPossible();
- }
- }
-
- // Flush our buffer if we are not already in a flush operation and we have data to flush.
- private void flushIfPossible() throws IOException {
- if (!flushing && pendingBatchEntries.size() > 0) {
- flushing = true;
- try {
- while (batch.size() < maxRequestsPerBatch && pendingBatchEntries.size() > 0) {
- QueueRequestCallback head = pendingBatchEntries.remove(0);
- head.enqueue();
- }
-
- batch.execute();
- } finally {
- flushing = false;
- }
- }
- }
-
-
- /**
- * Sends any currently remaining requests in the batch; should be called at the end of any
- * series of batched requests to ensure everything has been sent.
- */
- public void flush() throws IOException {
- flushIfPossible();
- }
- }
-
- static class ReshardForWrite<T> extends PTransform<PCollection<T>, PCollection<T>> {
- @Override
- public PCollection<T> apply(PCollection<T> input) {
- return input
- // TODO: This would need to be adapted to write per-window shards.
- .apply(Window.<T>into(new GlobalWindows())
- .triggering(DefaultTrigger.of())
- .discardingFiredPanes())
- .apply("RandomKey", ParDo.of(
- new DoFn<T, KV<Long, T>>() {
- transient long counter, step;
- @Override
- public void startBundle(Context c) {
- counter = (long) (Math.random() * Long.MAX_VALUE);
- step = 1 + 2 * (long) (Math.random() * Long.MAX_VALUE);
- }
- @Override
- public void processElement(ProcessContext c) {
- counter += step;
- c.output(KV.of(counter, c.element()));
- }
- }))
- .apply(GroupByKey.<Long, T>create())
- .apply("Ungroup", ParDo.of(
- new DoFn<KV<Long, Iterable<T>>, T>() {
- @Override
- public void processElement(ProcessContext c) {
- for (T item : c.element().getValue()) {
- c.output(item);
- }
- }
- }));
- }
- }
-}
[59/67] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/DataflowExampleUtils.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/DataflowExampleUtils.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/DataflowExampleUtils.java
new file mode 100644
index 0000000..4dfdd85
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/DataflowExampleUtils.java
@@ -0,0 +1,485 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.common;
+
+import com.google.api.client.googleapis.json.GoogleJsonResponseException;
+import com.google.api.client.googleapis.services.AbstractGoogleClientRequest;
+import com.google.api.client.util.BackOff;
+import com.google.api.client.util.BackOffUtils;
+import com.google.api.client.util.Sleeper;
+import com.google.api.services.bigquery.Bigquery;
+import com.google.api.services.bigquery.Bigquery.Datasets;
+import com.google.api.services.bigquery.Bigquery.Tables;
+import com.google.api.services.bigquery.model.Dataset;
+import com.google.api.services.bigquery.model.DatasetReference;
+import com.google.api.services.bigquery.model.Table;
+import com.google.api.services.bigquery.model.TableReference;
+import com.google.api.services.bigquery.model.TableSchema;
+import com.google.api.services.dataflow.Dataflow;
+import com.google.api.services.pubsub.Pubsub;
+import com.google.api.services.pubsub.model.Subscription;
+import com.google.api.services.pubsub.model.Topic;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.PipelineResult;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.BigQueryOptions;
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.runners.DataflowPipelineJob;
+import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
+import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
+import com.google.cloud.dataflow.sdk.transforms.IntraBundleParallelization;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.util.AttemptBoundedExponentialBackOff;
+import com.google.cloud.dataflow.sdk.util.MonitoringUtil;
+import com.google.cloud.dataflow.sdk.util.Transport;
+import com.google.cloud.dataflow.sdk.values.PBegin;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.common.base.Strings;
+import com.google.common.base.Throwables;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import javax.servlet.http.HttpServletResponse;
+
+/**
+ * The utility class that sets up and tears down external resources, starts the Google Cloud Pub/Sub
+ * injector, and cancels the streaming and the injector pipelines once the program terminates.
+ *
+ * <p>It is used to run Dataflow examples, such as TrafficMaxLaneFlow and TrafficRoutes.
+ */
+public class DataflowExampleUtils {
+
+ private final DataflowPipelineOptions options;
+ // Service clients are created on first use by the helpers that need them.
+ private Bigquery bigQueryClient = null;
+ private Pubsub pubsubClient = null;
+ private Dataflow dataflowClient = null;
+ // Jobs to cancel from the shutdown hook; only the contents change, never the reference.
+ private final Set<DataflowPipelineJob> jobsToCancel = Sets.newHashSet();
+ // Status messages collected during setup and tear down, printed together at the end.
+ private final List<String> pendingMessages = Lists.newArrayList();
+
+ /**
+ * Creates the utility without touching any external resource.
+ *
+ * @param options the pipeline options used by every helper of this class
+ */
+ public DataflowExampleUtils(DataflowPipelineOptions options) {
+ this.options = options;
+ }
+
+ /**
+ * Do resources and runner options setup.
+ *
+ * @param options the pipeline options used by every helper of this class
+ * @param isUnbounded whether the example pipeline is unbounded (streaming)
+ * @throws IOException if setting up the external resources fails
+ */
+ public DataflowExampleUtils(DataflowPipelineOptions options, boolean isUnbounded)
+ throws IOException {
+ this.options = options;
+ setupResourcesAndRunner(isUnbounded);
+ }
+
+ /**
+ * Sets up external resources that are required by the example,
+ * such as Pub/Sub topics and BigQuery tables.
+ *
+ * <p>A {@link GoogleJsonResponseException} triggers another attempt for as long as the backoff
+ * policy allows; once it is exhausted the last such failure is rethrown through
+ * {@link Throwables#propagate}. If the thread is interrupted while waiting between attempts,
+ * retrying stops and the interrupt status is restored before the last failure is rethrown.
+ *
+ * @throws IOException if there is a problem setting up the resources
+ */
+ public void setup() throws IOException {
+ Sleeper sleeper = Sleeper.DEFAULT;
+ BackOff backOff = new AttemptBoundedExponentialBackOff(3, 200);
+ Throwable lastException = null;
+ try {
+ do {
+ try {
+ setupPubsub();
+ setupBigQueryTable();
+ return;
+ } catch (GoogleJsonResponseException e) {
+ // Remember the failure; it is rethrown below if no later attempt succeeds.
+ lastException = e;
+ }
+ } while (BackOffUtils.next(sleeper, backOff));
+ } catch (InterruptedException e) {
+ // Stop retrying, but keep the interrupt visible to the caller instead of swallowing it.
+ Thread.currentThread().interrupt();
+ }
+ Throwables.propagate(lastException);
+ }
+
+ /**
+ * Set up external resources, and configure the runner appropriately.
+ *
+ * <p>When {@code isUnbounded} is true the streaming option is switched on before
+ * {@link #setup()} and {@link #setupRunner()} run.
+ *
+ * @param isUnbounded whether the example pipeline is unbounded (streaming)
+ * @throws IOException if {@link #setup()} fails
+ */
+ public void setupResourcesAndRunner(boolean isUnbounded) throws IOException {
+ if (isUnbounded) {
+ options.setStreaming(true);
+ }
+ setup();
+ setupRunner();
+ }
+
+ /**
+ * Sets up the Google Cloud Pub/Sub topic and, when one is configured, its subscription.
+ *
+ * <p>If the topic doesn't exist, a new topic with the given name will be created. Nothing is
+ * done when the configured topic is null or empty; the subscription is only handled when a
+ * topic is configured.
+ *
+ * @throws IOException if there is a problem setting up the Pub/Sub topic or subscription
+ */
+ public void setupPubsub() throws IOException {
+ ExamplePubsubTopicAndSubscriptionOptions pubsubOptions =
+ options.as(ExamplePubsubTopicAndSubscriptionOptions.class);
+ String topic = pubsubOptions.getPubsubTopic();
+ // A null topic is treated like an empty one, matching startInjectorIfNeeded.
+ if (Strings.isNullOrEmpty(topic)) {
+ return;
+ }
+ pendingMessages.add("**********************Set Up Pubsub************************");
+ setupPubsubTopic(topic);
+ pendingMessages.add("The Pub/Sub topic has been set up for this example: " + topic);
+
+ String subscription = pubsubOptions.getPubsubSubscription();
+ if (!Strings.isNullOrEmpty(subscription)) {
+ setupPubsubSubscription(topic, subscription);
+ pendingMessages.add(
+ "The Pub/Sub subscription has been set up for this example: " + subscription);
+ }
+ }
+
+ /**
+ * Sets up the BigQuery table with the given schema.
+ *
+ * <p>If the table already exists, the schema has to match the given one. Otherwise, the example
+ * will throw a RuntimeException. If the table doesn't exist, a new table with the given schema
+ * will be created.
+ *
+ * @throws IOException if there is a problem setting up the BigQuery table
+ */
+ public void setupBigQueryTable() throws IOException {
+ ExampleBigQueryTableOptions bigQueryTableOptions =
+ options.as(ExampleBigQueryTableOptions.class);
+ if (bigQueryTableOptions.getBigQueryDataset() != null
+ && bigQueryTableOptions.getBigQueryTable() != null
+ && bigQueryTableOptions.getBigQuerySchema() != null) {
+ pendingMessages.add("******************Set Up Big Query Table*******************");
+ setupBigQueryTable(bigQueryTableOptions.getProject(),
+ bigQueryTableOptions.getBigQueryDataset(),
+ bigQueryTableOptions.getBigQueryTable(),
+ bigQueryTableOptions.getBigQuerySchema());
+ pendingMessages.add("The BigQuery table has been set up for this example: "
+ + bigQueryTableOptions.getProject()
+ + ":" + bigQueryTableOptions.getBigQueryDataset()
+ + "." + bigQueryTableOptions.getBigQueryTable());
+ }
+ }
+
+ /**
+ * Tears down external resources that can be deleted upon the example's completion.
+ *
+ * <p>The Pub/Sub topic and subscription are deleted on a best-effort basis: a failure is
+ * recorded in the pending messages instead of being thrown. The BigQuery table is never
+ * deleted here; a reminder to remove it manually is recorded instead.
+ */
+ private void tearDown() {
+ pendingMessages.add("*************************Tear Down*************************");
+ ExamplePubsubTopicAndSubscriptionOptions pubsubOptions =
+ options.as(ExamplePubsubTopicAndSubscriptionOptions.class);
+ String topic = pubsubOptions.getPubsubTopic();
+ // Null-safe on purpose: this runs first in the shutdown hook, and an exception thrown
+ // here would prevent the hook from cancelling the jobs.
+ if (!Strings.isNullOrEmpty(topic)) {
+ try {
+ deletePubsubTopic(topic);
+ pendingMessages.add("The Pub/Sub topic has been deleted: " + topic);
+ } catch (IOException e) {
+ pendingMessages.add("Failed to delete the Pub/Sub topic : " + topic);
+ }
+ String subscription = pubsubOptions.getPubsubSubscription();
+ if (!Strings.isNullOrEmpty(subscription)) {
+ try {
+ deletePubsubSubscription(subscription);
+ pendingMessages.add("The Pub/Sub subscription has been deleted: " + subscription);
+ } catch (IOException e) {
+ pendingMessages.add("Failed to delete the Pub/Sub subscription : " + subscription);
+ }
+ }
+ }
+
+ ExampleBigQueryTableOptions bigQueryTableOptions =
+ options.as(ExampleBigQueryTableOptions.class);
+ if (bigQueryTableOptions.getBigQueryDataset() != null
+ && bigQueryTableOptions.getBigQueryTable() != null
+ && bigQueryTableOptions.getBigQuerySchema() != null) {
+ pendingMessages.add("The BigQuery table might contain the example's output, "
+ + "and it is not deleted automatically: "
+ + bigQueryTableOptions.getProject()
+ + ":" + bigQueryTableOptions.getBigQueryDataset()
+ + "." + bigQueryTableOptions.getBigQueryTable());
+ pendingMessages.add("Please go to the Developers Console to delete it manually."
+ + " Otherwise, you may be charged for its usage.");
+ }
+ }
+
+ /**
+ * Ensures the dataset and the table exist, creating whichever is missing.
+ *
+ * @throws IOException if a BigQuery request fails
+ * @throws RuntimeException if the table exists with a schema different from {@code schema}
+ */
+ private void setupBigQueryTable(String projectId, String datasetId, String tableId,
+ TableSchema schema) throws IOException {
+ if (bigQueryClient == null) {
+ bigQueryClient = Transport.newBigQueryClient(options.as(BigQueryOptions.class)).build();
+ }
+
+ // Step 1: the dataset.
+ Datasets datasets = bigQueryClient.datasets();
+ Dataset existingDataset = executeNullIfNotFound(datasets.get(projectId, datasetId));
+ if (existingDataset == null) {
+ DatasetReference datasetRef =
+ new DatasetReference().setProjectId(projectId).setDatasetId(datasetId);
+ datasets.insert(projectId, new Dataset().setDatasetReference(datasetRef)).execute();
+ }
+
+ // Step 2: the table, which must carry the expected schema if it is already there.
+ Tables tables = bigQueryClient.tables();
+ Table existingTable = executeNullIfNotFound(tables.get(projectId, datasetId, tableId));
+ if (existingTable == null) {
+ TableReference tableRef = new TableReference()
+ .setProjectId(projectId)
+ .setDatasetId(datasetId)
+ .setTableId(tableId);
+ Table newTable = new Table().setSchema(schema).setTableReference(tableRef);
+ tables.insert(projectId, datasetId, newTable).execute();
+ return;
+ }
+ if (!existingTable.getSchema().equals(schema)) {
+ throw new RuntimeException(
+ "Table exists and schemas do not match, expecting: " + schema.toPrettyString()
+ + ", actual: " + existingTable.getSchema().toPrettyString());
+ }
+ }
+
+ /**
+ * Returns the Pub/Sub client shared by the setup and delete helpers, building it from the
+ * pipeline options the first time it is needed.
+ */
+ private Pubsub getPubsubClient() {
+ if (pubsubClient == null) {
+ pubsubClient = Transport.newPubsubClient(options).build();
+ }
+ return pubsubClient;
+ }
+
+ /**
+ * Creates the Google Cloud Pub/Sub topic unless a lookup shows that it already exists.
+ *
+ * @param topic the topic name, used for the lookup and as the name of the new topic
+ * @throws IOException if the lookup or the creation request fails
+ */
+ private void setupPubsubTopic(String topic) throws IOException {
+ if (executeNullIfNotFound(getPubsubClient().projects().topics().get(topic)) == null) {
+ getPubsubClient().projects().topics().create(topic, new Topic().setName(topic)).execute();
+ }
+ }
+
+ /**
+ * Creates the Google Cloud Pub/Sub subscription unless a lookup shows that it already exists.
+ *
+ * <p>A new subscription is attached to {@code topic} with an ack deadline of 60 seconds.
+ *
+ * @param topic the topic the new subscription is attached to
+ * @param subscription the subscription name
+ * @throws IOException if the lookup or the creation request fails
+ */
+ private void setupPubsubSubscription(String topic, String subscription) throws IOException {
+ if (executeNullIfNotFound(
+ getPubsubClient().projects().subscriptions().get(subscription)) == null) {
+ Subscription subInfo = new Subscription()
+ .setAckDeadlineSeconds(60)
+ .setTopic(topic);
+ getPubsubClient().projects().subscriptions().create(subscription, subInfo).execute();
+ }
+ }
+
+ /**
+ * Deletes the Google Cloud Pub/Sub topic, if a lookup shows that it exists.
+ *
+ * @throws IOException if there is a problem deleting the Pub/Sub topic
+ */
+ private void deletePubsubTopic(String topic) throws IOException {
+ if (executeNullIfNotFound(getPubsubClient().projects().topics().get(topic)) != null) {
+ getPubsubClient().projects().topics().delete(topic).execute();
+ }
+ }
+
+ /**
+ * Deletes the Google Cloud Pub/Sub subscription, if a lookup shows that it exists.
+ *
+ * @throws IOException if there is a problem deleting the Pub/Sub subscription
+ */
+ private void deletePubsubSubscription(String subscription) throws IOException {
+ if (executeNullIfNotFound(
+ getPubsubClient().projects().subscriptions().get(subscription)) != null) {
+ getPubsubClient().projects().subscriptions().delete(subscription).execute();
+ }
+ }
+
+ /**
+ * Starts an 'injector' pipeline that publishes the contents of {@code inputFile} to the
+ * configured Pub/Sub topic, but only if this is an unbounded (streaming) pipeline and both the
+ * input file and the topic are non-empty.
+ */
+ public void startInjectorIfNeeded(String inputFile) {
+ ExamplePubsubTopicOptions topicOptions = options.as(ExamplePubsubTopicOptions.class);
+ if (!topicOptions.isStreaming()) {
+ return;
+ }
+ if (Strings.isNullOrEmpty(inputFile)) {
+ return;
+ }
+ if (Strings.isNullOrEmpty(topicOptions.getPubsubTopic())) {
+ return;
+ }
+ runInjectorPipeline(inputFile, topicOptions.getPubsubTopic());
+ }
+
+ /**
+ * Does some runner setup: if streaming is enabled and the configured runner is anything other
+ * than the DirectPipelineRunner, the runner is switched to the DataflowPipelineRunner.
+ *
+ * <p>Nothing is returned and nothing is validated; the only effect is the possible change of
+ * the runner option.
+ */
+ public void setupRunner() {
+ if (options.isStreaming() && options.getRunner() != DirectPipelineRunner.class) {
+ // In order to cancel the pipelines automatically,
+ // {@literal DataflowPipelineRunner} is forced to be used.
+ options.setRunner(DataflowPipelineRunner.class);
+ }
+ }
+
+ /**
+ * Runs a batch pipeline to inject data into the PubSubIO input topic.
+ *
+ * <p>The injector pipeline will read from the given text file, and inject data
+ * into the Google Cloud Pub/Sub topic. This delegates to the three-argument overload with a
+ * {@code TextIO} read of the file and a null timestamp label key.
+ *
+ * @param inputFile the text file to read from
+ * @param topic the Pub/Sub topic to publish to
+ */
+ public void runInjectorPipeline(String inputFile, String topic) {
+ runInjectorPipeline(TextIO.Read.from(inputFile), topic, null);
+ }
+
+ /**
+ * Runs a batch pipeline that reads lines from {@code readSource} and publishes them to the
+ * Google Cloud Pub/Sub topic.
+ *
+ * <p>The injector runs on a clone of this utility's options with streaming switched off, the
+ * injector worker count applied and "-injector" appended to the job name. If the run yields a
+ * {@code DataflowPipelineJob}, it is remembered so that it can be cancelled later.
+ *
+ * @param readSource the transform producing the lines to publish
+ * @param topic the Pub/Sub topic to publish to
+ * @param pubsubTimestampTabelKey the timestamp label key to attach, or null/empty for none
+ */
+ public void runInjectorPipeline(PTransform<? super PBegin, PCollection<String>> readSource,
+ String topic,
+ String pubsubTimestampTabelKey) {
+ PubsubFileInjector.Bound injector = Strings.isNullOrEmpty(pubsubTimestampTabelKey)
+ ? PubsubFileInjector.publish(topic)
+ : PubsubFileInjector.withTimestampLabelKey(pubsubTimestampTabelKey).publish(topic);
+
+ // Derive the injector's own options from the example's options.
+ DataflowPipelineOptions injectorOptions = options.cloneAs(DataflowPipelineOptions.class);
+ if (options.getServiceAccountName() != null) {
+ injectorOptions.setServiceAccountName(options.getServiceAccountName());
+ }
+ if (options.getServiceAccountKeyfile() != null) {
+ injectorOptions.setServiceAccountKeyfile(options.getServiceAccountKeyfile());
+ }
+ injectorOptions.setStreaming(false);
+ injectorOptions.setNumWorkers(options.as(DataflowExampleOptions.class).getInjectorNumWorkers());
+ injectorOptions.setJobName(options.getJobName() + "-injector");
+
+ Pipeline injectorPipeline = Pipeline.create(injectorOptions);
+ injectorPipeline
+ .apply(readSource)
+ .apply(IntraBundleParallelization.of(injector).withMaxParallelism(20));
+
+ PipelineResult injectorResult = injectorPipeline.run();
+ if (injectorResult instanceof DataflowPipelineJob) {
+ jobsToCancel.add((DataflowPipelineJob) injectorResult);
+ }
+ }
+
+ /**
+ * Runs the provided pipeline to inject data into the PubSubIO input topic.
+ *
+ * <p>If the run yields a {@code DataflowPipelineJob}, the job is added to the set of jobs to
+ * cancel; any other kind of result is ignored.
+ *
+ * @param injectorPipeline the fully constructed injector pipeline to run
+ */
+ public void runInjectorPipeline(Pipeline injectorPipeline) {
+ PipelineResult result = injectorPipeline.run();
+ if (result instanceof DataflowPipelineJob) {
+ jobsToCancel.add(((DataflowPipelineJob) result));
+ }
+ }
+
+ /**
+ * Start the auxiliary injector pipeline, then wait for this pipeline to finish.
+ *
+ * @param inputFile the file handed to {@link #startInjectorIfNeeded(String)}
+ * @param result the result of the main pipeline, handed to {@link #waitToFinish}
+ */
+ public void mockUnboundedSource(String inputFile, PipelineResult result) {
+ startInjectorIfNeeded(inputFile);
+ waitToFinish(result);
+ }
+
+ /**
+ * If {@literal DataflowPipelineRunner} or {@literal BlockingDataflowPipelineRunner} is used,
+ * waits for the pipeline to finish and cancels it (and the injector) before the program exits.
+ *
+ * <p>For any other kind of result the resources are torn down and the pending messages are
+ * printed immediately.
+ *
+ * @param result the result of the pipeline to wait for
+ * @throws RuntimeException if waiting for the job fails; the original failure is its cause
+ */
+ public void waitToFinish(PipelineResult result) {
+ if (result instanceof DataflowPipelineJob) {
+ final DataflowPipelineJob job = (DataflowPipelineJob) result;
+ jobsToCancel.add(job);
+ if (!options.as(DataflowExampleOptions.class).getKeepJobsRunning()) {
+ addShutdownHook(jobsToCancel);
+ }
+ try {
+ job.waitToFinish(-1, TimeUnit.SECONDS, new MonitoringUtil.PrintHandler(System.out));
+ } catch (Exception e) {
+ if (e instanceof InterruptedException) {
+ // Keep the interrupt visible to code further up the stack.
+ Thread.currentThread().interrupt();
+ }
+ // Chain the original failure so that its message and stack trace are not lost.
+ throw new RuntimeException("Failed to wait for job to finish: " + job.getJobId(), e);
+ }
+ } else {
+ // The given PipelineResult doesn't support waitToFinish(), such as EvaluationResults
+ // returned by DirectPipelineRunner, so there is nothing to wait for: clean up right away.
+ tearDown();
+ printPendingMessages();
+ }
+ }
+
+ /**
+ * Registers a JVM shutdown hook that tears down the external resources, prints the pending
+ * messages, cancels every given job and then polls each job for a terminal state.
+ *
+ * <p>Each job is checked up to six times with a ten second pause between checks. If the hook
+ * thread is interrupted while pausing, polling of that job stops and the job is reported as
+ * not verified.
+ *
+ * @param jobs the jobs to cancel when the JVM shuts down
+ */
+ private void addShutdownHook(final Collection<DataflowPipelineJob> jobs) {
+ if (dataflowClient == null) {
+ dataflowClient = options.getDataflowClient();
+ }
+
+ Runtime.getRuntime().addShutdownHook(new Thread() {
+ @Override
+ public void run() {
+ tearDown();
+ printPendingMessages();
+ // First request cancellation of every job ...
+ for (DataflowPipelineJob job : jobs) {
+ System.out.println("Canceling example pipeline: " + job.getJobId());
+ try {
+ job.cancel();
+ } catch (IOException e) {
+ System.out.println("Failed to cancel the job,"
+ + " please go to the Developers Console to cancel it manually");
+ System.out.println(
+ MonitoringUtil.getJobMonitoringPageURL(job.getProjectId(), job.getJobId()));
+ }
+ }
+
+ // ... then verify that each of them actually reached a terminal state.
+ for (DataflowPipelineJob job : jobs) {
+ boolean cancellationVerified = false;
+ for (int retryAttempts = 6; retryAttempts > 0; retryAttempts--) {
+ if (job.getState().isTerminal()) {
+ cancellationVerified = true;
+ System.out.println("Canceled example pipeline: " + job.getJobId());
+ break;
+ } else {
+ System.out.println(
+ "The example pipeline is still running. Verifying the cancellation.");
+ }
+ try {
+ Thread.sleep(10000);
+ } catch (InterruptedException e) {
+ // Restore the flag and stop polling instead of sleeping on with the
+ // interrupt silently cleared.
+ Thread.currentThread().interrupt();
+ break;
+ }
+ }
+ if (!cancellationVerified) {
+ System.out.println("Failed to verify the cancellation for job: " + job.getJobId());
+ System.out.println("Please go to the Developers Console to verify manually:");
+ System.out.println(
+ MonitoringUtil.getJobMonitoringPageURL(job.getProjectId(), job.getJobId()));
+ }
+ }
+ }
+ });
+ }
+
+ private void printPendingMessages() {
+ System.out.println();
+ System.out.println("***********************************************************");
+ System.out.println("***********************************************************");
+ for (String message : pendingMessages) {
+ System.out.println(message);
+ }
+ System.out.println("***********************************************************");
+ System.out.println("***********************************************************");
+ }
+
+ /**
+ * Executes the request and returns its result, mapping a "not found" response to null.
+ *
+ * @return the response, or null if the request failed with status {@code SC_NOT_FOUND}
+ * @throws IOException for every other failure, including other JSON error responses
+ */
+ private static <T> T executeNullIfNotFound(
+ AbstractGoogleClientRequest<T> request) throws IOException {
+ try {
+ return request.execute();
+ } catch (GoogleJsonResponseException e) {
+ boolean notFound = e.getStatusCode() == HttpServletResponse.SC_NOT_FOUND;
+ if (!notFound) {
+ throw e;
+ }
+ return null;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/ExampleBigQueryTableOptions.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/ExampleBigQueryTableOptions.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/ExampleBigQueryTableOptions.java
new file mode 100644
index 0000000..7c213b5
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/ExampleBigQueryTableOptions.java
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.common;
+
+import com.google.api.services.bigquery.model.TableSchema;
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+
+/**
+ * Options that can be used to configure BigQuery tables in Dataflow examples.
+ * The project defaults to the project being used to run the example.
+ *
+ * <p>The dataset defaults to {@code dataflow_examples}, the table name defaults to a value
+ * derived from the job name (see {@link BigQueryTableFactory}), and the schema has no default
+ * declared here.
+ */
+public interface ExampleBigQueryTableOptions extends DataflowPipelineOptions {
+ @Description("BigQuery dataset name")
+ @Default.String("dataflow_examples")
+ String getBigQueryDataset();
+ void setBigQueryDataset(String dataset);
+
+ @Description("BigQuery table name")
+ @Default.InstanceFactory(BigQueryTableFactory.class)
+ String getBigQueryTable();
+ void setBigQueryTable(String table);
+
+ @Description("BigQuery table schema")
+ TableSchema getBigQuerySchema();
+ void setBigQuerySchema(TableSchema schema);
+
+ /**
+ * Returns the job name, with every '-' replaced by '_', as the default BigQuery table name.
+ */
+ static class BigQueryTableFactory implements DefaultValueFactory<String> {
+ @Override
+ public String create(PipelineOptions options) {
+ return options.as(DataflowPipelineOptions.class).getJobName()
+ .replace('-', '_');
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/ExamplePubsubTopicAndSubscriptionOptions.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/ExamplePubsubTopicAndSubscriptionOptions.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/ExamplePubsubTopicAndSubscriptionOptions.java
new file mode 100644
index 0000000..d7bd4b8
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/ExamplePubsubTopicAndSubscriptionOptions.java
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.common;
+
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+
+/**
+ * Options that can be used to configure Pub/Sub topic/subscription in Dataflow examples.
+ *
+ * <p>The subscription defaults to {@code projects/<project>/subscriptions/<jobName>}, built by
+ * {@link PubsubSubscriptionFactory}.
+ */
+public interface ExamplePubsubTopicAndSubscriptionOptions extends ExamplePubsubTopicOptions {
+ @Description("Pub/Sub subscription")
+ @Default.InstanceFactory(PubsubSubscriptionFactory.class)
+ String getPubsubSubscription();
+ void setPubsubSubscription(String subscription);
+
+ /**
+ * Returns a default Pub/Sub subscription based on the project and the job names, in the form
+ * {@code projects/<project>/subscriptions/<jobName>}.
+ */
+ static class PubsubSubscriptionFactory implements DefaultValueFactory<String> {
+ @Override
+ public String create(PipelineOptions options) {
+ DataflowPipelineOptions dataflowPipelineOptions =
+ options.as(DataflowPipelineOptions.class);
+ return "projects/" + dataflowPipelineOptions.getProject()
+ + "/subscriptions/" + dataflowPipelineOptions.getJobName();
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/ExamplePubsubTopicOptions.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/ExamplePubsubTopicOptions.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/ExamplePubsubTopicOptions.java
new file mode 100644
index 0000000..4bedf31
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/ExamplePubsubTopicOptions.java
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.common;
+
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+
+/**
+ * Options that can be used to configure Pub/Sub topic in Dataflow examples.
+ *
+ * <p>The topic defaults to {@code projects/<project>/topics/<jobName>}, built by
+ * {@link PubsubTopicFactory}.
+ */
+public interface ExamplePubsubTopicOptions extends DataflowPipelineOptions {
+ @Description("Pub/Sub topic")
+ @Default.InstanceFactory(PubsubTopicFactory.class)
+ String getPubsubTopic();
+ void setPubsubTopic(String topic);
+
+ /**
+ * Returns a default Pub/Sub topic based on the project and the job names, in the form
+ * {@code projects/<project>/topics/<jobName>}.
+ */
+ static class PubsubTopicFactory implements DefaultValueFactory<String> {
+ @Override
+ public String create(PipelineOptions options) {
+ DataflowPipelineOptions dataflowPipelineOptions =
+ options.as(DataflowPipelineOptions.class);
+ return "projects/" + dataflowPipelineOptions.getProject()
+ + "/topics/" + dataflowPipelineOptions.getJobName();
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/PubsubFileInjector.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/PubsubFileInjector.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/PubsubFileInjector.java
new file mode 100644
index 0000000..4a82ae6
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/PubsubFileInjector.java
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.common;
+
+import com.google.api.services.pubsub.Pubsub;
+import com.google.api.services.pubsub.model.PublishRequest;
+import com.google.api.services.pubsub.model.PubsubMessage;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.options.Validation;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.IntraBundleParallelization;
+import com.google.cloud.dataflow.sdk.util.Transport;
+import com.google.common.collect.ImmutableMap;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * A batch Dataflow pipeline for injecting a set of GCS files into
+ * a PubSub topic line by line. Empty lines are skipped.
+ *
+ * <p>This is useful for testing streaming
+ * pipelines. Note that since batch pipelines might retry chunks, this
+ * does _not_ guarantee exactly-once injection of file data. Some lines may
+ * be published multiple times.
+ * </p>
+ */
+public class PubsubFileInjector {
+
+ /**
+ * An incomplete {@code PubsubFileInjector} transform with unbound output topic.
+ */
+ public static class Unbound {
+ private final String timestampLabelKey;
+
+ Unbound() {
+ this.timestampLabelKey = null;
+ }
+
+ Unbound(String timestampLabelKey) {
+ this.timestampLabelKey = timestampLabelKey;
+ }
+
+ Unbound withTimestampLabelKey(String timestampLabelKey) {
+ return new Unbound(timestampLabelKey);
+ }
+
+ public Bound publish(String outputTopic) {
+ return new Bound(outputTopic, timestampLabelKey);
+ }
+ }
+
+ /**
+ * A DoFn that publishes non-empty lines to Google Cloud PubSub.
+ *
+ * <p>Each line is published as its own request. Lines are encoded as UTF-8 so that the
+ * published bytes do not depend on the default charset of the JVM running the bundle.
+ */
+ public static class Bound extends DoFn<String, Void> {
+ private final String outputTopic;
+ private final String timestampLabelKey;
+ // Rebuilt in startBundle; transient, so it is not serialized with the DoFn.
+ public transient Pubsub pubsub;
+
+ /**
+ * @param outputTopic the topic every message is published to
+ * @param timestampLabelKey attribute key under which the element timestamp is sent, or null
+ * to send no timestamp attribute
+ */
+ public Bound(String outputTopic, String timestampLabelKey) {
+ this.outputTopic = outputTopic;
+ this.timestampLabelKey = timestampLabelKey;
+ }
+
+ @Override
+ public void startBundle(Context context) {
+ this.pubsub =
+ Transport.newPubsubClient(context.getPipelineOptions().as(DataflowPipelineOptions.class))
+ .build();
+ }
+
+ @Override
+ public void processElement(ProcessContext c) throws IOException {
+ if (c.element().isEmpty()) {
+ return;
+ }
+ PubsubMessage pubsubMessage = new PubsubMessage();
+ // Explicit charset: the no-argument getBytes() would use the platform default.
+ pubsubMessage.encodeData(c.element().getBytes(java.nio.charset.StandardCharsets.UTF_8));
+ if (timestampLabelKey != null) {
+ // The element timestamp travels as milliseconds in a message attribute.
+ pubsubMessage.setAttributes(
+ ImmutableMap.of(timestampLabelKey, Long.toString(c.timestamp().getMillis())));
+ }
+ PublishRequest publishRequest = new PublishRequest();
+ publishRequest.setMessages(Arrays.asList(pubsubMessage));
+ this.pubsub.projects().topics().publish(outputTopic, publishRequest).execute();
+ }
+ }
+
+ /**
+ * Creates a {@code PubsubFileInjector} transform with the given timestamp label key.
+ *
+ * <p>The result has no output topic yet; call {@code publish} on it to obtain the DoFn.
+ *
+ * @param timestampLabelKey the attribute key under which each element's timestamp is sent
+ */
+ public static Unbound withTimestampLabelKey(String timestampLabelKey) {
+ return new Unbound(timestampLabelKey);
+ }
+
+ /**
+ * Creates a {@code PubsubFileInjector} DoFn that publishes to the given output topic and
+ * attaches no timestamp attribute.
+ */
+ public static Bound publish(String outputTopic) {
+ return new Bound(outputTopic, null);
+ }
+
+ /**
+ * Command line parameter options. Both the input location and the output topic are marked
+ * as required.
+ */
+ private interface PubsubFileInjectorOptions extends PipelineOptions {
+ @Description("GCS location of files.")
+ @Validation.Required
+ String getInput();
+ void setInput(String value);
+
+ @Description("Topic to publish on.")
+ @Validation.Required
+ String getOutputTopic();
+ void setOutputTopic(String value);
+ }
+
+ /**
+ * Sets up and starts the injector pipeline: reads lines from the {@code input} location and
+ * publishes them to the {@code outputTopic} topic, with a maximum parallelism of 20.
+ */
+ public static void main(String[] args) {
+ PubsubFileInjectorOptions options = PipelineOptionsFactory.fromArgs(args)
+ .withValidation()
+ .as(PubsubFileInjectorOptions.class);
+
+ Pipeline pipeline = Pipeline.create(options);
+
+ pipeline
+ .apply(TextIO.Read.from(options.getInput()))
+ .apply(IntraBundleParallelization.of(PubsubFileInjector.publish(options.getOutputTopic()))
+ .withMaxParallelism(20));
+
+ pipeline.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/AutoComplete.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/AutoComplete.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/AutoComplete.java
new file mode 100644
index 0000000..f897338
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/AutoComplete.java
@@ -0,0 +1,516 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete;
+
+import com.google.api.services.bigquery.model.TableFieldSchema;
+import com.google.api.services.bigquery.model.TableReference;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.TableSchema;
+import com.google.api.services.datastore.DatastoreV1.Entity;
+import com.google.api.services.datastore.DatastoreV1.Key;
+import com.google.api.services.datastore.DatastoreV1.Value;
+import com.google.api.services.datastore.client.DatastoreHelper;
+import com.google.cloud.dataflow.examples.common.DataflowExampleUtils;
+import com.google.cloud.dataflow.examples.common.ExampleBigQueryTableOptions;
+import com.google.cloud.dataflow.examples.common.ExamplePubsubTopicOptions;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.PipelineResult;
+import com.google.cloud.dataflow.sdk.coders.AvroCoder;
+import com.google.cloud.dataflow.sdk.coders.DefaultCoder;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO;
+import com.google.cloud.dataflow.sdk.io.DatastoreIO;
+import com.google.cloud.dataflow.sdk.io.PubsubIO;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
+import com.google.cloud.dataflow.sdk.transforms.Count;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.Filter;
+import com.google.cloud.dataflow.sdk.transforms.Flatten;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.Partition;
+import com.google.cloud.dataflow.sdk.transforms.Partition.PartitionFn;
+import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
+import com.google.cloud.dataflow.sdk.transforms.Top;
+import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
+import com.google.cloud.dataflow.sdk.transforms.windowing.SlidingWindows;
+import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
+import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PBegin;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.PCollectionList;
+import com.google.common.base.MoreObjects;
+import com.google.common.base.Preconditions;
+
+import org.joda.time.Duration;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * An example that computes the most popular hash tags
+ * for every prefix, which can be used for auto-completion.
+ *
+ * <p>Concepts: Using the same pipeline in both streaming and batch, combiners,
+ * composite transforms.
+ *
+ * <p>To execute this pipeline using the Dataflow service in batch mode,
+ * specify pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=DataflowPipelineRunner
+ * --inputFile=gs://path/to/input*.txt
+ * }</pre>
+ *
+ * <p>To execute this pipeline using the Dataflow service in streaming mode,
+ * specify pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=DataflowPipelineRunner
+ * --inputFile=gs://YOUR_INPUT_DIRECTORY/*.txt
+ * --streaming
+ * }</pre>
+ *
+ * <p>In streaming mode, results are computed over 30-minute sliding windows that
+ * advance every 5 seconds.
+ */
+public class AutoComplete {
+
+ /**
+ * A PTransform that takes as input a list of tokens and returns
+ * the most common tokens per prefix.
+ */
+ public static class ComputeTopCompletions
+ extends PTransform<PCollection<String>, PCollection<KV<String, List<CompletionCandidate>>>> {
+ private final int candidatesPerPrefix;
+ private final boolean recursive;
+
+ protected ComputeTopCompletions(int candidatesPerPrefix, boolean recursive) {
+ this.candidatesPerPrefix = candidatesPerPrefix;
+ this.recursive = recursive;
+ }
+
+ public static ComputeTopCompletions top(int candidatesPerPrefix, boolean recursive) {
+ return new ComputeTopCompletions(candidatesPerPrefix, recursive);
+ }
+
+ @Override
+ public PCollection<KV<String, List<CompletionCandidate>>> apply(PCollection<String> input) {
+ PCollection<CompletionCandidate> candidates = input
+ // First count how often each token appears.
+ .apply(new Count.PerElement<String>())
+
+ // Map the KV outputs of Count into our own CompletionCandiate class.
+ .apply(ParDo.named("CreateCompletionCandidates").of(
+ new DoFn<KV<String, Long>, CompletionCandidate>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ c.output(new CompletionCandidate(c.element().getKey(), c.element().getValue()));
+ }
+ }));
+
+ // Compute the top via either a flat or recursive algorithm.
+ if (recursive) {
+ return candidates
+ .apply(new ComputeTopRecursive(candidatesPerPrefix, 1))
+ .apply(Flatten.<KV<String, List<CompletionCandidate>>>pCollections());
+ } else {
+ return candidates
+ .apply(new ComputeTopFlat(candidatesPerPrefix, 1));
+ }
+ }
+ }
+
+ /**
+ * Lower latency, but more expensive.
+ */
+ private static class ComputeTopFlat
+ extends PTransform<PCollection<CompletionCandidate>,
+ PCollection<KV<String, List<CompletionCandidate>>>> {
+ private final int candidatesPerPrefix;
+ private final int minPrefix;
+
+ public ComputeTopFlat(int candidatesPerPrefix, int minPrefix) {
+ this.candidatesPerPrefix = candidatesPerPrefix;
+ this.minPrefix = minPrefix;
+ }
+
+ @Override
+ public PCollection<KV<String, List<CompletionCandidate>>> apply(
+ PCollection<CompletionCandidate> input) {
+ return input
+ // For each completion candidate, map it to all prefixes.
+ .apply(ParDo.of(new AllPrefixes(minPrefix)))
+
+ // Find and return the top candiates for each prefix.
+ .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix)
+ .withHotKeyFanout(new HotKeyFanout()));
+ }
+
+ private static class HotKeyFanout implements SerializableFunction<String, Integer> {
+ @Override
+ public Integer apply(String input) {
+ return (int) Math.pow(4, 5 - input.length());
+ }
+ }
+ }
+
+  /**
+   * Cheaper but higher latency.
+   *
+   * <p>Returns two PCollections, the first is top prefixes of size greater
+   * than minPrefix, and the second is top prefixes of size exactly
+   * minPrefix.
+   */
+  private static class ComputeTopRecursive
+      extends PTransform<PCollection<CompletionCandidate>,
+                         PCollectionList<KV<String, List<CompletionCandidate>>>> {
+    /** Once minPrefix exceeds this length the recursion stops and the flat algorithm is used. */
+    private static final int MAX_RECURSIVE_PREFIX = 10;
+
+    private final int candidatesPerPrefix;
+    private final int minPrefix;
+
+    public ComputeTopRecursive(int candidatesPerPrefix, int minPrefix) {
+      this.candidatesPerPrefix = candidatesPerPrefix;
+      this.minPrefix = minPrefix;
+    }
+
+    /**
+     * Sends keys longer than minPrefix to partition 0 and all other keys to partition 1.
+     *
+     * <p>Declared static with its own copy of minPrefix so that it does not hold a hidden
+     * reference to the enclosing transform.
+     */
+    private static class KeySizePartitionFn
+        implements PartitionFn<KV<String, List<CompletionCandidate>>> {
+      private final int minPrefix;
+
+      KeySizePartitionFn(int minPrefix) {
+        this.minPrefix = minPrefix;
+      }
+
+      @Override
+      public int partitionFor(KV<String, List<CompletionCandidate>> elem, int numPartitions) {
+        return elem.getKey().length() > minPrefix ? 0 : 1;
+      }
+    }
+
+    /**
+     * Accepts only candidates whose value has exactly the given length.
+     *
+     * <p>A static nested class rather than an anonymous one, for the same reason as
+     * {@code KeySizePartitionFn}: it must not capture the enclosing transform.
+     */
+    private static class ExactLengthFilter
+        implements SerializableFunction<CompletionCandidate, Boolean> {
+      private final int length;
+
+      ExactLengthFilter(int length) {
+        this.length = length;
+      }
+
+      @Override
+      public Boolean apply(CompletionCandidate c) {
+        return c.getValue().length() == length;
+      }
+    }
+
+    /** Emits every candidate contained in a per-prefix top list. */
+    private static class FlattenTops
+        extends DoFn<KV<String, List<CompletionCandidate>>, CompletionCandidate> {
+      @Override
+      public void processElement(ProcessContext c) {
+        for (CompletionCandidate cc : c.element().getValue()) {
+          c.output(cc);
+        }
+      }
+    }
+
+    @Override
+    public PCollectionList<KV<String, List<CompletionCandidate>>> apply(
+        PCollection<CompletionCandidate> input) {
+      if (minPrefix > MAX_RECURSIVE_PREFIX) {
+        // Base case, partitioning to return the output in the expected format.
+        return input
+            .apply(new ComputeTopFlat(candidatesPerPrefix, minPrefix))
+            .apply(Partition.of(2, new KeySizePartitionFn(minPrefix)));
+      }
+
+      // If a candidate is in the top N for prefix a...b, it must also be in the top
+      // N for a...bX for every X, which is typically a much smaller set to consider.
+      // First, compute the top candidate for prefixes of size at least minPrefix + 1.
+      PCollectionList<KV<String, List<CompletionCandidate>>> larger = input
+          .apply(new ComputeTopRecursive(candidatesPerPrefix, minPrefix + 1));
+
+      // Consider the top candidates for each prefix of length minPrefix + 1...
+      PCollection<KV<String, List<CompletionCandidate>>> small =
+          PCollectionList
+              .of(larger.get(1).apply(ParDo.of(new FlattenTops())))
+              // ...together with those (previously excluded) candidates of length
+              // exactly minPrefix...
+              .and(input.apply(Filter.byPredicate(new ExactLengthFilter(minPrefix))))
+              .apply("FlattenSmall", Flatten.<CompletionCandidate>pCollections())
+              // ...set the key to be the minPrefix-length prefix...
+              .apply(ParDo.of(new AllPrefixes(minPrefix, minPrefix)))
+              // ...and (re)apply the Top operator to all of them together.
+              .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix));
+
+      PCollection<KV<String, List<CompletionCandidate>>> flattenLarger = larger
+          .apply("FlattenLarge", Flatten.<KV<String, List<CompletionCandidate>>>pCollections());
+
+      return PCollectionList.of(flattenLarger).and(small);
+    }
+  }
+
+ /**
+ * A DoFn that keys each candidate by all its prefixes.
+ */
+ private static class AllPrefixes
+ extends DoFn<CompletionCandidate, KV<String, CompletionCandidate>> {
+ private final int minPrefix;
+ private final int maxPrefix;
+ public AllPrefixes(int minPrefix) {
+ this(minPrefix, Integer.MAX_VALUE);
+ }
+ public AllPrefixes(int minPrefix, int maxPrefix) {
+ this.minPrefix = minPrefix;
+ this.maxPrefix = maxPrefix;
+ }
+ @Override
+ public void processElement(ProcessContext c) {
+ String word = c.element().value;
+ for (int i = minPrefix; i <= Math.min(word.length(), maxPrefix); i++) {
+ c.output(KV.of(word.substring(0, i), c.element()));
+ }
+ }
+ }
+
+ /**
+ * Class used to store tag-count pairs.
+ */
+ @DefaultCoder(AvroCoder.class)
+ static class CompletionCandidate implements Comparable<CompletionCandidate> {
+ private long count;
+ private String value;
+
+ public CompletionCandidate(String value, long count) {
+ this.value = value;
+ this.count = count;
+ }
+
+ public long getCount() {
+ return count;
+ }
+
+ public String getValue() {
+ return value;
+ }
+
+ // Empty constructor required for Avro decoding.
+ public CompletionCandidate() {}
+
+ @Override
+ public int compareTo(CompletionCandidate o) {
+ if (this.count < o.count) {
+ return -1;
+ } else if (this.count == o.count) {
+ return this.value.compareTo(o.value);
+ } else {
+ return 1;
+ }
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (other instanceof CompletionCandidate) {
+ CompletionCandidate that = (CompletionCandidate) other;
+ return this.count == that.count && this.value.equals(that.value);
+ } else {
+ return false;
+ }
+ }
+
+ @Override
+ public int hashCode() {
+ return Long.valueOf(count).hashCode() ^ value.hashCode();
+ }
+
+ @Override
+ public String toString() {
+ return "CompletionCandidate[" + value + ", " + count + "]";
+ }
+ }
+
+  /**
+   * Takes as input a set of strings, and emits each #hashtag found therein.
+   *
+   * <p>A hashtag is a {@code '#'} followed by one or more non-whitespace characters; the
+   * leading {@code '#'} is stripped from the emitted value.
+   */
+  static class ExtractHashtags extends DoFn<String, String> {
+    /** Compiled once for the class instead of once per processed element. */
+    private static final Pattern HASHTAG = Pattern.compile("#\\S+");
+
+    @Override
+    public void processElement(ProcessContext c) {
+      Matcher m = HASHTAG.matcher(c.element());
+      while (m.find()) {
+        // Drop the leading '#'.
+        c.output(m.group().substring(1));
+      }
+    }
+  }
+
+  /**
+   * Converts the top candidates for a prefix into a BigQuery row with a "prefix" field and
+   * a repeated "tags" record holding each candidate's "count" and "tag".
+   */
+  static class FormatForBigquery extends DoFn<KV<String, List<CompletionCandidate>>, TableRow> {
+    @Override
+    public void processElement(ProcessContext c) {
+      KV<String, List<CompletionCandidate>> prefixAndTops = c.element();
+
+      List<TableRow> tagRows = new ArrayList<>();
+      for (CompletionCandidate candidate : prefixAndTops.getValue()) {
+        TableRow tagRow = new TableRow();
+        tagRow.set("count", candidate.getCount());
+        tagRow.set("tag", candidate.getValue());
+        tagRows.add(tagRow);
+      }
+
+      TableRow prefixRow = new TableRow();
+      prefixRow.set("prefix", prefixAndTops.getKey());
+      prefixRow.set("tags", tagRows);
+      c.output(prefixRow);
+    }
+
+    /**
+     * Defines the BigQuery schema used for the output.
+     */
+    static TableSchema getSchema() {
+      // Nested record describing a single completion.
+      TableFieldSchema countField = new TableFieldSchema().setName("count").setType("INTEGER");
+      TableFieldSchema tagField = new TableFieldSchema().setName("tag").setType("STRING");
+      List<TableFieldSchema> tagFields = new ArrayList<>();
+      tagFields.add(countField);
+      tagFields.add(tagField);
+
+      // Top-level row: the prefix plus its repeated completions.
+      TableFieldSchema prefixField = new TableFieldSchema().setName("prefix").setType("STRING");
+      TableFieldSchema tagsField = new TableFieldSchema()
+          .setName("tags").setType("RECORD").setMode("REPEATED").setFields(tagFields);
+      List<TableFieldSchema> fields = new ArrayList<>();
+      fields.add(prefixField);
+      fields.add(tagsField);
+
+      return new TableSchema().setFields(fields);
+    }
+  }
+
+  /**
+   * Takes as input the top candidates per prefix, and emits an entity
+   * suitable for writing to Datastore.
+   *
+   * <p>The entity key is built from the configured kind and the prefix; the candidates are
+   * stored in a "candidates" property as unindexed nested entities with "tag" and "count".
+   */
+  static class FormatForDatastore extends DoFn<KV<String, List<CompletionCandidate>>, Entity> {
+    private String kind;
+
+    public FormatForDatastore(String kind) {
+      this.kind = kind;
+    }
+
+    @Override
+    public void processElement(ProcessContext c) {
+      KV<String, List<CompletionCandidate>> prefixAndTops = c.element();
+
+      Entity.Builder prefixEntity = Entity.newBuilder();
+      Key prefixKey = DatastoreHelper.makeKey(kind, prefixAndTops.getKey()).build();
+      prefixEntity.setKey(prefixKey);
+
+      List<Value> candidateValues = new ArrayList<>();
+      for (CompletionCandidate candidate : prefixAndTops.getValue()) {
+        Entity.Builder candidateEntity = Entity.newBuilder();
+        candidateEntity.addProperty(DatastoreHelper.makeProperty(
+            "tag", DatastoreHelper.makeValue(candidate.getValue())));
+        candidateEntity.addProperty(DatastoreHelper.makeProperty(
+            "count", DatastoreHelper.makeValue(candidate.getCount())));
+        candidateValues.add(
+            DatastoreHelper.makeValue(candidateEntity).setIndexed(false).build());
+      }
+
+      prefixEntity.addProperty(DatastoreHelper.makeProperty(
+          "candidates", DatastoreHelper.makeValue(candidateValues)));
+      c.output(prefixEntity.build());
+    }
+  }
+
+ /**
+ * Options supported by this class.
+ *
+ * <p>Inherits standard Dataflow configuration options.
+ *
+ * <p>As used by {@code main}: {@code inputFile} is the text source in batch mode and the file
+ * handed to the injector pipeline in streaming mode; {@code recursive} selects between the
+ * recursive and flat top-completions algorithms; {@code kind} is the entity kind given to
+ * {@code FormatForDatastore}; {@code outputToBigQuery} and {@code outputToDatastore} enable the
+ * respective sinks (Datastore output is rejected in streaming mode); {@code outputDataset}
+ * falls back to the project ID when it is null. {@code inputFile} declares no default value.
+ */
+ private static interface Options extends ExamplePubsubTopicOptions, ExampleBigQueryTableOptions {
+ @Description("Input text file")
+ String getInputFile();
+ void setInputFile(String value);
+
+ @Description("Whether to use the recursive algorithm")
+ @Default.Boolean(true)
+ Boolean getRecursive();
+ void setRecursive(Boolean value);
+
+ @Description("Dataset entity kind")
+ @Default.String("autocomplete-demo")
+ String getKind();
+ void setKind(String value);
+
+ @Description("Whether output to BigQuery")
+ @Default.Boolean(true)
+ Boolean getOutputToBigQuery();
+ void setOutputToBigQuery(Boolean value);
+
+ @Description("Whether output to Datastore")
+ @Default.Boolean(false)
+ Boolean getOutputToDatastore();
+ void setOutputToDatastore(Boolean value);
+
+ @Description("Datastore output dataset ID, defaults to project ID")
+ String getOutputDataset();
+ void setOutputDataset(String value);
+ }
+
+  /**
+   * Builds and runs the auto-complete pipeline, in batch mode (reading {@code --inputFile})
+   * or in windowed streaming mode (reading the Pub/Sub topic), and writes the top completions
+   * per prefix to Datastore and/or BigQuery depending on the options.
+   *
+   * <p>In streaming mode, when an input file is given, a separate injector pipeline is started
+   * to publish that file to the Pub/Sub topic.
+   *
+   * @param args command-line arguments, parsed into {@code Options}
+   * @throws IOException presumably raised by the {@code DataflowExampleUtils} setup helpers
+   */
+  public static void main(String[] args) throws IOException {
+    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
+
+    if (options.isStreaming()) {
+      // In order to cancel the pipelines automatically,
+      // {@literal DataflowPipelineRunner} is forced to be used.
+      options.setRunner(DataflowPipelineRunner.class);
+    }
+
+    options.setBigQuerySchema(FormatForBigquery.getSchema());
+    DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options);
+
+    // We support running the same pipeline in either
+    // batch or windowed streaming mode.
+    PTransform<? super PBegin, PCollection<String>> readSource;
+    WindowFn<Object, ?> windowFn;
+    if (options.isStreaming()) {
+      Preconditions.checkArgument(
+          !options.getOutputToDatastore(), "DatastoreIO is not supported in streaming.");
+      dataflowUtils.setupPubsub();
+
+      readSource = PubsubIO.Read.topic(options.getPubsubTopic());
+      windowFn = SlidingWindows.of(Duration.standardMinutes(30)).every(Duration.standardSeconds(5));
+    } else {
+      readSource = TextIO.Read.from(options.getInputFile());
+      windowFn = new GlobalWindows();
+    }
+
+    // Create the pipeline.
+    Pipeline p = Pipeline.create(options);
+    PCollection<KV<String, List<CompletionCandidate>>> toWrite = p
+        .apply(readSource)
+        .apply(ParDo.of(new ExtractHashtags()))
+        .apply(Window.<String>into(windowFn))
+        .apply(ComputeTopCompletions.top(10, options.getRecursive()));
+
+    if (options.getOutputToDatastore()) {
+      toWrite
+          .apply(ParDo.named("FormatForDatastore").of(new FormatForDatastore(options.getKind())))
+          .apply(DatastoreIO.writeTo(MoreObjects.firstNonNull(
+              options.getOutputDataset(), options.getProject())));
+    }
+    if (options.getOutputToBigQuery()) {
+      dataflowUtils.setupBigQueryTable();
+
+      TableReference tableRef = new TableReference();
+      tableRef.setProjectId(options.getProject());
+      tableRef.setDatasetId(options.getBigQueryDataset());
+      tableRef.setTableId(options.getBigQueryTable());
+
+      toWrite
+          .apply(ParDo.of(new FormatForBigquery()))
+          .apply(BigQueryIO.Write
+              .to(tableRef)
+              .withSchema(FormatForBigquery.getSchema())
+              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
+              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
+    }
+
+    // Run the pipeline.
+    PipelineResult result = p.run();
+
+    // --inputFile declares no default, so it is null when omitted; treat null like an
+    // empty string (no injection) instead of failing with a NullPointerException.
+    String inputFile = options.getInputFile();
+    if (options.isStreaming() && inputFile != null && !inputFile.isEmpty()) {
+      // Inject the data into the Pub/Sub topic with a Dataflow batch pipeline.
+      dataflowUtils.runInjectorPipeline(inputFile, options.getPubsubTopic());
+    }
+
+    // dataflowUtils will try to cancel the pipeline and the injector before the program exits.
+    dataflowUtils.waitToFinish(result);
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/README.md
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/README.md b/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/README.md
new file mode 100644
index 0000000..5fba154
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/README.md
@@ -0,0 +1,44 @@
+
+# "Complete" Examples
+
+This directory contains end-to-end example pipelines that perform complex data processing tasks. They include:
+
+<ul>
+ <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/complete/AutoComplete.java">AutoComplete</a>
+ — An example that computes the most popular hash tags for every
+ prefix, which can be used for auto-completion. Demonstrates how to use the
+ same pipeline in both streaming and batch, combiners, and composite
+ transforms.</li>
+ <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/complete/StreamingWordExtract.java">StreamingWordExtract</a>
+ — A streaming pipeline example that inputs lines of text from a Cloud
+ Pub/Sub topic, splits each line into individual words, capitalizes those
+ words, and writes the output to a BigQuery table.
+ </li>
+ <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TfIdf.java">TfIdf</a>
+ — An example that computes a basic TF-IDF search table for a directory or
+ Cloud Storage prefix. Demonstrates joining data, side inputs, and logging.
+ </li>
+ <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessions.java">TopWikipediaSessions</a>
+ — An example that reads Wikipedia edit data from Cloud Storage and
+ computes the user with the longest string of edits separated by no more than
+ an hour within each month. Demonstrates using Cloud Dataflow
+ <code>Windowing</code> to perform time-based aggregations of data.
+ </li>
+ <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficMaxLaneFlow.java">TrafficMaxLaneFlow</a>
+ — A streaming Cloud Dataflow example using BigQuery output in the
+ <code>traffic sensor</code> domain. Demonstrates the Cloud Dataflow streaming
+ runner, sliding windows, Cloud Pub/Sub topic ingestion, the use of the
+ <code>AvroCoder</code> to encode a custom class, and custom
+ <code>Combine</code> transforms.
+ </li>
+ <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficRoutes.java">TrafficRoutes</a>
+ — A streaming Cloud Dataflow example using BigQuery output in the
+ <code>traffic sensor</code> domain. Demonstrates the Cloud Dataflow streaming
+ runner, <code>GroupByKey</code>, keyed state, sliding windows, and Cloud
+ Pub/Sub topic ingestion.
+ </li>
+ </ul>
+
+See the [documentation](https://cloud.google.com/dataflow/getting-started) and the [Examples
+README](../../../../../../../../../README.md) for
+information about how to run these examples.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/StreamingWordExtract.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/StreamingWordExtract.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/StreamingWordExtract.java
new file mode 100644
index 0000000..99c5249
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/StreamingWordExtract.java
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete;
+
+import com.google.api.services.bigquery.model.TableFieldSchema;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.TableSchema;
+import com.google.cloud.dataflow.examples.common.DataflowExampleUtils;
+import com.google.cloud.dataflow.examples.common.ExampleBigQueryTableOptions;
+import com.google.cloud.dataflow.examples.common.ExamplePubsubTopicOptions;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.PipelineResult;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO;
+import com.google.cloud.dataflow.sdk.io.PubsubIO;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Locale;
+
+/**
+ * A streaming Dataflow Example using BigQuery output.
+ *
+ * <p>This pipeline example reads lines of text from a PubSub topic, splits each line
+ * into individual words, capitalizes those words, and writes the output to
+ * a BigQuery table.
+ *
+ * <p>By default, the example will run a separate pipeline to inject the data from the default
+ * {@literal --inputFile} to the Pub/Sub {@literal --pubsubTopic}. It will make it available for
+ * the streaming pipeline to process. You may override the default {@literal --inputFile} with the
+ * file of your choosing. You may also set {@literal --inputFile} to an empty string, which will
+ * disable the automatic Pub/Sub injection, and allow you to use a separate tool to control the
+ * input to this example.
+ *
+ * <p>The example is configured to use the default Pub/Sub topic and the default BigQuery table
+ * from the example common package (there are no defaults for a general Dataflow pipeline).
+ * You can override them by using the {@literal --pubsubTopic}, {@literal --bigQueryDataset}, and
+ * {@literal --bigQueryTable} options. If the Pub/Sub topic or the BigQuery table do not exist,
+ * the example will try to create them.
+ *
+ * <p>The example will try to cancel the pipelines on the signal to terminate the process (CTRL-C)
+ * and then exit.
+ */
+public class StreamingWordExtract {
+
+  /**
+   * A DoFn that splits a line of text on every run of characters other than ASCII letters
+   * and apostrophes, and emits each non-empty piece as a word.
+   */
+  static class ExtractWords extends DoFn<String, String> {
+    @Override
+    public void processElement(ProcessContext c) {
+      for (String token : c.element().split("[^a-zA-Z']+")) {
+        // Skip the empty token that a leading delimiter produces.
+        if (token.isEmpty()) {
+          continue;
+        }
+        c.output(token);
+      }
+    }
+  }
+
+  /**
+   * A DoFn that uppercases a word.
+   *
+   * <p>Uses {@code Locale.ROOT} so the output does not depend on the default locale of the
+   * machine running the pipeline (under a Turkish locale, for example, "i" would otherwise
+   * become a dotted capital I).
+   */
+  static class Uppercase extends DoFn<String, String> {
+    @Override
+    public void processElement(ProcessContext c) {
+      c.output(c.element().toUpperCase(Locale.ROOT));
+    }
+  }
+
+  /**
+   * Converts strings into BigQuery rows.
+   */
+  static class StringToRowConverter extends DoFn<String, TableRow> {
+    /**
+     * In this example, put the whole string into a single BigQuery field.
+     */
+    @Override
+    public void processElement(ProcessContext c) {
+      c.output(new TableRow().set("string_field", c.element()));
+    }
+
+    /**
+     * Defines the BigQuery schema used for the output: a single STRING column named
+     * "string_field".
+     */
+    static TableSchema getSchema() {
+      // A plain ArrayList instead of double-brace initialization, which would create an
+      // anonymous ArrayList subclass just to add one element.
+      ArrayList<TableFieldSchema> fields = new ArrayList<>();
+      fields.add(new TableFieldSchema().setName("string_field").setType("STRING"));
+      return new TableSchema().setFields(fields);
+    }
+  }
+
+ /**
+ * Options supported by {@link StreamingWordExtract}.
+ *
+ * <p>Inherits standard configuration options.
+ *
+ * <p>{@code inputFile} defaults to a sample text at a {@code gs://} location; {@code main}
+ * skips the Pub/Sub injector pipeline when it is set to an empty string.
+ */
+ private interface StreamingWordExtractOptions
+ extends ExamplePubsubTopicOptions, ExampleBigQueryTableOptions {
+ @Description("Input file to inject to Pub/Sub topic")
+ @Default.String("gs://dataflow-samples/shakespeare/kinglear.txt")
+ String getInputFile();
+ void setInputFile(String value);
+ }
+
+ /**
+ * Sets up and starts streaming pipeline.
+ *
+ * @throws IOException if there is a problem setting up resources
+ */
+ public static void main(String[] args) throws IOException {
+ StreamingWordExtractOptions options = PipelineOptionsFactory.fromArgs(args)
+ .withValidation()
+ .as(StreamingWordExtractOptions.class);
+ options.setStreaming(true);
+ // In order to cancel the pipelines automatically,
+ // {@literal DataflowPipelineRunner} is forced to be used.
+ options.setRunner(DataflowPipelineRunner.class);
+
+ options.setBigQuerySchema(StringToRowConverter.getSchema());
+ DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options);
+ dataflowUtils.setup();
+
+ Pipeline pipeline = Pipeline.create(options);
+
+ String tableSpec = new StringBuilder()
+ .append(options.getProject()).append(":")
+ .append(options.getBigQueryDataset()).append(".")
+ .append(options.getBigQueryTable())
+ .toString();
+ pipeline
+ .apply(PubsubIO.Read.topic(options.getPubsubTopic()))
+ .apply(ParDo.of(new ExtractWords()))
+ .apply(ParDo.of(new Uppercase()))
+ .apply(ParDo.of(new StringToRowConverter()))
+ .apply(BigQueryIO.Write.to(tableSpec)
+ .withSchema(StringToRowConverter.getSchema()));
+
+ PipelineResult result = pipeline.run();
+
+ if (!options.getInputFile().isEmpty()) {
+ // Inject the data into the Pub/Sub topic with a Dataflow batch pipeline.
+ dataflowUtils.runInjectorPipeline(options.getInputFile(), options.getPubsubTopic());
+ }
+
+ // dataflowUtils will try to cancel the pipeline and the injector before the program exists.
+ dataflowUtils.waitToFinish(result);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/TfIdf.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/TfIdf.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/TfIdf.java
new file mode 100644
index 0000000..65ac753
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/TfIdf.java
@@ -0,0 +1,431 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete;
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.coders.Coder;
+import com.google.cloud.dataflow.sdk.coders.KvCoder;
+import com.google.cloud.dataflow.sdk.coders.StringDelegateCoder;
+import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.GcsOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.options.Validation;
+import com.google.cloud.dataflow.sdk.transforms.Count;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.Flatten;
+import com.google.cloud.dataflow.sdk.transforms.Keys;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.RemoveDuplicates;
+import com.google.cloud.dataflow.sdk.transforms.Values;
+import com.google.cloud.dataflow.sdk.transforms.View;
+import com.google.cloud.dataflow.sdk.transforms.WithKeys;
+import com.google.cloud.dataflow.sdk.transforms.join.CoGbkResult;
+import com.google.cloud.dataflow.sdk.transforms.join.CoGroupByKey;
+import com.google.cloud.dataflow.sdk.transforms.join.KeyedPCollectionTuple;
+import com.google.cloud.dataflow.sdk.util.GcsUtil;
+import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.PCollectionList;
+import com.google.cloud.dataflow.sdk.values.PCollectionView;
+import com.google.cloud.dataflow.sdk.values.PDone;
+import com.google.cloud.dataflow.sdk.values.PInput;
+import com.google.cloud.dataflow.sdk.values.TupleTag;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * An example that computes a basic TF-IDF search table for a directory or GCS prefix.
+ *
+ * <p>Concepts: joining data; side inputs; logging
+ *
+ * <p>To execute this pipeline locally, specify general pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * }</pre>
+ * and a local output file or output prefix on GCS:
+ * <pre>{@code
+ * --output=[YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PREFIX]
+ * }</pre>
+ *
+ * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=BlockingDataflowPipelineRunner
+ * }</pre>
+ * and an output prefix on GCS:
+ * <pre>{@code
+ * --output=gs://YOUR_OUTPUT_PREFIX
+ * }</pre>
+ *
+ * <p>The default input is {@code gs://dataflow-samples/shakespeare/} and can be overridden with
+ * {@code --input}.
+ */
+public class TfIdf {
+ /**
+  * Command-line options understood by {@link TfIdf}, in addition to the
+  * standard {@link PipelineOptions} it extends.
+  */
+ private interface Options extends PipelineOptions {
+   /** Input location; defaults to the public Shakespeare sample prefix on GCS. */
+   @Description("Path to the directory or GCS prefix containing files to read from")
+   @Default.String("gs://dataflow-samples/shakespeare/")
+   String getInput();
+
+   void setInput(String value);
+
+   /** Output location; required, there is no default. */
+   @Description("Prefix of output URI to write to")
+   @Validation.Required
+   String getOutput();
+
+   void setOutput(String value);
+ }
+
+ /**
+ * Lists documents contained beneath the {@code options.input} prefix/directory.
+ */
+ public static Set<URI> listInputDocuments(Options options)
+ throws URISyntaxException, IOException {
+ URI baseUri = new URI(options.getInput());
+
+ // List all documents in the directory or GCS prefix.
+ URI absoluteUri;
+ if (baseUri.getScheme() != null) {
+ absoluteUri = baseUri;
+ } else {
+ absoluteUri = new URI(
+ "file",
+ baseUri.getAuthority(),
+ baseUri.getPath(),
+ baseUri.getQuery(),
+ baseUri.getFragment());
+ }
+
+ Set<URI> uris = new HashSet<>();
+ if (absoluteUri.getScheme().equals("file")) {
+ File directory = new File(absoluteUri);
+ for (String entry : directory.list()) {
+ File path = new File(directory, entry);
+ uris.add(path.toURI());
+ }
+ } else if (absoluteUri.getScheme().equals("gs")) {
+ GcsUtil gcsUtil = options.as(GcsOptions.class).getGcsUtil();
+ URI gcsUriGlob = new URI(
+ absoluteUri.getScheme(),
+ absoluteUri.getAuthority(),
+ absoluteUri.getPath() + "*",
+ absoluteUri.getQuery(),
+ absoluteUri.getFragment());
+ for (GcsPath entry : gcsUtil.expand(GcsPath.fromUri(gcsUriGlob))) {
+ uris.add(entry.toUri());
+ }
+ }
+
+ return uris;
+ }
+
+ /**
+  * Reads every line of each document at the given URIs and pairs each line
+  * with the URI of the document it was read from.
+  */
+ public static class ReadDocuments
+     extends PTransform<PInput, PCollection<KV<URI, String>>> {
+   private Iterable<URI> uris;
+
+   public ReadDocuments(Iterable<URI> uris) {
+     this.uris = uris;
+   }
+
+   /** Keys are encoded through {@link StringDelegateCoder}, values as UTF-8 strings. */
+   @Override
+   public Coder<?> getDefaultOutputCoder() {
+     Coder<URI> uriCoder = StringDelegateCoder.of(URI.class);
+     return KvCoder.of(uriCoder, StringUtf8Coder.of());
+   }
+
+   @Override
+   public PCollection<KV<URI, String>> apply(PInput input) {
+     Pipeline pipeline = input.getPipeline();
+
+     // One TextIO.Read is created per document; the per-document outputs are
+     // gathered in this list and flattened into a single collection at the end.
+     PCollectionList<KV<URI, String>> perDocument = PCollectionList.empty(pipeline);
+
+     for (final URI uri : uris) {
+       // TextIO.Read supports:
+       //  - file: URIs and paths locally
+       //  - gs: URIs on the service
+       // so file: URIs are handed over as plain paths and everything else verbatim.
+       String location = uri.getScheme().equals("file") ? new File(uri).getPath() : uri.toString();
+
+       PCollection<String> lines = pipeline.apply(
+           TextIO.Read.from(location).named("TextIO.Read(" + location + ")"));
+       PCollection<KV<URI, String>> keyedLines =
+           lines.apply("WithKeys(" + location + ")", WithKeys.<URI, String>of(uri));
+
+       perDocument = perDocument.and(keyedLines);
+     }
+
+     return perDocument.apply(Flatten.<KV<URI, String>>pCollections());
+   }
+ }
+
+ /**
+ * A transform containing a basic TF-IDF pipeline. The input consists of KV objects
+ * where the key is the document's URI and the value is a piece
+ * of the document's content. The output is a mapping from terms to
+ * scores for each document URI.
+ */
+ public static class ComputeTfIdf
+ extends PTransform<PCollection<KV<URI, String>>, PCollection<KV<String, KV<URI, Double>>>> {
+ public ComputeTfIdf() { }
+
+ @Override
+ public PCollection<KV<String, KV<URI, Double>>> apply(
+ PCollection<KV<URI, String>> uriToContent) {
+
+ // Compute the total number of documents, and
+ // prepare this singleton PCollectionView for
+ // use as a side input.
+ final PCollectionView<Long> totalDocuments =
+ uriToContent
+ .apply("GetURIs", Keys.<URI>create())
+ .apply("RemoveDuplicateDocs", RemoveDuplicates.<URI>create())
+ .apply(Count.<URI>globally())
+ .apply(View.<Long>asSingleton());
+
+ // Create a collection of pairs mapping a URI to each
+ // of the words in the document associated with that URI.
+ // Words are split on runs of non-word characters and lower-cased.
+ PCollection<KV<URI, String>> uriToWords = uriToContent
+ .apply(ParDo.named("SplitWords").of(
+ new DoFn<KV<URI, String>, KV<URI, String>>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ URI uri = c.element().getKey();
+ String line = c.element().getValue();
+ for (String word : line.split("\\W+")) {
+ // Log INFO messages when the word "love" is found.
+ if (word.toLowerCase().equals("love")) {
+ LOG.info("Found {}", word.toLowerCase());
+ }
+
+ // split() can yield an empty token (e.g. when the line starts
+ // with a separator); those are not emitted.
+ if (!word.isEmpty()) {
+ c.output(KV.of(uri, word.toLowerCase()));
+ }
+ }
+ }
+ }));
+
+ // Compute a mapping from each word to the total
+ // number of documents in which it appears.
+ PCollection<KV<String, Long>> wordToDocCount = uriToWords
+ .apply("RemoveDuplicateWords", RemoveDuplicates.<KV<URI, String>>create())
+ .apply(Values.<String>create())
+ .apply("CountDocs", Count.<String>perElement());
+
+ // Compute a mapping from each URI to the total
+ // number of words in the document associated with that URI.
+ PCollection<KV<URI, Long>> uriToWordTotal = uriToWords
+ .apply("GetURIs2", Keys.<URI>create())
+ .apply("CountWords", Count.<URI>perElement());
+
+ // Count, for each (URI, word) pair, the number of
+ // occurrences of that word in the document associated
+ // with the URI.
+ PCollection<KV<KV<URI, String>, Long>> uriAndWordToCount = uriToWords
+ .apply("CountWordDocPairs", Count.<KV<URI, String>>perElement());
+
+ // Adjust the above collection to a mapping from
+ // (URI, word) pairs to counts into an isomorphic mapping
+ // from URI to (word, count) pairs, to prepare for a join
+ // by the URI key.
+ PCollection<KV<URI, KV<String, Long>>> uriToWordAndCount = uriAndWordToCount
+ .apply(ParDo.named("ShiftKeys").of(
+ new DoFn<KV<KV<URI, String>, Long>, KV<URI, KV<String, Long>>>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ URI uri = c.element().getKey().getKey();
+ String word = c.element().getKey().getValue();
+ Long occurrences = c.element().getValue();
+ c.output(KV.of(uri, KV.of(word, occurrences)));
+ }
+ }));
+
+ // Prepare to join the mapping of URI to (word, count) pairs with
+ // the mapping of URI to total word counts, by associating
+ // each of the input PCollection<KV<URI, ...>> with
+ // a tuple tag. Each input must have the same key type, URI
+ // in this case. The type parameter of the tuple tag matches
+ // the types of the values for each collection.
+ final TupleTag<Long> wordTotalsTag = new TupleTag<Long>();
+ final TupleTag<KV<String, Long>> wordCountsTag = new TupleTag<KV<String, Long>>();
+ KeyedPCollectionTuple<URI> coGbkInput = KeyedPCollectionTuple
+ .of(wordTotalsTag, uriToWordTotal)
+ .and(wordCountsTag, uriToWordAndCount);
+
+ // Perform a CoGroupByKey (a sort of pre-join) on the prepared
+ // inputs. This yields a mapping from URI to a CoGbkResult
+ // (CoGroupByKey Result). The CoGbkResult is a mapping
+ // from the above tuple tags to the values in each input
+ // associated with a particular URI. In this case, each
+ // KV<URI, CoGbkResult> groups a URI with the total number of
+ // words in that document as well as all the (word, count)
+ // pairs for particular words.
+ PCollection<KV<URI, CoGbkResult>> uriToWordAndCountAndTotal = coGbkInput
+ .apply("CoGroupByUri", CoGroupByKey.<URI>create());
+
+ // Compute a mapping from each word to a (URI, term frequency)
+ // pair for each URI. A word's term frequency for a document
+ // is simply the number of times that word occurs in the document
+ // divided by the total number of words in the document.
+ PCollection<KV<String, KV<URI, Double>>> wordToUriAndTf = uriToWordAndCountAndTotal
+ .apply(ParDo.named("ComputeTermFrequencies").of(
+ new DoFn<KV<URI, CoGbkResult>, KV<String, KV<URI, Double>>>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ URI uri = c.element().getKey();
+ Long wordTotal = c.element().getValue().getOnly(wordTotalsTag);
+
+ for (KV<String, Long> wordAndCount
+ : c.element().getValue().getAll(wordCountsTag)) {
+ String word = wordAndCount.getKey();
+ Long wordCount = wordAndCount.getValue();
+ Double termFrequency = wordCount.doubleValue() / wordTotal.doubleValue();
+ c.output(KV.of(word, KV.of(uri, termFrequency)));
+ }
+ }
+ }));
+
+ // Compute a mapping from each word to its document frequency.
+ // A word's document frequency in a corpus is the number of
+ // documents in which the word appears divided by the total
+ // number of documents in the corpus. Note how the total number of
+ // documents is passed as a side input; the same value is
+ // presented to each invocation of the DoFn.
+ PCollection<KV<String, Double>> wordToDf = wordToDocCount
+ .apply(ParDo
+ .named("ComputeDocFrequencies")
+ .withSideInputs(totalDocuments)
+ .of(new DoFn<KV<String, Long>, KV<String, Double>>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ String word = c.element().getKey();
+ Long documentCount = c.element().getValue();
+ Long documentTotal = c.sideInput(totalDocuments);
+ Double documentFrequency = documentCount.doubleValue()
+ / documentTotal.doubleValue();
+
+ c.output(KV.of(word, documentFrequency));
+ }
+ }));
+
+ // Join the term frequency and document frequency
+ // collections, each keyed on the word.
+ final TupleTag<KV<URI, Double>> tfTag = new TupleTag<KV<URI, Double>>();
+ final TupleTag<Double> dfTag = new TupleTag<Double>();
+ PCollection<KV<String, CoGbkResult>> wordToUriAndTfAndDf = KeyedPCollectionTuple
+ .of(tfTag, wordToUriAndTf)
+ .and(dfTag, wordToDf)
+ .apply(CoGroupByKey.<String>create());
+
+ // Compute a mapping from each word to a (URI, TF-IDF) score
+ // for each URI. There are a variety of definitions of TF-IDF
+ // ("term frequency - inverse document frequency") score;
+ // here we use a basic version that is the term frequency
+ // multiplied by the log of the inverse document frequency,
+ // i.e. tf * log(1 / df).
+ PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf = wordToUriAndTfAndDf
+ .apply(ParDo.named("ComputeTfIdf").of(
+ new DoFn<KV<String, CoGbkResult>, KV<String, KV<URI, Double>>>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ String word = c.element().getKey();
+ Double df = c.element().getValue().getOnly(dfTag);
+
+ for (KV<URI, Double> uriAndTf : c.element().getValue().getAll(tfTag)) {
+ URI uri = uriAndTf.getKey();
+ Double tf = uriAndTf.getValue();
+ Double tfIdf = tf * Math.log(1 / df);
+ c.output(KV.of(word, KV.of(uri, tfIdf)));
+ }
+ }
+ }));
+
+ return wordToUriAndTfIdf;
+ }
+
+ // Instantiate Logger.
+ // It is suggested that the user specify the class name of the containing class
+ // (in this case ComputeTfIdf).
+ private static final Logger LOG = LoggerFactory.getLogger(ComputeTfIdf.class);
+ }
+
+ /**
+  * A {@link PTransform} that renders each (term, URI, score) entry as one line
+  * of comma-and-tab separated text and writes the lines with a {@code .csv} suffix.
+  */
+ public static class WriteTfIdf
+     extends PTransform<PCollection<KV<String, KV<URI, Double>>>, PDone> {
+   private String output;
+
+   public WriteTfIdf(String output) {
+     this.output = output;
+   }
+
+   @Override
+   public PDone apply(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
+     // Formats a single element as "term,<TAB>uri,<TAB>score".
+     DoFn<KV<String, KV<URI, Double>>, String> formatFn =
+         new DoFn<KV<String, KV<URI, Double>>, String>() {
+           @Override
+           public void processElement(ProcessContext c) {
+             String term = c.element().getKey();
+             KV<URI, Double> uriAndScore = c.element().getValue();
+             c.output(String.format("%s,\t%s,\t%f",
+                 term,
+                 uriAndScore.getKey(),
+                 uriAndScore.getValue()));
+           }
+         };
+
+     PCollection<String> lines = wordToUriAndTfIdf.apply(ParDo.named("Format").of(formatFn));
+     return lines.apply(TextIO.Write.to(output).withSuffix(".csv"));
+   }
+ }
+
+ /**
+  * Parses and validates the options, then builds and runs the pipeline:
+  * list and read the input documents, compute TF-IDF scores, write them out.
+  */
+ public static void main(String[] args) throws Exception {
+   Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
+
+   Pipeline pipeline = Pipeline.create(options);
+   // URI values travelling through the pipeline are encoded with StringDelegateCoder.
+   pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));
+
+   Set<URI> documents = listInputDocuments(options);
+   PCollection<KV<URI, String>> uriToLine = pipeline.apply(new ReadDocuments(documents));
+   PCollection<KV<String, KV<URI, Double>>> wordToScores = uriToLine.apply(new ComputeTfIdf());
+   wordToScores.apply(new WriteTfIdf(options.getOutput()));
+
+   pipeline.run();
+ }
+}
[62/67] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/InjectorUtils.java
----------------------------------------------------------------------
diff --git a/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/InjectorUtils.java b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/InjectorUtils.java
new file mode 100644
index 0000000..55982df
--- /dev/null
+++ b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/InjectorUtils.java
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete.game.injector;
+
+
+import com.google.api.client.googleapis.auth.oauth2.GoogleCredential;
+import com.google.api.client.googleapis.json.GoogleJsonResponseException;
+import com.google.api.client.googleapis.util.Utils;
+import com.google.api.client.http.HttpRequestInitializer;
+import com.google.api.client.http.HttpStatusCodes;
+import com.google.api.client.http.HttpTransport;
+import com.google.api.client.json.JsonFactory;
+import com.google.api.services.pubsub.Pubsub;
+import com.google.api.services.pubsub.PubsubScopes;
+import com.google.api.services.pubsub.model.Topic;
+
+import com.google.common.base.Preconditions;
+
+import java.io.IOException;
+
+/**
+ * Static helpers for the injector example: building a Pub/Sub client,
+ * formatting fully qualified topic names, and creating topics on demand.
+ */
+class InjectorUtils {
+
+  private static final String APP_NAME = "injector";
+
+  /**
+   * Builds a new Pubsub client and returns it.
+   *
+   * <p>The client authenticates with the application default credentials, scoped to
+   * all Pub/Sub scopes when the credential requires scoping. If the credential carries
+   * client authentication, a warning explaining that service account credentials are
+   * needed is printed and the JVM is terminated with exit status 1.
+   *
+   * @param httpTransport transport used by the client; must not be null
+   * @param jsonFactory JSON factory used by the client; must not be null
+   * @return a Pubsub client whose requests are initialized by
+   *     {@link RetryHttpInitializerWrapper}
+   * @throws IOException propagated from obtaining the application default credentials
+   */
+  public static Pubsub getClient(final HttpTransport httpTransport,
+                                 final JsonFactory jsonFactory)
+      throws IOException {
+    Preconditions.checkNotNull(httpTransport);
+    Preconditions.checkNotNull(jsonFactory);
+    GoogleCredential credential =
+        GoogleCredential.getApplicationDefault(httpTransport, jsonFactory);
+    if (credential.createScopedRequired()) {
+      credential = credential.createScoped(PubsubScopes.all());
+    }
+    if (credential.getClientAuthentication() != null) {
+      System.out.println("\n***Warning! You are not using service account credentials to "
+          + "authenticate.\nYou need to use service account credentials for this example,"
+          + "\nsince user-level credentials do not have enough pubsub quota,\nand so you will run "
+          + "out of PubSub quota very quickly.\nSee "
+          + "https://developers.google.com/identity/protocols/application-default-credentials.");
+      System.exit(1);
+    }
+    HttpRequestInitializer initializer =
+        new RetryHttpInitializerWrapper(credential);
+    return new Pubsub.Builder(httpTransport, jsonFactory, initializer)
+        .setApplicationName(APP_NAME)
+        .build();
+  }
+
+  /**
+   * Builds a new Pubsub client with default HttpTransport and
+   * JsonFactory and returns it.
+   *
+   * @throws IOException propagated from {@link #getClient(HttpTransport, JsonFactory)}
+   */
+  public static Pubsub getClient() throws IOException {
+    return getClient(Utils.getDefaultTransport(),
+                     Utils.getDefaultJsonFactory());
+  }
+
+
+  /**
+   * Returns the fully qualified topic name for Pub/Sub, in the form
+   * {@code projects/<project>/topics/<topic>}.
+   */
+  public static String getFullyQualifiedTopicName(
+      final String project, final String topic) {
+    return String.format("projects/%s/topics/%s", project, topic);
+  }
+
+  /**
+   * Create a topic if it doesn't exist.
+   *
+   * <p>The topic is looked up first; it is created only when the lookup fails with
+   * HTTP 404 (not found). Any other error response from the lookup is rethrown to the
+   * caller rather than being silently ignored.
+   *
+   * @param client Pub/Sub client used for the lookup and the creation
+   * @param fullTopicName fully qualified topic name, see
+   *     {@link #getFullyQualifiedTopicName(String, String)}
+   * @throws IOException if the lookup fails for a reason other than "not found",
+   *     or if creating the topic fails
+   */
+  public static void createTopic(Pubsub client, String fullTopicName)
+      throws IOException {
+    try {
+      client.projects().topics().get(fullTopicName).execute();
+    } catch (GoogleJsonResponseException e) {
+      if (e.getStatusCode() != HttpStatusCodes.STATUS_CODE_NOT_FOUND) {
+        // Only "not found" means the topic is missing; any other status is a
+        // genuine failure and must not be swallowed.
+        throw e;
+      }
+      Topic topic = client.projects().topics()
+          .create(fullTopicName, new Topic())
+          .execute();
+      System.out.printf("Topic %s was created.\n", topic.getName());
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/RetryHttpInitializerWrapper.java
----------------------------------------------------------------------
diff --git a/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/RetryHttpInitializerWrapper.java b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/RetryHttpInitializerWrapper.java
new file mode 100644
index 0000000..1437534
--- /dev/null
+++ b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/RetryHttpInitializerWrapper.java
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete.game.injector;
+
+import com.google.api.client.auth.oauth2.Credential;
+import com.google.api.client.http.HttpBackOffIOExceptionHandler;
+import com.google.api.client.http.HttpBackOffUnsuccessfulResponseHandler;
+import com.google.api.client.http.HttpRequest;
+import com.google.api.client.http.HttpRequestInitializer;
+import com.google.api.client.http.HttpResponse;
+import com.google.api.client.http.HttpUnsuccessfulResponseHandler;
+import com.google.api.client.util.ExponentialBackOff;
+import com.google.api.client.util.Sleeper;
+import com.google.common.base.Preconditions;
+
+import java.io.IOException;
+import java.util.logging.Logger;
+
+/**
+ * An {@link HttpRequestInitializer} that retries failed RPCs with exponential
+ * back-off while keeping the wrapped Google credential in charge of
+ * authentication and token refresh.
+ */
+public class RetryHttpInitializerWrapper implements HttpRequestInitializer {
+
+  /** Logger for retry notifications. */
+  private static final Logger LOG =
+      Logger.getLogger(RetryHttpInitializerWrapper.class.getName());
+
+  /** One minute, in milliseconds. */
+  private static final int ONE_MINUTE_MILLIS = 60000;
+
+  /**
+   * The credential being wrapped. It is installed as the request interceptor
+   * and is consulted first when a response is unsuccessful.
+   */
+  private final Credential wrappedCredential;
+
+  /** Sleeper used between retries; tests can substitute a mock. */
+  private final Sleeper sleeper;
+
+  /**
+   * Creates a wrapper that sleeps with {@link Sleeper#DEFAULT}.
+   *
+   * @param wrappedCredential Credential which will be wrapped and
+   *     used for providing auth header.
+   */
+  public RetryHttpInitializerWrapper(final Credential wrappedCredential) {
+    this(wrappedCredential, Sleeper.DEFAULT);
+  }
+
+  /**
+   * Package-private constructor, intended for tests.
+   *
+   * @param wrappedCredential Credential which will be wrapped and
+   *     used for providing auth header; must not be null.
+   * @param sleeper Sleeper for easy testing.
+   */
+  RetryHttpInitializerWrapper(
+      final Credential wrappedCredential, final Sleeper sleeper) {
+    this.wrappedCredential = Preconditions.checkNotNull(wrappedCredential);
+    this.sleeper = sleeper;
+  }
+
+  /**
+   * Initializes the given request: sets a two minute read timeout, installs the
+   * credential as interceptor, and installs back-off handlers for unsuccessful
+   * responses and for I/O exceptions.
+   */
+  @Override
+  public final void initialize(final HttpRequest request) {
+    request.setReadTimeout(2 * ONE_MINUTE_MILLIS); // 2 minutes read timeout
+
+    final HttpUnsuccessfulResponseHandler backoffHandler =
+        new HttpBackOffUnsuccessfulResponseHandler(new ExponentialBackOff())
+            .setSleeper(sleeper);
+
+    request.setInterceptor(wrappedCredential);
+    request.setUnsuccessfulResponseHandler(
+        new HttpUnsuccessfulResponseHandler() {
+          @Override
+          public boolean handleResponse(
+              final HttpRequest request,
+              final HttpResponse response,
+              final boolean supportsRetry) throws IOException {
+            // The credential gets the first chance: when it can handle the
+            // response, the failure was specific to authentication and no
+            // back-off is wanted.
+            if (wrappedCredential.handleResponse(request, response, supportsRetry)) {
+              return true;
+            }
+            // Otherwise the decision is left to the back-off handler.
+            if (!backoffHandler.handleResponse(request, response, supportsRetry)) {
+              return false;
+            }
+            LOG.info("Retrying "
+                + request.getUrl().toString());
+            return true;
+          }
+        });
+    request.setIOExceptionHandler(
+        new HttpBackOffIOExceptionHandler(new ExponentialBackOff())
+            .setSleeper(sleeper));
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/utils/WriteToBigQuery.java
----------------------------------------------------------------------
diff --git a/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/utils/WriteToBigQuery.java b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/utils/WriteToBigQuery.java
new file mode 100644
index 0000000..2cf719a
--- /dev/null
+++ b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/utils/WriteToBigQuery.java
@@ -0,0 +1,134 @@
+ /*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete.game.utils;
+
+import com.google.api.services.bigquery.model.TableFieldSchema;
+import com.google.api.services.bigquery.model.TableReference;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.TableSchema;
+import com.google.cloud.dataflow.examples.complete.game.UserScore;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.CreateDisposition;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.WriteDisposition;
+import com.google.cloud.dataflow.sdk.options.GcpOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.PDone;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Generate, format, and write BigQuery table row information. The caller supplies,
+ * per output field, the BigQuery type name and a function that computes the field's
+ * value from the element being processed.
+ */
+public class WriteToBigQuery<T>
+    extends PTransform<PCollection<T>, PDone> {
+
+  /** Id of the table to write to. */
+  protected String tableName;
+  /** Output field definitions, keyed by field name. */
+  protected Map<String, FieldInfo<T>> fieldInfo;
+
+  public WriteToBigQuery() {
+  }
+
+  public WriteToBigQuery(String tableName,
+      Map<String, FieldInfo<T>> fieldInfo) {
+    this.tableName = tableName;
+    this.fieldInfo = fieldInfo;
+  }
+
+  /** Holds the definition of one output table field: its type and its value function. */
+  public static class FieldInfo<T> implements Serializable {
+    // The BigQuery 'type' of the field
+    private String fieldType;
+    // A lambda function to generate the field value
+    private SerializableFunction<DoFn<T, TableRow>.ProcessContext, Object> fieldFn;
+
+    public FieldInfo(String fieldType,
+        SerializableFunction<DoFn<T, TableRow>.ProcessContext, Object> fieldFn) {
+      this.fieldType = fieldType;
+      this.fieldFn = fieldFn;
+    }
+
+    String getFieldType() {
+      return this.fieldType;
+    }
+
+    SerializableFunction<DoFn<T, TableRow>.ProcessContext, Object> getFieldFn() {
+      return this.fieldFn;
+    }
+  }
+
+  /** Builds one BigQuery TableRow per element by evaluating every field's function. */
+  protected class BuildRowFn extends DoFn<T, TableRow> {
+
+    @Override
+    public void processElement(ProcessContext c) {
+      TableRow row = new TableRow();
+      for (Map.Entry<String, FieldInfo<T>> field : fieldInfo.entrySet()) {
+        row.set(field.getKey(), field.getValue().getFieldFn().apply(c));
+      }
+      c.output(row);
+    }
+  }
+
+  /** Build the output table schema: one field per {@code fieldInfo} entry. */
+  protected TableSchema getSchema() {
+    List<TableFieldSchema> fields = new ArrayList<>();
+    for (Map.Entry<String, FieldInfo<T>> field : fieldInfo.entrySet()) {
+      String name = field.getKey();
+      String bqType = field.getValue().getFieldType();
+      fields.add(new TableFieldSchema().setName(name).setType(bqType));
+    }
+    return new TableSchema().setFields(fields);
+  }
+
+  /**
+   * Converts the elements to rows and appends them to the output table,
+   * creating the table if needed.
+   */
+  @Override
+  public PDone apply(PCollection<T> teamAndScore) {
+    PCollection<TableRow> rows =
+        teamAndScore.apply(ParDo.named("ConvertToRow").of(new BuildRowFn()));
+    TableReference table = getTable(teamAndScore.getPipeline(), tableName);
+    return rows.apply(BigQueryIO.Write
+        .to(table)
+        .withSchema(getSchema())
+        .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
+        .withWriteDisposition(WriteDisposition.WRITE_APPEND));
+  }
+
+  /**
+   * Utility to construct an output table reference from the pipeline's options:
+   * the dataset comes from {@code UserScore.Options}, the project from {@code GcpOptions}.
+   */
+  static TableReference getTable(Pipeline pipeline, String tableName) {
+    PipelineOptions pipelineOptions = pipeline.getOptions();
+    TableReference reference = new TableReference();
+    reference.setDatasetId(pipelineOptions.as(UserScore.Options.class).getDataset());
+    reference.setProjectId(pipelineOptions.as(GcpOptions.class).getProject());
+    reference.setTableId(tableName);
+    return reference;
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/utils/WriteWindowedToBigQuery.java
----------------------------------------------------------------------
diff --git a/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/utils/WriteWindowedToBigQuery.java b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/utils/WriteWindowedToBigQuery.java
new file mode 100644
index 0000000..8433021
--- /dev/null
+++ b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/utils/WriteWindowedToBigQuery.java
@@ -0,0 +1,76 @@
+ /*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete.game.utils;
+
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.CreateDisposition;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.WriteDisposition;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.DoFn.RequiresWindowAccess;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.PDone;
+
+import java.util.Map;
+
+/**
+ * Generate, format, and write BigQuery table row information. Extends {@link WriteToBigQuery}
+ * with a row-building DoFn that implements {@link RequiresWindowAccess}, so this subclass may be
+ * used for writes that require access to the context's window information.
+ */
+public class WriteWindowedToBigQuery<T>
+ extends WriteToBigQuery<T> {
+
+ public WriteWindowedToBigQuery(String tableName,
+ Map<String, FieldInfo<T>> fieldInfo) {
+ super(tableName, fieldInfo);
+ }
+
+ /** Convert each input element into a BigQuery TableRow, one column per fieldInfo entry. */
+ protected class BuildRowFn extends DoFn<T, TableRow>
+ implements RequiresWindowAccess {
+
+ @Override
+ public void processElement(ProcessContext c) {
+
+ TableRow row = new TableRow();
+ for (Map.Entry<String, FieldInfo<T>> entry : fieldInfo.entrySet()) {
+ String key = entry.getKey();
+ FieldInfo<T> fcnInfo = entry.getValue();
+ SerializableFunction<DoFn<T, TableRow>.ProcessContext, Object> fcn =
+ fcnInfo.getFieldFn();
+ row.set(key, fcn.apply(c));
+ }
+ c.output(row);
+ }
+ }
+
+ @Override
+ public PDone apply(PCollection<T> teamAndScore) {
+ return teamAndScore
+ .apply(ParDo.named("ConvertToRow").of(new BuildRowFn()))
+ .apply(BigQueryIO.Write
+ .to(getTable(teamAndScore.getPipeline(),
+ tableName))
+ .withSchema(getSchema())
+ .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
+ .withWriteDisposition(WriteDisposition.WRITE_APPEND));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/examples/java8/src/test/java/com/google/cloud/dataflow/examples/MinimalWordCountJava8Test.java
----------------------------------------------------------------------
diff --git a/examples/java8/src/test/java/com/google/cloud/dataflow/examples/MinimalWordCountJava8Test.java b/examples/java8/src/test/java/com/google/cloud/dataflow/examples/MinimalWordCountJava8Test.java
new file mode 100644
index 0000000..fcae41c
--- /dev/null
+++ b/examples/java8/src/test/java/com/google/cloud/dataflow/examples/MinimalWordCountJava8Test.java
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples;
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.GcsOptions;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.transforms.Count;
+import com.google.cloud.dataflow.sdk.transforms.Filter;
+import com.google.cloud.dataflow.sdk.transforms.FlatMapElements;
+import com.google.cloud.dataflow.sdk.transforms.MapElements;
+import com.google.cloud.dataflow.sdk.util.GcsUtil;
+import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
+import com.google.common.collect.ImmutableList;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.mockito.Mockito;
+import org.mockito.invocation.InvocationOnMock;
+import org.mockito.stubbing.Answer;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.nio.channels.FileChannel;
+import java.nio.channels.SeekableByteChannel;
+import java.nio.file.Files;
+import java.nio.file.StandardOpenOption;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * To keep {@link MinimalWordCountJava8} simple, it is not factored or testable. This test
+ * file should be maintained with a copy of its code for a basic smoke test.
+ */
+@RunWith(JUnit4.class)
+public class MinimalWordCountJava8Test implements Serializable {
+
+ /**
+ * A basic smoke test that ensures there is no crash at pipeline construction time.
+ */
+ @Test
+ public void testMinimalWordCountJava8() throws Exception {
+ Pipeline p = TestPipeline.create();
+ p.getOptions().as(GcsOptions.class).setGcsUtil(buildMockGcsUtil());
+
+ p.apply(TextIO.Read.from("gs://dataflow-samples/shakespeare/*"))
+ .apply(FlatMapElements.via((String word) -> Arrays.asList(word.split("[^a-zA-Z']+")))
+ .withOutputType(new TypeDescriptor<String>() {}))
+ .apply(Filter.byPredicate((String word) -> !word.isEmpty()))
+ .apply(Count.<String>perElement())
+ .apply(MapElements
+ .via((KV<String, Long> wordCount) -> wordCount.getKey() + ": " + wordCount.getValue())
+ .withOutputType(new TypeDescriptor<String>() {}))
+ .apply(TextIO.Write.to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX"));
+ }
+
+ private GcsUtil buildMockGcsUtil() throws IOException {
+ GcsUtil mockGcsUtil = Mockito.mock(GcsUtil.class);
+
+ // Any request to open gets a new bogus channel
+ Mockito
+ .when(mockGcsUtil.open(Mockito.any(GcsPath.class)))
+ .then(new Answer<SeekableByteChannel>() {
+ @Override
+ public SeekableByteChannel answer(InvocationOnMock invocation) throws Throwable {
+ return FileChannel.open(
+ Files.createTempFile("channel-", ".tmp"),
+ StandardOpenOption.CREATE, StandardOpenOption.DELETE_ON_CLOSE);
+ }
+ });
+
+ // Any request for expansion returns a list containing the original GcsPath
+ // This is required to pass validation that occurs in TextIO during apply()
+ Mockito
+ .when(mockGcsUtil.expand(Mockito.any(GcsPath.class)))
+ .then(new Answer<List<GcsPath>>() {
+ @Override
+ public List<GcsPath> answer(InvocationOnMock invocation) throws Throwable {
+ return ImmutableList.of((GcsPath) invocation.getArguments()[0]);
+ }
+ });
+
+ return mockGcsUtil;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/examples/java8/src/test/java/com/google/cloud/dataflow/examples/complete/game/GameStatsTest.java
----------------------------------------------------------------------
diff --git a/examples/java8/src/test/java/com/google/cloud/dataflow/examples/complete/game/GameStatsTest.java b/examples/java8/src/test/java/com/google/cloud/dataflow/examples/complete/game/GameStatsTest.java
new file mode 100644
index 0000000..f77d146
--- /dev/null
+++ b/examples/java8/src/test/java/com/google/cloud/dataflow/examples/complete/game/GameStatsTest.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete.game;
+
+import com.google.cloud.dataflow.examples.complete.game.GameStats.CalculateSpammyUsers;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.transforms.Create;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Tests of GameStats.
+ * Because the pipeline was designed for easy readability and explanations, it lacks good
+ * modularity for testing. See our testing documentation for better ideas:
+ * https://cloud.google.com/dataflow/pipelines/testing-your-pipeline.
+ */
+@RunWith(JUnit4.class)
+public class GameStatsTest implements Serializable {
+
+ // User scores
+ static final List<KV<String, Integer>> USER_SCORES = Arrays.asList(
+ KV.of("Robot-2", 66), KV.of("Robot-1", 116), KV.of("user7_AndroidGreenKookaburra", 23),
+ KV.of("user7_AndroidGreenKookaburra", 1),
+ KV.of("user19_BisqueBilby", 14), KV.of("user13_ApricotQuokka", 15),
+ KV.of("user18_BananaEmu", 25), KV.of("user6_AmberEchidna", 8),
+ KV.of("user2_AmberQuokka", 6), KV.of("user0_MagentaKangaroo", 4),
+ KV.of("user0_MagentaKangaroo", 3), KV.of("user2_AmberCockatoo", 13),
+ KV.of("user7_AlmondWallaby", 15), KV.of("user6_AmberNumbat", 11),
+ KV.of("user6_AmberQuokka", 4));
+
+ // The expected list of 'spammers'.
+ static final List<KV<String, Integer>> SPAMMERS = Arrays.asList(
+ KV.of("Robot-2", 66), KV.of("Robot-1", 116));
+
+ /** Test the calculation of 'spammy users'. */
+ @Test
+ @Category(RunnableOnService.class)
+ public void testCalculateSpammyUsers() throws Exception {
+ Pipeline p = TestPipeline.create();
+
+ PCollection<KV<String, Integer>> input = p.apply(Create.of(USER_SCORES));
+ PCollection<KV<String, Integer>> output = input.apply(new CalculateSpammyUsers());
+
+ // Check the set of spammers.
+ DataflowAssert.that(output).containsInAnyOrder(SPAMMERS);
+
+ p.run();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/examples/java8/src/test/java/com/google/cloud/dataflow/examples/complete/game/HourlyTeamScoreTest.java
----------------------------------------------------------------------
diff --git a/examples/java8/src/test/java/com/google/cloud/dataflow/examples/complete/game/HourlyTeamScoreTest.java b/examples/java8/src/test/java/com/google/cloud/dataflow/examples/complete/game/HourlyTeamScoreTest.java
new file mode 100644
index 0000000..f77a5d4
--- /dev/null
+++ b/examples/java8/src/test/java/com/google/cloud/dataflow/examples/complete/game/HourlyTeamScoreTest.java
@@ -0,0 +1,111 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete.game;
+
+import com.google.cloud.dataflow.examples.complete.game.UserScore.GameActionInfo;
+import com.google.cloud.dataflow.examples.complete.game.UserScore.ParseEventFn;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.transforms.Create;
+import com.google.cloud.dataflow.sdk.transforms.Filter;
+import com.google.cloud.dataflow.sdk.transforms.MapElements;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
+
+import org.joda.time.Instant;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Tests of HourlyTeamScore.
+ * Because the pipeline was designed for easy readability and explanations, it lacks good
+ * modularity for testing. See our testing documentation for better ideas:
+ * https://cloud.google.com/dataflow/pipelines/testing-your-pipeline.
+ */
+@RunWith(JUnit4.class)
+public class HourlyTeamScoreTest implements Serializable {
+
+ static final String[] GAME_EVENTS_ARRAY = new String[] {
+ "user0_MagentaKangaroo,MagentaKangaroo,3,1447955630000,2015-11-19 09:53:53.444",
+ "user13_ApricotQuokka,ApricotQuokka,15,1447955630000,2015-11-19 09:53:53.444",
+ "user6_AmberNumbat,AmberNumbat,11,1447955630000,2015-11-19 09:53:53.444",
+ "user7_AlmondWallaby,AlmondWallaby,15,1447955630000,2015-11-19 09:53:53.444",
+ "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,12,1447955630000,2015-11-19 09:53:53.444",
+ "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,11,1447955630000,2015-11-19 09:53:53.444",
+ "user19_BisqueBilby,BisqueBilby,6,1447955630000,2015-11-19 09:53:53.444",
+ "user19_BisqueBilby,BisqueBilby,8,1447955630000,2015-11-19 09:53:53.444",
+ // time gap...
+ "user0_AndroidGreenEchidna,AndroidGreenEchidna,0,1447965690000,2015-11-19 12:41:31.053",
+ "user0_MagentaKangaroo,MagentaKangaroo,4,1447965690000,2015-11-19 12:41:31.053",
+ "user2_AmberCockatoo,AmberCockatoo,13,1447965690000,2015-11-19 12:41:31.053",
+ "user18_BananaEmu,BananaEmu,7,1447965690000,2015-11-19 12:41:31.053",
+ "user3_BananaEmu,BananaEmu,17,1447965690000,2015-11-19 12:41:31.053",
+ "user18_BananaEmu,BananaEmu,1,1447965690000,2015-11-19 12:41:31.053",
+ "user18_ApricotCaneToad,ApricotCaneToad,14,1447965690000,2015-11-19 12:41:31.053"
+ };
+
+
+ static final List<String> GAME_EVENTS = Arrays.asList(GAME_EVENTS_ARRAY);
+
+
+ // Events expected to pass the start-time filter, as user/score pairs.
+ static final List<KV<String, Integer>> FILTERED_EVENTS = Arrays.asList(
+ KV.of("user0_AndroidGreenEchidna", 0), KV.of("user0_MagentaKangaroo", 4),
+ KV.of("user2_AmberCockatoo", 13),
+ KV.of("user18_BananaEmu", 7), KV.of("user3_BananaEmu", 17),
+ KV.of("user18_BananaEmu", 1), KV.of("user18_ApricotCaneToad", 14)
+ );
+
+
+ /** Test the filtering. */
+ @Test
+ @Category(RunnableOnService.class)
+ public void testUserScoresFilter() throws Exception {
+ Pipeline p = TestPipeline.create();
+
+ final Instant startMinTimestamp = new Instant(1447965680000L);
+
+ PCollection<String> input = p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of()));
+
+ PCollection<KV<String, Integer>> output = input
+ .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()))
+
+ .apply("FilterStartTime", Filter.byPredicate(
+ (GameActionInfo gInfo)
+ -> gInfo.getTimestamp() > startMinTimestamp.getMillis()))
+ // run a map to access the fields in the result.
+ .apply(MapElements
+ .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore()))
+ .withOutputType(new TypeDescriptor<KV<String, Integer>>() {}));
+
+ DataflowAssert.that(output).containsInAnyOrder(FILTERED_EVENTS);
+
+ p.run();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/examples/java8/src/test/java/com/google/cloud/dataflow/examples/complete/game/UserScoreTest.java
----------------------------------------------------------------------
diff --git a/examples/java8/src/test/java/com/google/cloud/dataflow/examples/complete/game/UserScoreTest.java b/examples/java8/src/test/java/com/google/cloud/dataflow/examples/complete/game/UserScoreTest.java
new file mode 100644
index 0000000..641e2c3
--- /dev/null
+++ b/examples/java8/src/test/java/com/google/cloud/dataflow/examples/complete/game/UserScoreTest.java
@@ -0,0 +1,154 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete.game;
+
+import com.google.cloud.dataflow.examples.complete.game.UserScore.ExtractAndSumScore;
+import com.google.cloud.dataflow.examples.complete.game.UserScore.GameActionInfo;
+import com.google.cloud.dataflow.examples.complete.game.UserScore.ParseEventFn;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.transforms.Create;
+import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
+import com.google.cloud.dataflow.sdk.transforms.MapElements;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Tests of UserScore.
+ */
+@RunWith(JUnit4.class)
+public class UserScoreTest implements Serializable {
+
+ static final String[] GAME_EVENTS_ARRAY = new String[] {
+ "user0_MagentaKangaroo,MagentaKangaroo,3,1447955630000,2015-11-19 09:53:53.444",
+ "user13_ApricotQuokka,ApricotQuokka,15,1447955630000,2015-11-19 09:53:53.444",
+ "user6_AmberNumbat,AmberNumbat,11,1447955630000,2015-11-19 09:53:53.444",
+ "user7_AlmondWallaby,AlmondWallaby,15,1447955630000,2015-11-19 09:53:53.444",
+ "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,12,1447955630000,2015-11-19 09:53:53.444",
+ "user6_AliceBlueDingo,AliceBlueDingo,4,xxxxxxx,2015-11-19 09:53:53.444",
+ "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,11,1447955630000,2015-11-19 09:53:53.444",
+ "THIS IS A PARSE ERROR,2015-11-19 09:53:53.444",
+ "user19_BisqueBilby,BisqueBilby,6,1447955630000,2015-11-19 09:53:53.444",
+ "user19_BisqueBilby,BisqueBilby,8,1447955630000,2015-11-19 09:53:53.444"
+ };
+
+ static final String[] GAME_EVENTS_ARRAY2 = new String[] {
+ "user6_AliceBlueDingo,AliceBlueDingo,4,xxxxxxx,2015-11-19 09:53:53.444",
+ "THIS IS A PARSE ERROR,2015-11-19 09:53:53.444",
+ "user13_BisqueBilby,BisqueBilby,xxx,1447955630000,2015-11-19 09:53:53.444"
+ };
+
+ static final List<String> GAME_EVENTS = Arrays.asList(GAME_EVENTS_ARRAY);
+ static final List<String> GAME_EVENTS2 = Arrays.asList(GAME_EVENTS_ARRAY2);
+
+ static final List<KV<String, Integer>> USER_SUMS = Arrays.asList(
+ KV.of("user0_MagentaKangaroo", 3), KV.of("user13_ApricotQuokka", 15),
+ KV.of("user6_AmberNumbat", 11), KV.of("user7_AlmondWallaby", 15),
+ KV.of("user7_AndroidGreenKookaburra", 23),
+ KV.of("user19_BisqueBilby", 14));
+
+ static final List<KV<String, Integer>> TEAM_SUMS = Arrays.asList(
+ KV.of("MagentaKangaroo", 3), KV.of("ApricotQuokka", 15),
+ KV.of("AmberNumbat", 11), KV.of("AlmondWallaby", 15),
+ KV.of("AndroidGreenKookaburra", 23),
+ KV.of("BisqueBilby", 14));
+
+ /** Test the ParseEventFn DoFn: the 10 input lines include 2 malformed ones, so 8 are parsed. */
+ @Test
+ public void testParseEventFn() {
+ DoFnTester<String, GameActionInfo> parseEventFn =
+ DoFnTester.of(new ParseEventFn());
+
+ List<GameActionInfo> results = parseEventFn.processBatch(GAME_EVENTS_ARRAY);
+ Assert.assertEquals(8, results.size());
+ Assert.assertEquals("user0_MagentaKangaroo", results.get(0).getUser());
+ Assert.assertEquals("MagentaKangaroo", results.get(0).getTeam());
+ Assert.assertEquals(Integer.valueOf(3), results.get(0).getScore());
+ }
+
+ /** Tests ExtractAndSumScore("user"). */
+ @Test
+ @Category(RunnableOnService.class)
+ public void testUserScoreSums() throws Exception {
+ Pipeline p = TestPipeline.create();
+
+ PCollection<String> input = p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of()));
+
+ PCollection<KV<String, Integer>> output = input
+ .apply(ParDo.of(new ParseEventFn()))
+ // Extract and sum username/score pairs from the event data.
+ .apply("ExtractUserScore", new ExtractAndSumScore("user"));
+
+ // Check the user score sums.
+ DataflowAssert.that(output).containsInAnyOrder(USER_SUMS);
+
+ p.run();
+ }
+
+ /** Tests ExtractAndSumScore("team"). */
+ @Test
+ @Category(RunnableOnService.class)
+ public void testTeamScoreSums() throws Exception {
+ Pipeline p = TestPipeline.create();
+
+ PCollection<String> input = p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of()));
+
+ PCollection<KV<String, Integer>> output = input
+ .apply(ParDo.of(new ParseEventFn()))
+ // Extract and sum teamname/score pairs from the event data.
+ .apply("ExtractTeamScore", new ExtractAndSumScore("team"));
+
+ // Check the team score sums.
+ DataflowAssert.that(output).containsInAnyOrder(TEAM_SUMS);
+
+ p.run();
+ }
+
+ /** Test that bad input data is dropped appropriately. */
+ @Test
+ @Category(RunnableOnService.class)
+ public void testUserScoresBadInput() throws Exception {
+ Pipeline p = TestPipeline.create();
+
+ PCollection<String> input = p.apply(Create.of(GAME_EVENTS2).withCoder(StringUtf8Coder.of()));
+
+ PCollection<KV<String, Integer>> extract = input
+ .apply(ParDo.of(new ParseEventFn()))
+ .apply(
+ MapElements.via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore()))
+ .withOutputType(new TypeDescriptor<KV<String, Integer>>() {}));
+
+ DataflowAssert.that(extract).empty();
+
+ p.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/java8examples/pom.xml
----------------------------------------------------------------------
diff --git a/java8examples/pom.xml b/java8examples/pom.xml
deleted file mode 100644
index eb3ef01..0000000
--- a/java8examples/pom.xml
+++ /dev/null
@@ -1,279 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.beam</groupId>
- <artifactId>parent</artifactId>
- <version>0.1.0-incubating-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <artifactId>java8examples-all</artifactId>
- <name>Apache Beam :: Examples :: Java 8 All</name>
- <description>Apache Beam Java SDK provides a simple, Java-based
- interface for processing virtually any size data.
- This artifact includes examples of the SDK from a Java 8
- user.</description>
-
- <packaging>jar</packaging>
-
- <profiles>
- <profile>
- <id>DataflowPipelineTests</id>
- <properties>
- <runIntegrationTestOnService>true</runIntegrationTestOnService>
- <testGroups>com.google.cloud.dataflow.sdk.testing.RunnableOnService</testGroups>
- <testParallelValue>both</testParallelValue>
- </properties>
- </profile>
- </profiles>
-
- <build>
- <plugins>
- <plugin>
- <artifactId>maven-compiler-plugin</artifactId>
- <configuration>
- <source>1.8</source>
- <target>1.8</target>
- <testSource>1.8</testSource>
- <testTarget>1.8</testTarget>
- </configuration>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- <executions>
- <execution>
- <goals><goal>analyze-only</goal></goals>
- <configuration>
- <failOnWarning>true</failOnWarning>
- </configuration>
- </execution>
- </executions>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-checkstyle-plugin</artifactId>
- <version>2.12</version>
- <dependencies>
- <dependency>
- <groupId>com.puppycrawl.tools</groupId>
- <artifactId>checkstyle</artifactId>
- <version>6.6</version>
- </dependency>
- </dependencies>
- <configuration>
- <configLocation>../checkstyle.xml</configLocation>
- <consoleOutput>true</consoleOutput>
- <failOnViolation>true</failOnViolation>
- <includeTestSourceDirectory>true</includeTestSourceDirectory>
- <includeResources>false</includeResources>
- </configuration>
- <executions>
- <execution>
- <goals>
- <goal>check</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
-
- <!-- Source plugin for generating source and test-source JARs. -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-source-plugin</artifactId>
- <version>2.4</version>
- <executions>
- <execution>
- <id>attach-sources</id>
- <phase>compile</phase>
- <goals>
- <goal>jar</goal>
- </goals>
- </execution>
- <execution>
- <id>attach-test-sources</id>
- <phase>test-compile</phase>
- <goals>
- <goal>test-jar</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- <executions>
- <execution>
- <id>default-jar</id>
- <goals>
- <goal>jar</goal>
- </goals>
- </execution>
- <execution>
- <id>default-test-jar</id>
- <goals>
- <goal>test-jar</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
-
- <!-- Coverage analysis for unit tests. -->
- <plugin>
- <groupId>org.jacoco</groupId>
- <artifactId>jacoco-maven-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
- <dependencies>
- <dependency>
- <groupId>org.apache.beam</groupId>
- <artifactId>java-sdk-all</artifactId>
- <version>${project.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.beam</groupId>
- <artifactId>java-examples-all</artifactId>
- <version>${project.version}</version>
- </dependency>
-
- <dependency>
- <groupId>com.google.guava</groupId>
- <artifactId>guava</artifactId>
- <version>${guava.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- <version>${slf4j.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.avro</groupId>
- <artifactId>avro</artifactId>
- <version>${avro.version}</version>
- </dependency>
-
- <dependency>
- <groupId>joda-time</groupId>
- <artifactId>joda-time</artifactId>
- <version>${joda.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.hamcrest</groupId>
- <artifactId>hamcrest-all</artifactId>
- <version>${hamcrest.version}</version>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.mockito</groupId>
- <artifactId>mockito-all</artifactId>
- <version>1.10.19</version>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>${junit.version}</version>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>com.google.apis</groupId>
- <artifactId>google-api-services-bigquery</artifactId>
- <version>${bigquery.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.http-client</groupId>
- <artifactId>google-http-client</artifactId>
- <version>${google-clients.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.oauth-client</groupId>
- <artifactId>google-oauth-client</artifactId>
- <version>${google-clients.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.apis</groupId>
- <artifactId>google-api-services-pubsub</artifactId>
- <version>${pubsub.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.api-client</groupId>
- <artifactId>google-api-client</artifactId>
- <version>${google-clients.version}</version>
- <exclusions>
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- </dependencies>
-</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/java8examples/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCountJava8.java
----------------------------------------------------------------------
diff --git a/java8examples/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCountJava8.java b/java8examples/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCountJava8.java
deleted file mode 100644
index c115ea0..0000000
--- a/java8examples/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCountJava8.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.runners.BlockingDataflowPipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.Count;
-import com.google.cloud.dataflow.sdk.transforms.Filter;
-import com.google.cloud.dataflow.sdk.transforms.FlatMapElements;
-import com.google.cloud.dataflow.sdk.transforms.MapElements;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-import java.util.Arrays;
-
-/**
- * An example that counts words in Shakespeare, using Java 8 language features.
- *
- * <p>See {@link MinimalWordCount} for a comprehensive explanation.
- */
-public class MinimalWordCountJava8 {
-
- public static void main(String[] args) {
- DataflowPipelineOptions options = PipelineOptionsFactory.create()
- .as(DataflowPipelineOptions.class);
-
- options.setRunner(BlockingDataflowPipelineRunner.class);
-
- // CHANGE 1 of 3: Your project ID is required in order to run your pipeline on the Google Cloud.
- options.setProject("SET_YOUR_PROJECT_ID_HERE");
-
- // CHANGE 2 of 3: Your Google Cloud Storage path is required for staging local files.
- options.setStagingLocation("gs://SET_YOUR_BUCKET_NAME_HERE/AND_STAGING_DIRECTORY");
-
- Pipeline p = Pipeline.create(options);
-
- p.apply(TextIO.Read.from("gs://dataflow-samples/shakespeare/*"))
- .apply(FlatMapElements.via((String word) -> Arrays.asList(word.split("[^a-zA-Z']+")))
- .withOutputType(new TypeDescriptor<String>() {}))
- .apply(Filter.byPredicate((String word) -> !word.isEmpty()))
- .apply(Count.<String>perElement())
- .apply(MapElements
- .via((KV<String, Long> wordCount) -> wordCount.getKey() + ": " + wordCount.getValue())
- .withOutputType(new TypeDescriptor<String>() {}))
-
- // CHANGE 3 of 3: A Google Cloud Storage path is required for writing the output results.
- .apply(TextIO.Write.to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX"));
-
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/GameStats.java
----------------------------------------------------------------------
diff --git a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/GameStats.java b/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/GameStats.java
deleted file mode 100644
index 7c67d10..0000000
--- a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/GameStats.java
+++ /dev/null
@@ -1,339 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete.game;
-
-import com.google.cloud.dataflow.examples.common.DataflowExampleUtils;
-import com.google.cloud.dataflow.examples.complete.game.utils.WriteWindowedToBigQuery;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.PipelineResult;
-import com.google.cloud.dataflow.sdk.io.PubsubIO;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.Combine;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.DoFn.RequiresWindowAccess;
-import com.google.cloud.dataflow.sdk.transforms.MapElements;
-import com.google.cloud.dataflow.sdk.transforms.Mean;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.Sum;
-import com.google.cloud.dataflow.sdk.transforms.Values;
-import com.google.cloud.dataflow.sdk.transforms.View;
-import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.IntervalWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.OutputTimeFns;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Sessions;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-import org.joda.time.DateTimeZone;
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-import org.joda.time.format.DateTimeFormat;
-import org.joda.time.format.DateTimeFormatter;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.HashMap;
-import java.util.Map;
-import java.util.TimeZone;
-
-/**
- * This class is the fourth in a series of four pipelines that tell a story in a 'gaming'
- * domain, following {@link UserScore}, {@link HourlyTeamScore}, and {@link LeaderBoard}.
- * New concepts: session windows and finding session duration; use of both
- * singleton and non-singleton side inputs.
- *
- * <p> This pipeline builds on the {@link LeaderBoard} functionality, and adds some "business
- * intelligence" analysis: abuse detection and usage patterns. The pipeline derives the Mean user
- * score sum for a window, and uses that information to identify likely spammers/robots. (The robots
- * have a higher click rate than the human users). The 'robot' users are then filtered out when
- * calculating the team scores.
- *
- * <p> Additionally, user sessions are tracked: that is, we find bursts of user activity using
- * session windows. Then, the mean session duration information is recorded in the context of
- * subsequent fixed windowing. (This could be used to tell us what games are giving us greater
- * user retention).
- *
- * <p> Run {@code com.google.cloud.dataflow.examples.complete.game.injector.Injector} to generate
- * pubsub data for this pipeline. The {@code Injector} documentation provides more detail.
- *
- * <p> To execute this pipeline using the Dataflow service, specify the pipeline configuration
- * like this:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowPipelineRunner
- * --dataset=YOUR-DATASET
- * --topic=projects/YOUR-PROJECT/topics/YOUR-TOPIC
- * }
- * </pre>
- * where the BigQuery dataset you specify must already exist. The PubSub topic you specify should
- * be the same topic to which the Injector is publishing.
- */
-public class GameStats extends LeaderBoard {
-
- private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms";
-
- private static DateTimeFormatter fmt =
- DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS")
- .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST")));
-
- /**
- * Filter out all but those users with a high clickrate, which we will consider as 'spammy' users.
- * We do this by finding the mean total score per user, then using that information as a side
- * input to filter out all but those user scores that are > (mean * SCORE_WEIGHT).
- */
- // [START DocInclude_AbuseDetect]
- public static class CalculateSpammyUsers
- extends PTransform<PCollection<KV<String, Integer>>, PCollection<KV<String, Integer>>> {
- private static final Logger LOG = LoggerFactory.getLogger(CalculateSpammyUsers.class);
- private static final double SCORE_WEIGHT = 2.5;
-
- @Override
- public PCollection<KV<String, Integer>> apply(PCollection<KV<String, Integer>> userScores) {
-
- // Get the sum of scores for each user.
- PCollection<KV<String, Integer>> sumScores = userScores
- .apply("UserSum", Sum.<String>integersPerKey());
-
- // Extract the score from each element, and use it to find the global mean.
- final PCollectionView<Double> globalMeanScore = sumScores.apply(Values.<Integer>create())
- .apply(Mean.<Integer>globally().asSingletonView());
-
- // Filter the user sums using the global mean.
- PCollection<KV<String, Integer>> filtered = sumScores
- .apply(ParDo
- .named("ProcessAndFilter")
- // use the derived mean total score as a side input
- .withSideInputs(globalMeanScore)
- .of(new DoFn<KV<String, Integer>, KV<String, Integer>>() {
- private final Aggregator<Long, Long> numSpammerUsers =
- createAggregator("SpammerUsers", new Sum.SumLongFn());
- @Override
- public void processElement(ProcessContext c) {
- Integer score = c.element().getValue();
- Double gmc = c.sideInput(globalMeanScore);
- if (score > (gmc * SCORE_WEIGHT)) {
- LOG.info("user " + c.element().getKey() + " spammer score " + score
- + " with mean " + gmc);
- numSpammerUsers.addValue(1L);
- c.output(c.element());
- }
- }
- }));
- return filtered;
- }
- }
- // [END DocInclude_AbuseDetect]
-
- /**
- * Calculate and output an element's session duration.
- */
- private static class UserSessionInfoFn extends DoFn<KV<String, Integer>, Integer>
- implements RequiresWindowAccess {
-
- @Override
- public void processElement(ProcessContext c) {
- IntervalWindow w = (IntervalWindow) c.window();
- int duration = new Duration(
- w.start(), w.end()).toPeriod().toStandardMinutes().getMinutes();
- c.output(duration);
- }
- }
-
-
- /**
- * Options supported by {@link GameStats}.
- */
- static interface Options extends LeaderBoard.Options {
- @Description("Numeric value of fixed window duration for user analysis, in minutes")
- @Default.Integer(60)
- Integer getFixedWindowDuration();
- void setFixedWindowDuration(Integer value);
-
- @Description("Numeric value of gap between user sessions, in minutes")
- @Default.Integer(5)
- Integer getSessionGap();
- void setSessionGap(Integer value);
-
- @Description("Numeric value of fixed window for finding mean of user session duration, "
- + "in minutes")
- @Default.Integer(30)
- Integer getUserActivityWindowDuration();
- void setUserActivityWindowDuration(Integer value);
-
- @Description("Prefix used for the BigQuery table names")
- @Default.String("game_stats")
- String getTablePrefix();
- void setTablePrefix(String value);
- }
-
-
- /**
- * Create a map of information that describes how to write pipeline output to BigQuery. This map
- * is used to write information about team score sums.
- */
- protected static Map<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>>
- configureWindowedWrite() {
- Map<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>> tableConfigure =
- new HashMap<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>>();
- tableConfigure.put("team",
- new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("STRING",
- c -> c.element().getKey()));
- tableConfigure.put("total_score",
- new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("INTEGER",
- c -> c.element().getValue()));
- tableConfigure.put("window_start",
- new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("STRING",
- c -> { IntervalWindow w = (IntervalWindow) c.window();
- return fmt.print(w.start()); }));
- tableConfigure.put("processing_time",
- new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>(
- "STRING", c -> fmt.print(Instant.now())));
- return tableConfigure;
- }
-
- /**
- * Create a map of information that describes how to write pipeline output to BigQuery. This map
- * is used to write information about mean user session time.
- */
- protected static Map<String, WriteWindowedToBigQuery.FieldInfo<Double>>
- configureSessionWindowWrite() {
-
- Map<String, WriteWindowedToBigQuery.FieldInfo<Double>> tableConfigure =
- new HashMap<String, WriteWindowedToBigQuery.FieldInfo<Double>>();
- tableConfigure.put("window_start",
- new WriteWindowedToBigQuery.FieldInfo<Double>("STRING",
- c -> { IntervalWindow w = (IntervalWindow) c.window();
- return fmt.print(w.start()); }));
- tableConfigure.put("mean_duration",
- new WriteWindowedToBigQuery.FieldInfo<Double>("FLOAT", c -> c.element()));
- return tableConfigure;
- }
-
-
-
- public static void main(String[] args) throws Exception {
-
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
- // Enforce that this pipeline is always run in streaming mode.
- options.setStreaming(true);
- // Allow the pipeline to be cancelled automatically.
- options.setRunner(DataflowPipelineRunner.class);
- DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options);
- Pipeline pipeline = Pipeline.create(options);
-
- // Read Events from Pub/Sub using custom timestamps
- PCollection<GameActionInfo> rawEvents = pipeline
- .apply(PubsubIO.Read.timestampLabel(TIMESTAMP_ATTRIBUTE).topic(options.getTopic()))
- .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()));
-
- // Extract username/score pairs from the event stream
- PCollection<KV<String, Integer>> userEvents =
- rawEvents.apply("ExtractUserScore",
- MapElements.via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore()))
- .withOutputType(new TypeDescriptor<KV<String, Integer>>() {}));
-
- // Calculate the total score per user over fixed windows, and
- // emit cumulative updates for late data.
- final PCollectionView<Map<String, Integer>> spammersView = userEvents
- .apply(Window.named("FixedWindowsUser")
- .<KV<String, Integer>>into(FixedWindows.of(
- Duration.standardMinutes(options.getFixedWindowDuration())))
- )
-
- // Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate.
- // These might be robots/spammers.
- .apply("CalculateSpammyUsers", new CalculateSpammyUsers())
- // Derive a view from the collection of spammer users. It will be used as a side input
- // in calculating the team score sums, below.
- .apply("CreateSpammersView", View.<String, Integer>asMap());
-
- // [START DocInclude_FilterAndCalc]
- // Calculate the total score per team over fixed windows,
- // and emit cumulative updates for late data. Uses the side input derived above-- the set of
- // suspected robots-- to filter out scores from those users from the sum.
- // Write the results to BigQuery.
- rawEvents
- .apply(Window.named("WindowIntoFixedWindows")
- .<GameActionInfo>into(FixedWindows.of(
- Duration.standardMinutes(options.getFixedWindowDuration())))
- )
- // Filter out the detected spammer users, using the side input derived above.
- .apply(ParDo.named("FilterOutSpammers")
- .withSideInputs(spammersView)
- .of(new DoFn<GameActionInfo, GameActionInfo>() {
- @Override
- public void processElement(ProcessContext c) {
- // If the user is not in the spammers Map, output the data element.
- if (c.sideInput(spammersView).get(c.element().getUser().trim()) == null) {
- c.output(c.element());
- }
- }
- }))
- // Extract and sum teamname/score pairs from the event data.
- .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
- // [END DocInclude_FilterAndCalc]
- // Write the result to BigQuery
- .apply("WriteTeamSums",
- new WriteWindowedToBigQuery<KV<String, Integer>>(
- options.getTablePrefix() + "_team", configureWindowedWrite()));
-
-
- // [START DocInclude_SessionCalc]
- // Detect user sessions-- that is, a burst of activity separated by a gap from further
- // activity. Find and record the mean session lengths.
- // This information could help the game designers track the changing user engagement
- // as their set of games changes.
- userEvents
- .apply(Window.named("WindowIntoSessions")
- .<KV<String, Integer>>into(
- Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap())))
- .withOutputTimeFn(OutputTimeFns.outputAtEndOfWindow()))
- // For this use, we care only about the existence of the session, not any particular
- // information aggregated over it, so the following is an efficient way to do that.
- .apply(Combine.perKey(x -> 0))
- // Get the duration per session.
- .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn()))
- // [END DocInclude_SessionCalc]
- // [START DocInclude_Rewindow]
- // Re-window to process groups of session sums according to when the sessions complete.
- .apply(Window.named("WindowToExtractSessionMean")
- .<Integer>into(
- FixedWindows.of(Duration.standardMinutes(options.getUserActivityWindowDuration()))))
- // Find the mean session duration in each window.
- .apply(Mean.<Integer>globally().withoutDefaults())
- // Write this info to a BigQuery table.
- .apply("WriteAvgSessionLength",
- new WriteWindowedToBigQuery<Double>(
- options.getTablePrefix() + "_sessions", configureSessionWindowWrite()));
- // [END DocInclude_Rewindow]
-
-
- // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
- // command line.
- PipelineResult result = pipeline.run();
- dataflowUtils.waitToFinish(result);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/HourlyTeamScore.java
----------------------------------------------------------------------
diff --git a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/HourlyTeamScore.java b/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/HourlyTeamScore.java
deleted file mode 100644
index 481b9df..0000000
--- a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/HourlyTeamScore.java
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete.game;
-
-import com.google.cloud.dataflow.examples.complete.game.utils.WriteWindowedToBigQuery;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.transforms.Filter;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.WithTimestamps;
-import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.IntervalWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.values.KV;
-
-import org.joda.time.DateTimeZone;
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-import org.joda.time.format.DateTimeFormat;
-import org.joda.time.format.DateTimeFormatter;
-
-import java.util.HashMap;
-import java.util.Map;
-import java.util.TimeZone;
-
-/**
- * This class is the second in a series of four pipelines that tell a story in a 'gaming'
- * domain, following {@link UserScore}. In addition to the concepts introduced in {@link UserScore},
- * new concepts include: windowing and element timestamps; use of {@code Filter.byPredicate()}.
- *
- * <p> This pipeline processes data collected from gaming events in batch, building on {@link
- * UserScore} but using fixed windows. It calculates the sum of scores per team, for each window,
- * optionally allowing specification of two timestamps before and after which data is filtered out.
- * This allows a model where late data collected after the intended analysis window can be included,
- * and any late-arriving data prior to the beginning of the analysis window can be removed as well.
- * By using windowing and adding element timestamps, we can do finer-grained analysis than with the
- * {@link UserScore} pipeline. However, our batch processing is high-latency, in that we don't get
- * results from plays at the beginning of the batch's time period until the batch is processed.
- *
- * <p> To execute this pipeline using the Dataflow service, specify the pipeline configuration
- * like this:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowPipelineRunner
- * --dataset=YOUR-DATASET
- * }
- * </pre>
- * where the BigQuery dataset you specify must already exist.
- *
- * <p> Optionally include {@code --input} to specify the batch input file path.
- * To indicate a time after which the data should be filtered out, include the
- * {@code --stopMin} arg. E.g., {@code --stopMin=2015-10-18-23-59} indicates that any data
- * timestamped after 23:59 PST on 2015-10-18 should not be included in the analysis.
- * To indicate a time before which data should be filtered out, include the {@code --startMin} arg.
- * If you're using the default input specified in {@link UserScore},
- * "gs://dataflow-samples/game/gaming_data*.csv", then
- * {@code --startMin=2015-11-16-16-10 --stopMin=2015-11-17-16-10} are good values.
- */
-public class HourlyTeamScore extends UserScore {
-
- private static DateTimeFormatter fmt =
- DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS")
- .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST")));
- private static DateTimeFormatter minFmt =
- DateTimeFormat.forPattern("yyyy-MM-dd-HH-mm")
- .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST")));
-
-
- /**
- * Options supported by {@link HourlyTeamScore}.
- */
- static interface Options extends UserScore.Options {
-
- @Description("Numeric value of fixed window duration, in minutes")
- @Default.Integer(60)
- Integer getWindowDuration();
- void setWindowDuration(Integer value);
-
- @Description("String representation of the first minute after which to generate results,"
- + "in the format: yyyy-MM-dd-HH-mm . This time should be in PST."
- + "Any input data timestamped prior to that minute won't be included in the sums.")
- @Default.String("1970-01-01-00-00")
- String getStartMin();
- void setStartMin(String value);
-
- @Description("String representation of the first minute for which to not generate results,"
- + "in the format: yyyy-MM-dd-HH-mm . This time should be in PST."
- + "Any input data timestamped after that minute won't be included in the sums.")
- @Default.String("2100-01-01-00-00")
- String getStopMin();
- void setStopMin(String value);
-
- @Description("The BigQuery table name. Should not already exist.")
- @Default.String("hourly_team_score")
- String getTableName();
- void setTableName(String value);
- }
-
- /**
- * Create a map of information that describes how to write pipeline output to BigQuery. This map
- * is passed to the {@link WriteWindowedToBigQuery} constructor to write team score sums and
- * includes information about window start time.
- */
- protected static Map<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>>
- configureWindowedTableWrite() {
- Map<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>> tableConfig =
- new HashMap<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>>();
- tableConfig.put("team",
- new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("STRING",
- c -> c.element().getKey()));
- tableConfig.put("total_score",
- new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("INTEGER",
- c -> c.element().getValue()));
- tableConfig.put("window_start",
- new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("STRING",
- c -> { IntervalWindow w = (IntervalWindow) c.window();
- return fmt.print(w.start()); }));
- return tableConfig;
- }
-
-
- /**
- * Run a batch pipeline to do windowed analysis of the data.
- */
- // [START DocInclude_HTSMain]
- public static void main(String[] args) throws Exception {
- // Begin constructing a pipeline configured by commandline flags.
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
- Pipeline pipeline = Pipeline.create(options);
-
- final Instant stopMinTimestamp = new Instant(minFmt.parseMillis(options.getStopMin()));
- final Instant startMinTimestamp = new Instant(minFmt.parseMillis(options.getStartMin()));
-
- // Read 'gaming' events from a text file.
- pipeline.apply(TextIO.Read.from(options.getInput()))
- // Parse the incoming data.
- .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()))
-
- // Filter out data before and after the given times so that it is not included
- // in the calculations. As we collect data in batches (say, by day), the batch for the day
- // that we want to analyze could potentially include some late-arriving data from the previous
- // day. If so, we want to weed it out. Similarly, if we include data from the following day
- // (to scoop up late-arriving events from the day we're analyzing), we need to weed out events
- // that fall after the time period we want to analyze.
- // [START DocInclude_HTSFilters]
- .apply("FilterStartTime", Filter.byPredicate(
- (GameActionInfo gInfo)
- -> gInfo.getTimestamp() > startMinTimestamp.getMillis()))
- .apply("FilterEndTime", Filter.byPredicate(
- (GameActionInfo gInfo)
- -> gInfo.getTimestamp() < stopMinTimestamp.getMillis()))
- // [END DocInclude_HTSFilters]
-
- // [START DocInclude_HTSAddTsAndWindow]
- // Add an element timestamp based on the event log, and apply fixed windowing.
- .apply("AddEventTimestamps",
- WithTimestamps.of((GameActionInfo i) -> new Instant(i.getTimestamp())))
- .apply(Window.named("FixedWindowsTeam")
- .<GameActionInfo>into(FixedWindows.of(
- Duration.standardMinutes(options.getWindowDuration()))))
- // [END DocInclude_HTSAddTsAndWindow]
-
- // Extract and sum teamname/score pairs from the event data.
- .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
- .apply("WriteTeamScoreSums",
- new WriteWindowedToBigQuery<KV<String, Integer>>(options.getTableName(),
- configureWindowedTableWrite()));
-
-
- pipeline.run();
- }
- // [END DocInclude_HTSMain]
-
-}
[49/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/BigEndianIntegerCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/BigEndianIntegerCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/BigEndianIntegerCoder.java
deleted file mode 100644
index 24f6a45..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/BigEndianIntegerCoder.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.UTFDataFormatException;
-
-/**
- * A {@link BigEndianIntegerCoder} encodes {@link Integer Integers} in 4 bytes, big-endian.
- */
-public class BigEndianIntegerCoder extends AtomicCoder<Integer> {
-
- @JsonCreator
- public static BigEndianIntegerCoder of() {
- return INSTANCE;
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private static final BigEndianIntegerCoder INSTANCE = new BigEndianIntegerCoder();
-
- private BigEndianIntegerCoder() {}
-
- @Override
- public void encode(Integer value, OutputStream outStream, Context context)
- throws IOException, CoderException {
- if (value == null) {
- throw new CoderException("cannot encode a null Integer");
- }
- new DataOutputStream(outStream).writeInt(value);
- }
-
- @Override
- public Integer decode(InputStream inStream, Context context)
- throws IOException, CoderException {
- try {
- return new DataInputStream(inStream).readInt();
- } catch (EOFException | UTFDataFormatException exn) {
- // These exceptions correspond to decoding problems, so change
- // what kind of exception they're branded as.
- throw new CoderException(exn);
- }
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. This coder is injective.
- */
- @Override
- public boolean consistentWithEquals() {
- return true;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}, because {@link #getEncodedElementByteSize} runs in constant time.
- */
- @Override
- public boolean isRegisterByteSizeObserverCheap(Integer value, Context context) {
- return true;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code 4}, the size in bytes of an integer's big endian encoding.
- */
- @Override
- protected long getEncodedElementByteSize(Integer value, Context context)
- throws Exception {
- if (value == null) {
- throw new CoderException("cannot encode a null Integer");
- }
- return 4;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/BigEndianLongCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/BigEndianLongCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/BigEndianLongCoder.java
deleted file mode 100644
index 4196608..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/BigEndianLongCoder.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.UTFDataFormatException;
-
-/**
- * A {@link BigEndianLongCoder} encodes {@link Long}s in 8 bytes, big-endian.
- */
-public class BigEndianLongCoder extends AtomicCoder<Long> {
-
- @JsonCreator
- public static BigEndianLongCoder of() {
- return INSTANCE;
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private static final BigEndianLongCoder INSTANCE = new BigEndianLongCoder();
-
- private BigEndianLongCoder() {}
-
- @Override
- public void encode(Long value, OutputStream outStream, Context context)
- throws IOException, CoderException {
- if (value == null) {
- throw new CoderException("cannot encode a null Long");
- }
- new DataOutputStream(outStream).writeLong(value);
- }
-
- @Override
- public Long decode(InputStream inStream, Context context)
- throws IOException, CoderException {
- try {
- return new DataInputStream(inStream).readLong();
- } catch (EOFException | UTFDataFormatException exn) {
- // These exceptions correspond to decoding problems, so change
- // what kind of exception they're branded as.
- throw new CoderException(exn);
- }
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. This coder is injective.
- */
- @Override
- public boolean consistentWithEquals() {
- return true;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}, since {@link #getEncodedElementByteSize} returns a constant.
- */
- @Override
- public boolean isRegisterByteSizeObserverCheap(Long value, Context context) {
- return true;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code 8}, the byte size of a big-endian encoded {@code Long}.
- */
- @Override
- protected long getEncodedElementByteSize(Long value, Context context)
- throws Exception {
- if (value == null) {
- throw new CoderException("cannot encode a null Long");
- }
- return 8;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/ByteArrayCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/ByteArrayCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/ByteArrayCoder.java
deleted file mode 100644
index 1e555c6..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/ByteArrayCoder.java
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.util.ExposedByteArrayOutputStream;
-import com.google.cloud.dataflow.sdk.util.StreamUtils;
-import com.google.cloud.dataflow.sdk.util.VarInt;
-import com.google.common.io.ByteStreams;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- * A {@link Coder} for {@code byte[]}.
- *
- * <p>The encoding format is as follows:
- * <ul>
- * <li>If in a non-nested context (the {@code byte[]} is the only value in the stream), the
- * bytes are read/written directly.</li>
- * <li>If in a nested context, the bytes are prefixed with the length of the array,
- * encoded via a {@link VarIntCoder}.</li>
- * </ul>
- */
-public class ByteArrayCoder extends AtomicCoder<byte[]> {
-
- @JsonCreator
- public static ByteArrayCoder of() {
- return INSTANCE;
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- private static final ByteArrayCoder INSTANCE = new ByteArrayCoder();
-
- private ByteArrayCoder() {}
-
- @Override
- public void encode(byte[] value, OutputStream outStream, Context context)
- throws IOException, CoderException {
- if (value == null) {
- throw new CoderException("cannot encode a null byte[]");
- }
- if (!context.isWholeStream) {
- VarInt.encode(value.length, outStream);
- outStream.write(value);
- } else {
- outStream.write(value);
- }
- }
-
- /**
- * Encodes the provided {@code value} with the identical encoding to {@link #encode}, but with
- * optimizations that take ownership of the value.
- *
- * <p>Once passed to this method, {@code value} should never be observed or mutated again.
- */
- public void encodeAndOwn(byte[] value, OutputStream outStream, Context context)
- throws IOException, CoderException {
- if (!context.isWholeStream) {
- VarInt.encode(value.length, outStream);
- outStream.write(value);
- } else {
- if (outStream instanceof ExposedByteArrayOutputStream) {
- ((ExposedByteArrayOutputStream) outStream).writeAndOwn(value);
- } else {
- outStream.write(value);
- }
- }
- }
-
- @Override
- public byte[] decode(InputStream inStream, Context context)
- throws IOException, CoderException {
- if (context.isWholeStream) {
- return StreamUtils.getBytes(inStream);
- } else {
- int length = VarInt.decodeInt(inStream);
- if (length < 0) {
- throw new IOException("invalid length " + length);
- }
- byte[] value = new byte[length];
- ByteStreams.readFully(inStream, value);
- return value;
- }
- }
-
- /**
- * {@inheritDoc}
- *
- * @return objects that are equal if the two arrays contain the same bytes.
- */
- @Override
- public Object structuralValue(byte[] value) {
- return new StructuralByteArray(value);
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true} since {@link #getEncodedElementByteSize} runs in
- * constant time using the {@code length} of the provided array.
- */
- @Override
- public boolean isRegisterByteSizeObserverCheap(byte[] value, Context context) {
- return true;
- }
-
- @Override
- protected long getEncodedElementByteSize(byte[] value, Context context)
- throws Exception {
- if (value == null) {
- throw new CoderException("cannot encode a null byte[]");
- }
- long size = 0;
- if (!context.isWholeStream) {
- size += VarInt.getLength(value.length);
- }
- return size + value.length;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/ByteCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/ByteCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/ByteCoder.java
deleted file mode 100644
index 9f17497..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/ByteCoder.java
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.UTFDataFormatException;
-
-/**
- * A {@link ByteCoder} encodes {@link Byte} values as a single raw byte.
- */
-public class ByteCoder extends AtomicCoder<Byte> {
-
- @JsonCreator
- public static ByteCoder of() {
- return INSTANCE;
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private static final ByteCoder INSTANCE = new ByteCoder();
-
- private ByteCoder() {}
-
- @Override
- public void encode(Byte value, OutputStream outStream, Context context)
- throws IOException, CoderException {
- if (value == null) {
- throw new CoderException("cannot encode a null Byte");
- }
- outStream.write(value.byteValue());
- }
-
- @Override
- public Byte decode(InputStream inStream, Context context)
- throws IOException, CoderException {
- try {
- // value will be between 0-255, -1 for EOF
- int value = inStream.read();
- if (value == -1) {
- throw new EOFException("EOF encountered decoding 1 byte from input stream");
- }
- return (byte) value;
- } catch (EOFException | UTFDataFormatException exn) {
- // These exceptions correspond to decoding problems, so change
- // what kind of exception they're branded as.
- throw new CoderException(exn);
- }
- }
-
- /**
- * {@inheritDoc}
- *
- * {@link ByteCoder} will never throw a {@link Coder.NonDeterministicException}; bytes can always
- * be encoded deterministically.
- */
- @Override
- public void verifyDeterministic() {}
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. This coder is injective.
- */
- @Override
- public boolean consistentWithEquals() {
- return true;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. {@link ByteCoder#getEncodedElementByteSize} returns a constant.
- */
- @Override
- public boolean isRegisterByteSizeObserverCheap(Byte value, Context context) {
- return true;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code 1}, the size in bytes of an encoded {@link Byte}.
- */
- @Override
- protected long getEncodedElementByteSize(Byte value, Context context)
- throws Exception {
- if (value == null) {
- throw new CoderException("cannot estimate size for unsupported null value");
- }
- return 1;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/ByteStringCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/ByteStringCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/ByteStringCoder.java
deleted file mode 100644
index b7c1a3c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/ByteStringCoder.java
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.util.VarInt;
-import com.google.common.io.ByteStreams;
-import com.google.protobuf.ByteString;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- * A {@link Coder} for {@link ByteString} objects based on their encoded Protocol Buffer form.
- *
- * <p>When this coder is used in a nested {@link Coder.Context}, the serialized {@link ByteString}
- * objects are first delimited by their size.
- */
-public class ByteStringCoder extends AtomicCoder<ByteString> {
-
- @JsonCreator
- public static ByteStringCoder of() {
- return INSTANCE;
- }
-
- /***************************/
-
- private static final ByteStringCoder INSTANCE = new ByteStringCoder();
-
- private ByteStringCoder() {}
-
- @Override
- public void encode(ByteString value, OutputStream outStream, Context context)
- throws IOException, CoderException {
- if (value == null) {
- throw new CoderException("cannot encode a null ByteString");
- }
-
- if (!context.isWholeStream) {
- // ByteString is not delimited, so write its size before its contents.
- VarInt.encode(value.size(), outStream);
- }
- value.writeTo(outStream);
- }
-
- @Override
- public ByteString decode(InputStream inStream, Context context) throws IOException {
- if (context.isWholeStream) {
- return ByteString.readFrom(inStream);
- }
-
- int size = VarInt.decodeInt(inStream);
- // ByteString reads to the end of the input stream, so give it a limited stream of exactly
- // the right length. Also set its chunk size so that the ByteString will contain exactly
- // one chunk.
- return ByteString.readFrom(ByteStreams.limit(inStream, size), size);
- }
-
- @Override
- protected long getEncodedElementByteSize(ByteString value, Context context) throws Exception {
- int size = value.size();
-
- if (context.isWholeStream) {
- return size;
- }
- return VarInt.getLength(size) + size;
- }
-
- /**
- * {@inheritDoc}
- *
- * <p>Returns true; the encoded output of two invocations of {@link ByteStringCoder} in the same
- * {@link Coder.Context} will be identical if and only if the original {@link ByteString} objects
- * are equal according to {@link Object#equals}.
- */
- @Override
- public boolean consistentWithEquals() {
- return true;
- }
-
- /**
- * {@inheritDoc}
- *
- * <p>Returns true. The encoded size is derived from {@link ByteString#size} plus, when nested, the length of a {@link VarInt} prefix.
- */
- @Override
- public boolean isRegisterByteSizeObserverCheap(ByteString value, Context context) {
- return true;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CannotProvideCoderException.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CannotProvideCoderException.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CannotProvideCoderException.java
deleted file mode 100644
index 97b5e23..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CannotProvideCoderException.java
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-/**
- * The exception thrown when a {@link CoderProvider} cannot
- * provide a {@link Coder} that has been requested.
- */
-public class CannotProvideCoderException extends Exception {
- private final ReasonCode reason;
-
- public CannotProvideCoderException(String message) {
- this(message, ReasonCode.UNKNOWN);
- }
-
- public CannotProvideCoderException(String message, ReasonCode reason) {
- super(message);
- this.reason = reason;
- }
-
- public CannotProvideCoderException(String message, Throwable cause) {
- this(message, cause, ReasonCode.UNKNOWN);
- }
-
- public CannotProvideCoderException(String message, Throwable cause, ReasonCode reason) {
- super(message, cause);
- this.reason = reason;
- }
-
- public CannotProvideCoderException(Throwable cause) {
- this(cause, ReasonCode.UNKNOWN);
- }
-
- public CannotProvideCoderException(Throwable cause, ReasonCode reason) {
- super(cause);
- this.reason = reason;
- }
-
- /**
- * @return the reason that Coder inference failed.
- */
- public ReasonCode getReason() {
- return reason;
- }
-
- /**
- * Returns the inner-most {@link CannotProvideCoderException} when they are deeply nested.
- *
- * <p>For example, if a coder for {@code List<KV<Integer, Whatsit>>} cannot be provided because
- * there is no known coder for {@code Whatsit}, the root cause of the exception should be a
- * CannotProvideCoderException with details pertinent to {@code Whatsit}, suppressing the
- * intermediate layers.
- */
- public Throwable getRootCause() {
- Throwable cause = getCause();
- if (cause == null) {
- return this;
- } else if (!(cause instanceof CannotProvideCoderException)) {
- return cause;
- } else {
- return ((CannotProvideCoderException) cause).getRootCause();
- }
- }
-
- /**
- * Indicates the reason that {@link Coder} inference failed.
- */
- public static enum ReasonCode {
- /**
- * The reason a coder could not be provided is unknown or does not have an established
- * {@link ReasonCode}.
- */
- UNKNOWN,
-
- /**
- * The reason a coder could not be provided is type erasure, for example when requesting
- * coder inference for a {@code List<T>} where {@code T} is unknown.
- */
- TYPE_ERASURE
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/Coder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/Coder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/Coder.java
deleted file mode 100644
index f3a8bec..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/Coder.java
+++ /dev/null
@@ -1,298 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObserver;
-import com.google.common.base.Joiner;
-import com.google.common.base.MoreObjects;
-import com.google.common.base.Objects;
-import com.google.common.base.Preconditions;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.Serializable;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-
-import javax.annotation.Nullable;
-
-/**
- * A {@link Coder Coder<T>} defines how to encode and decode values of type {@code T} into
- * byte streams.
- *
- * <p>{@link Coder} instances are serialized during job creation and deserialized
- * before use, via JSON serialization. See {@link SerializableCoder} for an example of a
- * {@link Coder} that adds a custom field to
- * the {@link Coder} serialization. It provides a constructor annotated with
- * {@link com.fasterxml.jackson.annotation.JsonCreator}, which is a factory method used when
- * deserializing a {@link Coder} instance.
- *
- * <p>{@link Coder} classes for compound types are often composed from coder classes for types
- * contained therein. The composition of {@link Coder} instances into a coder for the compound
- * class is the subject of the {@link CoderFactory} type, which enables automatic generic
- * composition of {@link Coder} classes within the {@link CoderRegistry}. With particular
- * static methods on a compound {@link Coder} class, a {@link CoderFactory} can be automatically
- * inferred. See {@link KvCoder} for an example of a simple compound {@link Coder} that supports
- * automatic composition in the {@link CoderRegistry}.
- *
- * <p>The binary format of a {@link Coder} is identified by {@link #getEncodingId()}; be sure to
- * understand the requirements for evolving coder formats.
- *
- * <p>All methods of a {@link Coder} are required to be thread safe.
- *
- * @param <T> the type of the values being transcoded
- */
-public interface Coder<T> extends Serializable {
- /** The context in which encoding or decoding is being done. */
- public static class Context {
- /**
- * The outer context: the value being encoded or decoded takes
- * up the remainder of the record/stream contents.
- */
- public static final Context OUTER = new Context(true);
-
- /**
- * The nested context: the value being encoded or decoded is
- * (potentially) a part of a larger record/stream contents, and
- * may have other parts encoded or decoded after it.
- */
- public static final Context NESTED = new Context(false);
-
- /**
- * Whether the encoded or decoded value fills the remainder of the
- * output or input (resp.) record/stream contents. If so, then
- * the size of the decoded value can be determined from the
- * remaining size of the record/stream contents, and so explicit
- * lengths aren't required.
- */
- public final boolean isWholeStream;
-
- public Context(boolean isWholeStream) {
- this.isWholeStream = isWholeStream;
- }
-
- public Context nested() {
- return NESTED;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (!(obj instanceof Context)) {
- return false;
- }
- return Objects.equal(isWholeStream, ((Context) obj).isWholeStream);
- }
-
- @Override
- public int hashCode() {
- return Objects.hashCode(isWholeStream);
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(Context.class)
- .addValue(isWholeStream ? "OUTER" : "NESTED").toString();
- }
- }
-
- /**
- * Encodes the given value of type {@code T} onto the given output stream
- * in the given context.
- *
- * @throws IOException if writing to the {@code OutputStream} fails
- * for some reason
- * @throws CoderException if the value could not be encoded for some reason
- */
- public void encode(T value, OutputStream outStream, Context context)
- throws CoderException, IOException;
-
- /**
- * Decodes a value of type {@code T} from the given input stream in
- * the given context. Returns the decoded value.
- *
- * @throws IOException if reading from the {@code InputStream} fails
- * for some reason
- * @throws CoderException if the value could not be decoded for some reason
- */
- public T decode(InputStream inStream, Context context)
- throws CoderException, IOException;
-
- /**
- * If this is a {@code Coder} for a parameterized type, returns the
- * list of {@code Coder}s being used for each of the parameters, or
- * returns {@code null} if this cannot be done or this is not a
- * parameterized type.
- */
- public List<? extends Coder<?>> getCoderArguments();
-
- /**
- * Returns the {@link CloudObject} that represents this {@code Coder}.
- */
- public CloudObject asCloudObject();
-
- /**
- * Throw {@link NonDeterministicException} if the coding is not deterministic.
- *
- * <p>In order for a {@code Coder} to be considered deterministic,
- * the following must be true:
- * <ul>
- * <li>two values that compare as equal (via {@code Object.equals()}
- * or {@code Comparable.compareTo()}, if supported) have the same
- * encoding.
- * <li>the {@code Coder} always produces a canonical encoding, which is the
- * same for an instance of an object even if produced on different
- * computers at different times.
- * </ul>
- *
- * @throws Coder.NonDeterministicException if this coder is not deterministic.
- */
- public void verifyDeterministic() throws Coder.NonDeterministicException;
-
- /**
- * Returns {@code true} if this {@link Coder} is injective with respect to {@link Objects#equals}.
- *
- * <p>Whenever the encoded bytes of two values are equal, then the original values are equal
- * according to {@code Objects.equals()}. Note that this is well-defined for {@code null}.
- *
- * <p>This condition is most notably false for arrays. More generally, this condition is false
- * whenever {@code equals()} compares object identity, rather than performing a
- * semantic/structural comparison.
- */
- public boolean consistentWithEquals();
-
- /**
- * Returns an object with an {@code Object.equals()} method that represents structural equality
- * on the argument.
- *
- * <p>For any two values {@code x} and {@code y} of type {@code T}, if their encoded bytes are the
- * same, then it must be the case that {@code structuralValue(x).equals(structuralValue(y))}.
- *
- * <p>Most notably:
- * <ul>
- * <li>The structural value for an array coder should perform a structural comparison of the
- * contents of the arrays, rather than the default behavior of comparing according to object
- * identity.
- * <li>The structural value for a coder accepting {@code null} should be a proper object with
- * an {@code equals()} method, even if the input value is {@code null}.
- * </ul>
- *
- * <p>See also {@link #consistentWithEquals()}.
- */
- public Object structuralValue(T value) throws Exception;
-
- /**
- * Returns whether {@link #registerByteSizeObserver} is cheap enough to
- * call for every element, that is, if this {@code Coder} can
- * calculate the byte size of the element to be coded in roughly
- * constant time (or lazily).
- *
- * <p>Not intended to be called by user code, but instead by
- * {@link com.google.cloud.dataflow.sdk.runners.PipelineRunner}
- * implementations.
- */
- public boolean isRegisterByteSizeObserverCheap(T value, Context context);
-
- /**
- * Notifies the {@code ElementByteSizeObserver} about the byte size
- * of the encoded value using this {@code Coder}.
- *
- * <p>Not intended to be called by user code, but instead by
- * {@link com.google.cloud.dataflow.sdk.runners.PipelineRunner}
- * implementations.
- */
- public void registerByteSizeObserver(
- T value, ElementByteSizeObserver observer, Context context)
- throws Exception;
-
- /**
- * An identifier for the binary format written by {@link #encode}.
- *
- * <p>This value, along with the fully qualified class name, forms an identifier for the
- * binary format of this coder. Whenever this value changes, the new encoding is considered
- * incompatible with the prior format: It is presumed that the prior version of the coder will
- * be unable to correctly read the new format and the new version of the coder will be unable to
- * correctly read the old format.
- *
- * <p>If the format is changed in a backwards-compatible way (the Coder can still accept data from
- * the prior format), such as by adding optional fields to a Protocol Buffer or Avro definition,
- * and you want Dataflow to understand that the new coder is compatible with the prior coder,
- * this value must remain unchanged. It is then the responsibility of {@link #decode} to correctly
- * read data from the prior format.
- */
- @Experimental(Kind.CODER_ENCODING_ID)
- public String getEncodingId();
-
- /**
- * A collection of encodings supported by {@link #decode} in addition to the encoding
- * from {@link #getEncodingId()} (which is assumed supported).
- *
- * <p><i>This information is not currently used for any purpose</i>. It is descriptive only,
- * and this method is subject to change.
- *
- * @see #getEncodingId()
- */
- @Experimental(Kind.CODER_ENCODING_ID)
- public Collection<String> getAllowedEncodings();
-
- /**
- * Exception thrown by {@link Coder#verifyDeterministic()} if the encoding is
- * not deterministic, including details of why the encoding is not deterministic.
- */
- public static class NonDeterministicException extends Throwable {
- private Coder<?> coder;
- private List<String> reasons;
-
- public NonDeterministicException(
- Coder<?> coder, String reason, @Nullable NonDeterministicException e) {
- this(coder, Arrays.asList(reason), e);
- }
-
- public NonDeterministicException(Coder<?> coder, String reason) {
- this(coder, Arrays.asList(reason), null);
- }
-
- public NonDeterministicException(Coder<?> coder, List<String> reasons) {
- this(coder, reasons, null);
- }
-
- public NonDeterministicException(
- Coder<?> coder,
- List<String> reasons,
- @Nullable NonDeterministicException cause) {
- super(cause);
- Preconditions.checkArgument(reasons.size() > 0,
- "Reasons must not be empty.");
- this.reasons = reasons;
- this.coder = coder;
- }
-
- public Iterable<String> getReasons() {
- return reasons;
- }
-
- @Override
- public String getMessage() {
- return String.format("%s is not deterministic because:\n %s",
- coder, Joiner.on("\n ").join(reasons));
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderException.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderException.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderException.java
deleted file mode 100644
index 8ff8571..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderException.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import java.io.IOException;
-
-/**
- * An {@link Exception} thrown if there is a problem encoding or decoding a value.
- */
-public class CoderException extends IOException {
- public CoderException(String message) {
- super(message);
- }
-
- public CoderException(String message, Throwable cause) {
- super(message, cause);
- }
-
- public CoderException(Throwable cause) {
- super(cause);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderFactories.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderFactories.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderFactories.java
deleted file mode 100644
index 82b40a4..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderFactories.java
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (C) 2014 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-import java.lang.reflect.InvocationTargetException;
-import java.lang.reflect.Method;
-import java.lang.reflect.Modifier;
-import java.lang.reflect.ParameterizedType;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-
-/**
- * Static utility methods for creating and working with {@link Coder}s.
- */
-public final class CoderFactories {
- private CoderFactories() { } // Static utility class
-
- /**
- * Creates a {@link CoderFactory} built from particular static methods of a class that
- * implements {@link Coder}.
- *
- * <p>The class must have the following static methods:
- *
- * <ul>
- * <li> {@code
- * public static Coder<T> of(Coder<X> argCoder1, Coder<Y> argCoder2, ...)
- * }
- * <li> {@code
- * public static List<Object> getInstanceComponents(T exampleValue);
- * }
- * </ul>
- *
- * <p>The {@code of(...)} method will be used to construct a
- * {@code Coder<T>} from component {@link Coder}s.
- * It must accept one {@link Coder} argument for each
- * generic type parameter of {@code T}. If {@code T} takes no generic
- * type parameters, then the {@code of()} factory method should take
- * no arguments.
- *
- * <p>The {@code getInstanceComponents} method will be used to
- * decompose a value during the {@link Coder} inference process,
- * to automatically choose coders for the components.
- *
- * <p>Note that the class {@code T} to be coded may be a
- * not-yet-specialized generic class.
- * For a generic class {@code MyClass<X>} and an actual type parameter
- * {@code Foo}, the {@link CoderFactoryFromStaticMethods} will
- * accept any {@code Coder<Foo>} and produce a {@code Coder<MyClass<Foo>>}.
- *
- * <p>For example, the {@link CoderFactory} returned by
- * {@code fromStaticMethods(ListCoder.class)}
- * will produce a {@code Coder<List<X>>} for any {@code Coder<X>}.
- */
- public static <T> CoderFactory fromStaticMethods(Class<T> clazz) {
- return new CoderFactoryFromStaticMethods(clazz);
- }
-
- /**
- * Creates a {@link CoderFactory} that always returns the
- * given coder.
- *
- * <p>The {@code getInstanceComponents} method of this
- * {@link CoderFactory} always returns an empty list.
- */
- public static <T> CoderFactory forCoder(Coder<T> coder) {
- return new CoderFactoryForCoder<>(coder);
- }
-
- /**
- * See {@link #fromStaticMethods} for a detailed description
- * of the characteristics of this {@link CoderFactory}.
- */
- private static class CoderFactoryFromStaticMethods implements CoderFactory {
-
- @Override
- @SuppressWarnings("rawtypes")
- public Coder<?> create(List<? extends Coder<?>> componentCoders) {
- try {
- return (Coder) factoryMethod.invoke(
- null /* static */, componentCoders.toArray());
- } catch (IllegalAccessException |
- IllegalArgumentException |
- InvocationTargetException |
- NullPointerException |
- ExceptionInInitializerError exn) {
- throw new IllegalStateException(
- "error when invoking Coder factory method " + factoryMethod,
- exn);
- }
- }
-
- @Override
- public List<Object> getInstanceComponents(Object value) {
- try {
- @SuppressWarnings("unchecked")
- List<Object> components = (List<Object>) getComponentsMethod.invoke(
- null /* static */, value);
- return components;
- } catch (IllegalAccessException
- | IllegalArgumentException
- | InvocationTargetException
- | NullPointerException
- | ExceptionInInitializerError exn) {
- throw new IllegalStateException(
- "error when invoking Coder getComponents method " + getComponentsMethod,
- exn);
- }
- }
-
- ////////////////////////////////////////////////////////////////////////////////
-
- // Method to create a coder given component coders
- // For a Coder class of kind * -> * -> ... n times ... -> *
- // this has type Coder<?> -> Coder<?> -> ... n times ... -> Coder<T>
- private Method factoryMethod;
-
- // Method to decompose a value of type T into its parts.
- // For a Coder class of kind * -> * -> ... n times ... -> *
- // this has type T -> List<Object>
- // where the list has n elements.
- private Method getComponentsMethod;
-
- /**
- * Returns a CoderFactory that invokes the given static factory method
- * to create the Coder.
- */
- private CoderFactoryFromStaticMethods(Class<?> coderClazz) {
- this.factoryMethod = getFactoryMethod(coderClazz);
- this.getComponentsMethod = getInstanceComponentsMethod(coderClazz);
- }
-
- /**
- * Returns the static {@code of} constructor method on {@code coderClazz}
- * if it exists. It is assumed to have one {@link Coder} parameter for
- * each type parameter of {@code coderClazz}.
- */
- private Method getFactoryMethod(Class<?> coderClazz) {
- Method factoryMethodCandidate;
-
- // Find the static factory method of coderClazz named 'of' with
- // the appropriate number of type parameters.
- int numTypeParameters = coderClazz.getTypeParameters().length;
- Class<?>[] factoryMethodArgTypes = new Class<?>[numTypeParameters];
- Arrays.fill(factoryMethodArgTypes, Coder.class);
- try {
- factoryMethodCandidate =
- coderClazz.getDeclaredMethod("of", factoryMethodArgTypes);
- } catch (NoSuchMethodException | SecurityException exn) {
- throw new IllegalArgumentException(
- "cannot register Coder " + coderClazz + ": "
- + "does not have an accessible method named 'of' with "
- + numTypeParameters + " arguments of Coder type",
- exn);
- }
- if (!Modifier.isStatic(factoryMethodCandidate.getModifiers())) {
- throw new IllegalArgumentException(
- "cannot register Coder " + coderClazz + ": "
- + "method named 'of' with " + numTypeParameters
- + " arguments of Coder type is not static");
- }
- if (!coderClazz.isAssignableFrom(factoryMethodCandidate.getReturnType())) {
- throw new IllegalArgumentException(
- "cannot register Coder " + coderClazz + ": "
- + "method named 'of' with " + numTypeParameters
- + " arguments of Coder type does not return a " + coderClazz);
- }
- try {
- if (!factoryMethodCandidate.isAccessible()) {
- factoryMethodCandidate.setAccessible(true);
- }
- } catch (SecurityException exn) {
- throw new IllegalArgumentException(
- "cannot register Coder " + coderClazz + ": "
- + "method named 'of' with " + numTypeParameters
- + " arguments of Coder type is not accessible",
- exn);
- }
-
- return factoryMethodCandidate;
- }
-
- /**
- * Finds the static method on {@code coderClazz} to use
- * to decompose a value of type {@code T} into components,
- * each corresponding to an argument of the {@code of}
- * method.
- */
- private <T> Method getInstanceComponentsMethod(Class<?> coderClazz) {
- TypeDescriptor<?> coderType = TypeDescriptor.of(coderClazz);
- TypeDescriptor<T> argumentType = getCodedType(coderType);
-
- // getInstanceComponents may be implemented in a superclass,
- // so we search them all for an applicable method. We do not
- // try to be clever about finding the best overload. It may
- // be in a generic superclass, erased to accept an Object.
- // However, subtypes are listed before supertypes (it is a
- // topological ordering) so probably the best one will be chosen
- // if there is more than one (which should be rare)
- for (TypeDescriptor<?> supertype : coderType.getClasses()) {
- for (Method method : supertype.getRawType().getDeclaredMethods()) {
- if (method.getName().equals("getInstanceComponents")) {
- TypeDescriptor<?> formalArgumentType = supertype.getArgumentTypes(method).get(0);
- if (formalArgumentType.getRawType().isAssignableFrom(argumentType.getRawType())) {
- return method;
- }
- }
- }
- }
-
- throw new IllegalArgumentException(
- "cannot create a CoderFactory from " + coderType + ": "
- + "does not have an accessible method "
- + "'getInstanceComponents'");
- }
-
- /**
- * If {@code coderType} is a subclass of {@link Coder} for a specific
- * type {@code T}, returns {@code T.class}. Otherwise, raises IllegalArgumentException.
- */
- private <T> TypeDescriptor<T> getCodedType(TypeDescriptor<?> coderType) {
- for (TypeDescriptor<?> ifaceType : coderType.getInterfaces()) {
- if (ifaceType.getRawType().equals(Coder.class)) {
- ParameterizedType coderIface = (ParameterizedType) ifaceType.getType();
- @SuppressWarnings("unchecked")
- TypeDescriptor<T> token =
- (TypeDescriptor<T>) TypeDescriptor.of(coderIface.getActualTypeArguments()[0]);
- return token;
- }
- }
- throw new IllegalArgumentException(
- "cannot build CoderFactory from class " + coderType
- + ": does not implement Coder<T> for any T.");
- }
- }
-
- /**
- * See {@link #forCoder} for a detailed description of this
- * {@link CoderFactory}.
- */
- private static class CoderFactoryForCoder<T> implements CoderFactory {
- private Coder<T> coder;
-
- public CoderFactoryForCoder(Coder<T> coder) {
- this.coder = coder;
- }
-
- @Override
- public Coder<?> create(List<? extends Coder<?>> componentCoders) {
- return this.coder;
- }
-
- @Override
- public List<Object> getInstanceComponents(Object value) {
- return Collections.emptyList();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderFactory.java
deleted file mode 100644
index 541256c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderFactory.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (C) 2014 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import java.util.List;
-
-/**
- * A {@link CoderFactory} creates coders and decomposes values.
- * It may operate on a parameterized type, such as {@link List},
- * in which case the {@link #create} method accepts a list of
- * coders to use for the type parameters.
- */
-public interface CoderFactory {
-
- /**
- * Returns a {@code Coder<?>} to use for values of a particular
- * type, given the Coders for each of the type's generic
- * parameter types.
- */
- public Coder<?> create(List<? extends Coder<?>> componentCoders);
-
- /**
- * Returns a list of objects contained in {@code value}, one per
- * type argument, or {@code null} if none can be determined.
- * The list of returned objects should be the same size as the
- * list of coders required by {@link #create}.
- */
- public List<Object> getInstanceComponents(Object value);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderProvider.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderProvider.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderProvider.java
deleted file mode 100644
index a3e6ec4..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderProvider.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (C) 2014 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-/**
- * A {@link CoderProvider} may create a {@link Coder} for
- * any concrete class.
- */
-public interface CoderProvider {
-
- /**
- * Provides a coder for a given class, if possible.
- *
- * @throws CannotProvideCoderException if no coder can be provided
- */
- public <T> Coder<T> getCoder(TypeDescriptor<T> type) throws CannotProvideCoderException;
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderProviders.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderProviders.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderProviders.java
deleted file mode 100644
index 8b0aedd..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderProviders.java
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (C) 2014 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.google.cloud.dataflow.sdk.util.InstanceBuilder;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-import com.google.common.base.Joiner;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-
-import java.lang.reflect.InvocationTargetException;
-import java.util.List;
-
-/**
- * Static utility methods for working with {@link CoderProvider CoderProviders}.
- */
-public final class CoderProviders {
-
- // Static utility class
- private CoderProviders() { }
-
- /**
- * Creates a {@link CoderProvider} built from particular static methods of a class that
- * implements {@link Coder}. The requirements for this method are precisely the requirements
- * for a {@link Coder} class to be usable with {@link DefaultCoder} annotations.
- *
- * <p>The class must have the following static method:
- *
- * <pre>{@code
- * public static Coder<T> of(TypeDescriptor<T> type)
- * }
- * </pre>
- */
- public static <T> CoderProvider fromStaticMethods(Class<T> clazz) {
- return new CoderProviderFromStaticMethods(clazz);
- }
-
-
- /**
- * Returns a {@link CoderProvider} that consults each of the provided {@code coderProviders}
- * and returns the first {@link Coder} provided.
- *
- * <p>Note that the order in which the providers are listed matters: While the set of types
- * handled will be the union of those handled by all of the providers in the list, the actual
- * {@link Coder} provided by the first successful provider may differ, and may have inferior
- * properties. For example, not all {@link Coder Coders} are deterministic, handle {@code null}
- * values, or have comparable performance.
- */
- public static CoderProvider firstOf(CoderProvider... coderProviders) {
- return new FirstOf(ImmutableList.copyOf(coderProviders));
- }
-
- ///////////////////////////////////////////////////////////////////////////////////////////////
-
- /**
- * @see #firstOf
- */
- private static class FirstOf implements CoderProvider {
-
- private Iterable<CoderProvider> providers;
-
- public FirstOf(Iterable<CoderProvider> providers) {
- this.providers = providers;
- }
-
- @Override
- public <T> Coder<T> getCoder(TypeDescriptor<T> type) throws CannotProvideCoderException {
- List<String> messages = Lists.newArrayList();
- for (CoderProvider provider : providers) {
- try {
- return provider.getCoder(type);
- } catch (CannotProvideCoderException exc) {
- messages.add(String.format("%s could not provide a Coder for type %s: %s",
- provider, type, exc.getMessage()));
- }
- }
- throw new CannotProvideCoderException(
- String.format("Cannot provide coder for type %s: %s.",
- type, Joiner.on("; ").join(messages)));
- }
- }
-
- private static class CoderProviderFromStaticMethods implements CoderProvider {
-
- /** If true, then clazz has {@code of(TypeDescriptor)}. If false, {@code of(Class)}. */
- private final boolean takesTypeDescriptor;
- private final Class<?> clazz;
-
- public CoderProviderFromStaticMethods(Class<?> clazz) {
- // Note that the second condition supports older classes, which only needed to provide
- // of(Class), not of(TypeDescriptor). Our own classes have updated to accept a
- // TypeDescriptor. Hence the error message points only to the current specification,
- // not both acceptable conditions.
- checkArgument(classTakesTypeDescriptor(clazz) || classTakesClass(clazz),
- "Class " + clazz.getCanonicalName()
- + " is missing required static method of(TypeDescriptor).");
-
- this.takesTypeDescriptor = classTakesTypeDescriptor(clazz);
- this.clazz = clazz;
- }
-
- @Override
- public <T> Coder<T> getCoder(TypeDescriptor<T> type) throws CannotProvideCoderException {
- try {
- if (takesTypeDescriptor) {
- @SuppressWarnings("unchecked")
- Coder<T> result = InstanceBuilder.ofType(Coder.class)
- .fromClass(clazz)
- .fromFactoryMethod("of")
- .withArg(TypeDescriptor.class, type)
- .build();
- return result;
- } else {
- @SuppressWarnings("unchecked")
- Coder<T> result = InstanceBuilder.ofType(Coder.class)
- .fromClass(clazz)
- .fromFactoryMethod("of")
- .withArg(Class.class, type.getRawType())
- .build();
- return result;
- }
- } catch (RuntimeException exc) {
- if (exc.getCause() instanceof InvocationTargetException) {
- throw new CannotProvideCoderException(exc.getCause().getCause());
- }
- throw exc;
- }
- }
-
- private boolean classTakesTypeDescriptor(Class<?> clazz) {
- try {
- clazz.getDeclaredMethod("of", TypeDescriptor.class);
- return true;
- } catch (NoSuchMethodException | SecurityException exc) {
- return false;
- }
- }
-
- private boolean classTakesClass(Class<?> clazz) {
- try {
- clazz.getDeclaredMethod("of", Class.class);
- return true;
- } catch (NoSuchMethodException | SecurityException exc) {
- return false;
- }
- }
- }
-}
[13/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CounterAggregator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CounterAggregator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CounterAggregator.java
deleted file mode 100644
index 824825f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CounterAggregator.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.transforms.Max;
-import com.google.cloud.dataflow.sdk.transforms.Min;
-import com.google.cloud.dataflow.sdk.transforms.Sum;
-import com.google.cloud.dataflow.sdk.util.common.Counter;
-import com.google.cloud.dataflow.sdk.util.common.CounterProvider;
-import com.google.cloud.dataflow.sdk.util.common.CounterSet;
-
-/**
- * An implementation of the {@code Aggregator} interface that uses a
- * {@link Counter} as the underlying representation. Supports {@link CombineFn}s
- * from the {@link Sum}, {@link Min} and {@link Max} classes.
- *
- * @param <InputT> the type of input values
- * @param <AccumT> the type of accumulator values
- * @param <OutputT> the type of output value
- */
-public class CounterAggregator<InputT, AccumT, OutputT> implements Aggregator<InputT, OutputT> {
-
- private final Counter<InputT> counter;
- private final CombineFn<InputT, AccumT, OutputT> combiner;
-
- /**
- * Constructs a new aggregator with the given name and aggregation logic
- * specified in the CombineFn argument. The underlying counter is
- * automatically added into the provided CounterSet.
- *
- * <p>If a counter with the same name already exists, it will be reused, as
- * long as it has the same type.
- */
- public CounterAggregator(String name, CombineFn<? super InputT, AccumT, OutputT> combiner,
- CounterSet.AddCounterMutator addCounterMutator) {
- // Safe contravariant cast
- this(constructCounter(name, combiner), addCounterMutator,
- (CombineFn<InputT, AccumT, OutputT>) combiner);
- }
-
- private CounterAggregator(Counter<InputT> counter,
- CounterSet.AddCounterMutator addCounterMutator,
- CombineFn<InputT, AccumT, OutputT> combiner) {
- try {
- this.counter = addCounterMutator.addCounter(counter);
- } catch (IllegalArgumentException ex) {
- throw new IllegalArgumentException(
- "aggregator's name collides with an existing aggregator "
- + "or system-provided counter of an incompatible type");
- }
- this.combiner = combiner;
- }
-
- private static <T> Counter<T> constructCounter(String name,
- CombineFn<? super T, ?, ?> combiner) {
- if (combiner instanceof CounterProvider) {
- @SuppressWarnings("unchecked")
- CounterProvider<T> counterProvider = (CounterProvider<T>) combiner;
- return counterProvider.getCounter(name);
- } else {
- throw new IllegalArgumentException("unsupported combiner in Aggregator: "
- + combiner.getClass().getName());
- }
- }
-
- @Override
- public void addValue(InputT value) {
- counter.addValue(value);
- }
-
- @Override
- public String getName() {
- return counter.getName();
- }
-
- @Override
- public CombineFn<InputT, ?, OutputT> getCombineFn() {
- return combiner;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CredentialFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CredentialFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CredentialFactory.java
deleted file mode 100644
index 4913a1e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CredentialFactory.java
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.client.auth.oauth2.Credential;
-
-import java.io.IOException;
-import java.security.GeneralSecurityException;
-
-/**
- * Constructs an OAuth credential to be used by the SDK and the SDK workers.
- */
-public interface CredentialFactory {
- public Credential getCredential() throws IOException, GeneralSecurityException;
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Credentials.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Credentials.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Credentials.java
deleted file mode 100644
index 671b131..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Credentials.java
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.client.auth.oauth2.Credential;
-import com.google.api.client.extensions.java6.auth.oauth2.AbstractPromptReceiver;
-import com.google.api.client.extensions.java6.auth.oauth2.AuthorizationCodeInstalledApp;
-import com.google.api.client.googleapis.auth.oauth2.GoogleAuthorizationCodeFlow;
-import com.google.api.client.googleapis.auth.oauth2.GoogleClientSecrets;
-import com.google.api.client.googleapis.auth.oauth2.GoogleCredential;
-import com.google.api.client.googleapis.auth.oauth2.GoogleOAuthConstants;
-import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport;
-import com.google.api.client.http.GenericUrl;
-import com.google.api.client.http.HttpTransport;
-import com.google.api.client.json.JsonFactory;
-import com.google.api.client.json.jackson2.JacksonFactory;
-import com.google.api.client.util.store.FileDataStoreFactory;
-import com.google.cloud.dataflow.sdk.options.GcpOptions;
-import com.google.common.base.Preconditions;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.io.FileReader;
-import java.io.IOException;
-import java.security.GeneralSecurityException;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-
-/**
- * Provides support for loading credentials.
- */
-public class Credentials {
-
- private static final Logger LOG = LoggerFactory.getLogger(Credentials.class);
-
- /**
- * OAuth 2.0 scopes used by a local worker (not on GCE).
- * The scope cloud-platform provides access to all Cloud Platform resources.
- * cloud-platform isn't sufficient yet for talking to datastore so we request
- * those resources separately.
- *
- * <p>Note that trusted scope relationships don't apply to OAuth tokens, so for
- * services we access directly (GCS) as opposed to through the backend
- * (BigQuery, GCE), we need to explicitly request that scope.
- */
- private static final List<String> SCOPES = Arrays.asList(
- "https://www.googleapis.com/auth/cloud-platform",
- "https://www.googleapis.com/auth/devstorage.full_control",
- "https://www.googleapis.com/auth/userinfo.email",
- "https://www.googleapis.com/auth/datastore");
-
- private static class PromptReceiver extends AbstractPromptReceiver {
- @Override
- public String getRedirectUri() {
- return GoogleOAuthConstants.OOB_REDIRECT_URI;
- }
- }
-
- /**
- * Initializes OAuth2 credentials.
- *
- * <p>This can use 3 different mechanisms for obtaining a credential:
- * <ol>
- * <li>
- * It can fetch the
- * <a href="https://developers.google.com/accounts/docs/application-default-credentials">
- * application default credentials</a>.
- * </li>
- * <li>
- * The user can specify a client secrets file and go through the OAuth2
- * webflow. The credential will then be cached in the user's home
- * directory for reuse. Provide the property "secrets_file" to use this
- * mechanism.
- * </li>
- * <li>
- * The user can specify a file containing a service account.
- * Provide the properties "service_account_keyfile" and
- * "service_account_name" to use this mechanism.
- * </li>
- * </ol>
- * The default mechanism is to use the
- * <a href="https://developers.google.com/accounts/docs/application-default-credentials">
- * application default credentials</a>. The other options can be used by providing the
- * corresponding properties.
- */
- public static Credential getCredential(GcpOptions options)
- throws IOException, GeneralSecurityException {
- String keyFile = options.getServiceAccountKeyfile();
- String accountName = options.getServiceAccountName();
-
- if (keyFile != null && accountName != null) {
- try {
- return getCredentialFromFile(keyFile, accountName, SCOPES);
- } catch (GeneralSecurityException e) {
- throw new IOException("Unable to obtain credentials from file", e);
- }
- }
-
- if (options.getSecretsFile() != null) {
- return getCredentialFromClientSecrets(options, SCOPES);
- }
-
- try {
- return GoogleCredential.getApplicationDefault().createScoped(SCOPES);
- } catch (IOException e) {
- throw new RuntimeException("Unable to get application default credentials. Please see "
- + "https://developers.google.com/accounts/docs/application-default-credentials "
- + "for details on how to specify credentials. This version of the SDK is "
- + "dependent on the gcloud core component version 2015.02.05 or newer to "
- + "be able to get credentials from the currently authorized user via gcloud auth.", e);
- }
- }
-
- /**
- * Loads OAuth2 credential from a local file.
- */
- private static Credential getCredentialFromFile(
- String keyFile, String accountId, Collection<String> scopes)
- throws IOException, GeneralSecurityException {
- GoogleCredential credential = new GoogleCredential.Builder()
- .setTransport(Transport.getTransport())
- .setJsonFactory(Transport.getJsonFactory())
- .setServiceAccountId(accountId)
- .setServiceAccountScopes(scopes)
- .setServiceAccountPrivateKeyFromP12File(new File(keyFile))
- .build();
-
- LOG.info("Created credential from file {}", keyFile);
- return credential;
- }
-
- /**
- * Loads OAuth2 credential from client secrets, which may require an
- * interactive authorization prompt.
- */
- private static Credential getCredentialFromClientSecrets(
- GcpOptions options, Collection<String> scopes)
- throws IOException, GeneralSecurityException {
- String clientSecretsFile = options.getSecretsFile();
-
- Preconditions.checkArgument(clientSecretsFile != null);
- HttpTransport httpTransport = GoogleNetHttpTransport.newTrustedTransport();
-
- JsonFactory jsonFactory = JacksonFactory.getDefaultInstance();
- GoogleClientSecrets clientSecrets;
-
- try {
- clientSecrets = GoogleClientSecrets.load(jsonFactory,
- new FileReader(clientSecretsFile));
- } catch (IOException e) {
- throw new RuntimeException(
- "Could not read the client secrets from file: " + clientSecretsFile,
- e);
- }
-
- FileDataStoreFactory dataStoreFactory =
- new FileDataStoreFactory(new java.io.File(options.getCredentialDir()));
-
- GoogleAuthorizationCodeFlow flow = new GoogleAuthorizationCodeFlow.Builder(
- httpTransport, jsonFactory, clientSecrets, scopes)
- .setDataStoreFactory(dataStoreFactory)
- .setTokenServerUrl(new GenericUrl(options.getTokenServerUrl()))
- .setAuthorizationServerEncodedUrl(options.getAuthorizationServerEncodedUrl())
- .build();
-
- // The credentialId identifies the credential if we're using a persistent
- // credential store.
- Credential credential =
- new AuthorizationCodeInstalledApp(flow, new PromptReceiver())
- .authorize(options.getCredentialId());
-
- LOG.info("Got credential from client secret");
- return credential;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DataflowPathValidator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DataflowPathValidator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DataflowPathValidator.java
deleted file mode 100644
index cfb120c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DataflowPathValidator.java
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
-import com.google.common.base.Preconditions;
-
-import java.io.IOException;
-
-/**
- * GCP implementation of {@link PathValidator}. Only GCS paths are allowed.
- */
-public class DataflowPathValidator implements PathValidator {
-
-  /** Options supplying the GCS utility and the runner class named in error messages. */
-  private final DataflowPipelineOptions dataflowOptions;
-
-  DataflowPathValidator(DataflowPipelineOptions options) {
-    this.dataflowOptions = options;
-  }
-
-  /** Creates a validator from the {@link DataflowPipelineOptions} view of {@code options}. */
-  public static DataflowPathValidator fromOptions(PipelineOptions options) {
-    return new DataflowPathValidator(options.as(DataflowPipelineOptions.class));
-  }
-
-  /**
-   * Validates that the input GCS file pattern is supported and well formed, and that
-   * its bucket exists.
-   *
-   * @param filepattern the {@code gs://} file pattern to validate
-   * @return the resource name of the validated path
-   * @throws IllegalArgumentException if the pattern is not a valid, supported, absolute GCS
-   *     path or its bucket cannot be found
-   */
-  @Override
-  public String validateInputFilePatternSupported(String filepattern) {
-    GcsPath gcsPath = getGcsPath(filepattern);
-    Preconditions.checkArgument(
-        dataflowOptions.getGcsUtil().isGcsPatternSupported(gcsPath.getObject()),
-        "Unsupported GCS file pattern: %s", filepattern);
-    String returnValue = verifyPath(filepattern);
-    verifyPathIsAccessible(filepattern, "Could not find file %s");
-    return returnValue;
-  }
-
-  /**
-   * Validates that the output GCS path is well formed and that its bucket exists.
-   *
-   * @param filePrefix the {@code gs://} output prefix to validate
-   * @return the resource name of the validated path
-   * @throws IllegalArgumentException if the prefix is not a valid absolute GCS path or its
-   *     bucket cannot be found
-   */
-  @Override
-  public String validateOutputFilePrefixSupported(String filePrefix) {
-    String returnValue = verifyPath(filePrefix);
-    verifyPathIsAccessible(filePrefix, "Output path does not exist or is not writeable: %s");
-    return returnValue;
-  }
-
-  /**
-   * Checks that {@code path} is an absolute GCS path whose object name contains no
-   * consecutive slashes.
-   *
-   * @return the resource name of the path
-   * @throws IllegalArgumentException if either check fails or the path is not a GCS path
-   */
-  @Override
-  public String verifyPath(String path) {
-    GcsPath gcsPath = getGcsPath(path);
-    Preconditions.checkArgument(gcsPath.isAbsolute(),
-        "Must provide absolute paths for Dataflow");
-    Preconditions.checkArgument(!gcsPath.getObject().contains("//"),
-        "Dataflow Service does not allow objects with consecutive slashes");
-    return gcsPath.toResourceName();
-  }
-
-  /**
-   * Fails with {@code errorMessage} (formatted with {@code path}) when the bucket of
-   * {@code path} does not exist. An {@link IOException} raised by the existence check is
-   * rethrown as a {@link RuntimeException}.
-   */
-  private void verifyPathIsAccessible(String path, String errorMessage) {
-    GcsPath gcsPath = getGcsPath(path);
-    try {
-      Preconditions.checkArgument(dataflowOptions.getGcsUtil().bucketExists(gcsPath),
-          errorMessage, path);
-    } catch (IOException e) {
-      throw new RuntimeException(
-          String.format("Unable to verify that GCS bucket gs://%s exists.", gcsPath.getBucket()),
-          e);
-    }
-  }
-
-  /**
-   * Parses {@code path} as a GCS URI, rethrowing parse failures with a message that names
-   * the configured runner and the offending path.
-   */
-  private GcsPath getGcsPath(String path) {
-    try {
-      return GcsPath.fromUri(path);
-    } catch (IllegalArgumentException e) {
-      throw new IllegalArgumentException(String.format(
-          "%s expected a valid 'gs://' path but was given '%s'",
-          dataflowOptions.getRunner().getSimpleName(), path), e);
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DataflowReleaseInfo.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DataflowReleaseInfo.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DataflowReleaseInfo.java
deleted file mode 100644
index 39b3005..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DataflowReleaseInfo.java
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.client.json.GenericJson;
-import com.google.api.client.util.Key;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Properties;
-
-/**
- * Utilities for working with the Dataflow distribution.
- *
- * <p>The single instance is populated from the SDK properties resource on the classpath;
- * when that resource is missing the built-in defaults for name and version are kept.
- */
-public final class DataflowReleaseInfo extends GenericJson {
-  private static final Logger LOG = LoggerFactory.getLogger(DataflowReleaseInfo.class);
-
-  private static final String DATAFLOW_PROPERTIES_PATH =
-      "/com/google/cloud/dataflow/sdk/sdk.properties";
-
-  /** Holder class so the instance is only built when {@link #getReleaseInfo()} is first used. */
-  private static class LazyInit {
-    private static final DataflowReleaseInfo INSTANCE =
-        new DataflowReleaseInfo(DATAFLOW_PROPERTIES_PATH);
-  }
-
-  /**
-   * Returns an instance of DataflowReleaseInfo.
-   */
-  public static DataflowReleaseInfo getReleaseInfo() {
-    return LazyInit.INSTANCE;
-  }
-
-  @Key private String name = "Google Cloud Dataflow Java SDK";
-  @Key private String version = "Unknown";
-
-  /** Provides the SDK name. */
-  public String getName() {
-    return name;
-  }
-
-  /** Provides the SDK version. */
-  public String getVersion() {
-    return version;
-  }
-
-  /**
-   * Loads the properties found at {@code resourcePath} and stores every entry except
-   * {@code name} on this object.
-   *
-   * @param resourcePath classpath location of the properties resource to read
-   */
-  private DataflowReleaseInfo(String resourcePath) {
-    Properties properties = new Properties();
-
-    // Read from the path that was passed in (the same one reported in the warning below) and
-    // let try-with-resources close the stream. A null resource is permitted by
-    // try-with-resources and is simply not closed.
-    try (InputStream in = DataflowReleaseInfo.class.getResourceAsStream(resourcePath)) {
-      if (in == null) {
-        LOG.warn("Dataflow properties resource not found: {}", resourcePath);
-        return;
-      }
-      properties.load(in);
-    } catch (IOException e) {
-      // Best effort: keep whatever was loaded before the failure.
-      LOG.warn("Error loading Dataflow properties resource: ", e);
-    }
-
-    for (String propertyName : properties.stringPropertyNames()) {
-      if (propertyName.equals("name")) {
-        // We don't allow the properties to override the SDK name.
-        continue;
-      }
-      put(propertyName, properties.getProperty(propertyName));
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DirectModeExecutionContext.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DirectModeExecutionContext.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DirectModeExecutionContext.java
deleted file mode 100644
index 6e97053..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DirectModeExecutionContext.java
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner.ValueWithMetadata;
-import com.google.cloud.dataflow.sdk.util.common.worker.StateSampler;
-import com.google.cloud.dataflow.sdk.util.state.InMemoryStateInternals;
-import com.google.cloud.dataflow.sdk.util.state.StateInternals;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-import java.util.List;
-import java.util.Map;
-
-/**
- * {@link ExecutionContext} for use in direct mode.
- */
-public class DirectModeExecutionContext
- extends BaseExecutionContext<DirectModeExecutionContext.StepContext> {
-
- private Object key;
- private List<ValueWithMetadata<?>> output = Lists.newArrayList();
- private Map<TupleTag<?>, List<ValueWithMetadata<?>>> sideOutputs = Maps.newHashMap();
-
- protected DirectModeExecutionContext() {}
-
- public static DirectModeExecutionContext create() {
- return new DirectModeExecutionContext();
- }
-
- @Override
- protected StepContext createStepContext(
- String stepName, String transformName, StateSampler stateSampler) {
- return new StepContext(this, stepName, transformName);
- }
-
- public Object getKey() {
- return key;
- }
-
- public void setKey(Object newKey) {
- // The direct mode runner may reorder elements, so we need to keep
- // around the state used for each key.
- for (ExecutionContext.StepContext stepContext : getAllStepContexts()) {
- ((StepContext) stepContext).switchKey(newKey);
- }
- key = newKey;
- }
-
- @Override
- public void noteOutput(WindowedValue<?> outputElem) {
- output.add(ValueWithMetadata.of(outputElem).withKey(getKey()));
- }
-
- @Override
- public void noteSideOutput(TupleTag<?> tag, WindowedValue<?> outputElem) {
- List<ValueWithMetadata<?>> output = sideOutputs.get(tag);
- if (output == null) {
- output = Lists.newArrayList();
- sideOutputs.put(tag, output);
- }
- output.add(ValueWithMetadata.of(outputElem).withKey(getKey()));
- }
-
- public <T> List<ValueWithMetadata<T>> getOutput(@SuppressWarnings("unused") TupleTag<T> tag) {
- @SuppressWarnings({"unchecked", "rawtypes"}) // Cast not expressible without rawtypes
- List<ValueWithMetadata<T>> typedOutput = (List) output;
- return typedOutput;
- }
-
- public <T> List<ValueWithMetadata<T>> getSideOutput(TupleTag<T> tag) {
- if (sideOutputs.containsKey(tag)) {
- @SuppressWarnings({"unchecked", "rawtypes"}) // Cast not expressible without rawtypes
- List<ValueWithMetadata<T>> typedOutput = (List) sideOutputs.get(tag);
- return typedOutput;
- } else {
- return Lists.newArrayList();
- }
- }
-
- /**
- * {@link ExecutionContext.StepContext} used in direct mode.
- */
- public static class StepContext extends BaseExecutionContext.StepContext {
-
- /** A map from each key to the state associated with it. */
- private final Map<Object, InMemoryStateInternals<Object>> stateInternals = Maps.newHashMap();
- private InMemoryStateInternals<Object> currentStateInternals = null;
-
- private StepContext(ExecutionContext executionContext, String stepName, String transformName) {
- super(executionContext, stepName, transformName);
- switchKey(null);
- }
-
- public void switchKey(Object newKey) {
- currentStateInternals = stateInternals.get(newKey);
- if (currentStateInternals == null) {
- currentStateInternals = InMemoryStateInternals.forKey(newKey);
- stateInternals.put(newKey, currentStateInternals);
- }
- }
-
- @Override
- public StateInternals<Object> stateInternals() {
- return checkNotNull(currentStateInternals);
- }
-
- @Override
- public TimerInternals timerInternals() {
- throw new UnsupportedOperationException("Direct mode cannot return timerInternals");
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DirectSideInputReader.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DirectSideInputReader.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DirectSideInputReader.java
deleted file mode 100644
index ee8c922..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DirectSideInputReader.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.common.base.Predicate;
-import com.google.common.collect.Iterables;
-
-/**
- * Basic side input reader wrapping a {@link PTuple} of side input iterables. Encapsulates
- * conversion according to the {@link PCollectionView} and projection to a particular
- * window.
- */
-public class DirectSideInputReader implements SideInputReader {
-
- private PTuple sideInputValues;
-
- private DirectSideInputReader(PTuple sideInputValues) {
- this.sideInputValues = sideInputValues;
- }
-
- public static DirectSideInputReader of(PTuple sideInputValues) {
- return new DirectSideInputReader(sideInputValues);
- }
-
- @Override
- public <T> boolean contains(PCollectionView<T> view) {
- return sideInputValues.has(view.getTagInternal());
- }
-
- @Override
- public boolean isEmpty() {
- return sideInputValues.isEmpty();
- }
-
- @Override
- public <T> T get(PCollectionView<T> view, final BoundedWindow window) {
- final TupleTag<Iterable<WindowedValue<?>>> tag = view.getTagInternal();
- if (!sideInputValues.has(tag)) {
- throw new IllegalArgumentException("calling getSideInput() with unknown view");
- }
-
- if (view.getWindowingStrategyInternal().getWindowFn() instanceof GlobalWindows) {
- return view.fromIterableInternal(sideInputValues.get(tag));
- } else {
- return view.fromIterableInternal(
- Iterables.filter(sideInputValues.get(tag),
- new Predicate<WindowedValue<?>>() {
- @Override
- public boolean apply(WindowedValue<?> element) {
- return element.getWindows().contains(window);
- }
- }));
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DoFnInfo.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DoFnInfo.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DoFnInfo.java
deleted file mode 100644
index 15a3a47..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DoFnInfo.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-
-import java.io.Serializable;
-
-/**
- * Serializable holder for a {@link DoFn} together with its windowing strategy and,
- * optionally, its side input views and input coder.
- *
- * @param <InputT> the type of the (main) input elements of the DoFn
- * @param <OutputT> the type of the (main) output elements of the DoFn
- */
-public class DoFnInfo<InputT, OutputT> implements Serializable {
-  private final DoFn<InputT, OutputT> doFn;
-  private final WindowingStrategy<?, ?> windowingStrategy;
-  private final Iterable<PCollectionView<?>> sideInputViews;
-  private final Coder<InputT> inputCoder;
-
-  /** Creates an info object whose side input views and input coder are both {@code null}. */
-  public DoFnInfo(DoFn<InputT, OutputT> doFn, WindowingStrategy<?, ?> windowingStrategy) {
-    this(doFn, windowingStrategy, null, null);
-  }
-
-  /** Creates an info object holding all four components exactly as given. */
-  public DoFnInfo(DoFn<InputT, OutputT> doFn, WindowingStrategy<?, ?> windowingStrategy,
-      Iterable<PCollectionView<?>> sideInputViews, Coder<InputT> inputCoder) {
-    this.doFn = doFn;
-    this.windowingStrategy = windowingStrategy;
-    this.sideInputViews = sideInputViews;
-    this.inputCoder = inputCoder;
-  }
-
-  /** Returns the wrapped {@link DoFn}. */
-  public DoFn<InputT, OutputT> getDoFn() {
-    return doFn;
-  }
-
-  /** Returns the windowing strategy supplied at construction. */
-  public WindowingStrategy<?, ?> getWindowingStrategy() {
-    return windowingStrategy;
-  }
-
-  /** Returns the side input views, or {@code null} if none were supplied. */
-  public Iterable<PCollectionView<?>> getSideInputViews() {
-    return sideInputViews;
-  }
-
-  /** Returns the input coder, or {@code null} if none was supplied. */
-  public Coder<InputT> getInputCoder() {
-    return inputCoder;
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DoFnRunner.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DoFnRunner.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DoFnRunner.java
deleted file mode 100644
index 51c3f39..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DoFnRunner.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.DoFn.ProcessContext;
-import com.google.cloud.dataflow.sdk.values.KV;
-
-/**
- * A wrapper interface that represents the execution of a {@link DoFn}.
- *
- * @param <InputT> the type of the DoFn's (main) input elements
- * @param <OutputT> the type of the DoFn's (main) output elements
- */
-public interface DoFnRunner<InputT, OutputT> {
-  /**
-   * Prepares and calls {@link DoFn#startBundle}.
-   */
-  void startBundle();
-
-  /**
-   * Calls {@link DoFn#processElement} with a {@link ProcessContext} containing the current element.
-   */
-  void processElement(WindowedValue<InputT> elem);
-
-  /**
-   * Calls {@link DoFn#finishBundle} and performs additional tasks, such as
-   * flushing in-memory states.
-   */
-  void finishBundle();
-
-  /**
-   * An internal interface for signaling that a {@link DoFn} requires late data dropping.
-   */
-  interface ReduceFnExecutor<K, InputT, OutputT, W> {
-    /**
-     * Gets this object as a {@link DoFn}.
-     *
-     * <p>Most implementors of this interface are expected to be {@link DoFn} instances, and
-     * will return themselves.
-     */
-    DoFn<KeyedWorkItem<K, InputT>, KV<K, OutputT>> asDoFn();
-
-    /**
-     * Returns an aggregator that tracks elements that are dropped due to being late.
-     */
-    Aggregator<Long, Long> getDroppedDueToLatenessAggregator();
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DoFnRunnerBase.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DoFnRunnerBase.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DoFnRunnerBase.java
deleted file mode 100644
index 04ec59f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DoFnRunnerBase.java
+++ /dev/null
@@ -1,558 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.IterableCoder;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.DoFn.RequiresWindowAccess;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo;
-import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
-import com.google.cloud.dataflow.sdk.util.DoFnRunners.OutputManager;
-import com.google.cloud.dataflow.sdk.util.ExecutionContext.StepContext;
-import com.google.cloud.dataflow.sdk.util.common.CounterSet;
-import com.google.cloud.dataflow.sdk.util.state.StateInternals;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;
-
-import org.joda.time.Instant;
-import org.joda.time.format.PeriodFormat;
-
-import java.io.IOException;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-/**
- * A base implementation of {@link DoFnRunner}.
- *
- * <p> Sub-classes should override {@link #invokeProcessElement}.
- */
-public abstract class DoFnRunnerBase<InputT, OutputT> implements DoFnRunner<InputT, OutputT> {
-
- /** The DoFn being run. */
- public final DoFn<InputT, OutputT> fn;
-
- /** The context used for running the DoFn. */
- public final DoFnContext<InputT, OutputT> context;
-
-  protected DoFnRunnerBase(
-      PipelineOptions options,
-      DoFn<InputT, OutputT> fn,
-      SideInputReader sideInputReader,
-      OutputManager outputManager,
-      TupleTag<OutputT> mainOutputTag,
-      List<TupleTag<?>> sideOutputTags,
-      StepContext stepContext,
-      CounterSet.AddCounterMutator addCounterMutator,
-      WindowingStrategy<?, ?> windowingStrategy) {
-    // Without a windowing strategy the context is built with no WindowFn at all.
-    WindowFn<?, ?> windowFn = null;
-    if (windowingStrategy != null) {
-      windowFn = windowingStrategy.getWindowFn();
-    }
-
-    this.fn = fn;
-    this.context = new DoFnContext<>(
-        options,
-        fn,
-        sideInputReader,
-        outputManager,
-        mainOutputTag,
-        sideOutputTags,
-        stepContext,
-        addCounterMutator,
-        windowFn);
-  }
-
-  /**
-   * An {@code OutputManager} that collects every output in a per-tag list, for testing and
-   * in-memory contexts such as the {@link DirectPipelineRunner}.
-   */
-  public static class ListOutputManager implements OutputManager {
-
-    private Map<TupleTag<?>, List<WindowedValue<?>>> outputLists = Maps.newHashMap();
-
-    @Override
-    public <T> void output(TupleTag<T> tag, WindowedValue<T> output) {
-      List<WindowedValue<?>> untypedList = outputLists.get(tag);
-      if (untypedList == null) {
-        // First output for this tag: create and register its list.
-        untypedList = Lists.newArrayList();
-        outputLists.put(tag, untypedList);
-      }
-      untypedList.add(output);
-    }
-
-    /** Returns the values output so far for {@code tag}, or an empty list if there are none. */
-    public <T> List<WindowedValue<T>> getOutput(TupleTag<T> tag) {
-      // Safe cast by design, inexpressible in Java without rawtypes
-      @SuppressWarnings({"rawtypes", "unchecked"})
-      List<WindowedValue<T>> typedList = (List) outputLists.get(tag);
-      if (typedList == null) {
-        return Collections.<WindowedValue<T>>emptyList();
-      }
-      return typedList;
-    }
-  }
-
-  @Override
-  public void startBundle() {
-    // DoFn.startBundle may run user code, so any failure is routed through
-    // wrapUserCodeException before it propagates.
-    try {
-      fn.startBundle(context);
-    } catch (Throwable userCodeFailure) {
-      throw wrapUserCodeException(userCodeFailure);
-    }
-  }
-
-  @Override
-  public void processElement(WindowedValue<InputT> elem) {
-    // The element is only split per window when it sits in several windows AND the DoFn can
-    // observe the window, either through RequiresWindowAccess or through side inputs.
-    if (elem.getWindows().size() > 1
-        && (RequiresWindowAccess.class.isAssignableFrom(fn.getClass())
-            || !context.sideInputReader.isEmpty())) {
-      // Allocating one windowed value per window is not the cheapest option, but it keeps
-      // the processing path straightforward.
-      for (BoundedWindow singleWindow : elem.getWindows()) {
-        WindowedValue<InputT> perWindowElem = WindowedValue.of(
-            elem.getValue(), elem.getTimestamp(), singleWindow, elem.getPane());
-        invokeProcessElement(perWindowElem);
-      }
-      return;
-    }
-    invokeProcessElement(elem);
-  }
-
- /**
- * Invokes {@link DoFn#processElement} after the pre-processing in
- * {@link DoFnRunnerBase#processElement} has been done.
- *
- * <p>{@link DoFnRunnerBase#processElement} calls this either once with the element as
- * received, or once per window with a single-window copy of the element.
- *
- * @param elem the windowed element to hand to the {@link DoFn}
- */
- protected abstract void invokeProcessElement(WindowedValue<InputT> elem);
-
-  @Override
-  public void finishBundle() {
-    // DoFn.finishBundle may run user code, so any failure is routed through
-    // wrapUserCodeException before it propagates.
-    try {
-      fn.finishBundle(context);
-    } catch (Throwable userCodeFailure) {
-      throw wrapUserCodeException(userCodeFailure);
-    }
-  }
-
- /**
- * A concrete implementation of {@code DoFn.Context} used for running a {@link DoFn}.
- *
- * <p>Outputs are forwarded to the {@link OutputManager} and, when a step context is present,
- * also noted on that step context.
- *
- * @param <InputT> the type of the DoFn's (main) input elements
- * @param <OutputT> the type of the DoFn's (main) output elements
- */
- private static class DoFnContext<InputT, OutputT>
- extends DoFn<InputT, OutputT>.Context {
- // Upper bound on the number of distinct output tags tracked in outputTags (the main tag
- // and declared side output tags count towards it as well).
- private static final int MAX_SIDE_OUTPUTS = 1000;
-
- final PipelineOptions options;
- final DoFn<InputT, OutputT> fn;
- final SideInputReader sideInputReader;
- final OutputManager outputManager;
- final TupleTag<OutputT> mainOutputTag;
- final StepContext stepContext;
- final CounterSet.AddCounterMutator addCounterMutator;
- final WindowFn<?, ?> windowFn;
-
- /**
- * The set of known output tags, some of which may be undeclared, so we can throw an
- * exception when it exceeds {@link #MAX_SIDE_OUTPUTS}.
- */
- private Set<TupleTag<?>> outputTags;
-
- /**
- * Stores the collaborators, seeds the known output tags with the main tag and the declared
- * side output tags, and then sets up the delegate aggregators.
- */
- public DoFnContext(PipelineOptions options,
- DoFn<InputT, OutputT> fn,
- SideInputReader sideInputReader,
- OutputManager outputManager,
- TupleTag<OutputT> mainOutputTag,
- List<TupleTag<?>> sideOutputTags,
- StepContext stepContext,
- CounterSet.AddCounterMutator addCounterMutator,
- WindowFn<?, ?> windowFn) {
- // The superclass is an inner class of DoFn, so its enclosing instance is passed explicitly.
- fn.super();
- this.options = options;
- this.fn = fn;
- this.sideInputReader = sideInputReader;
- this.outputManager = outputManager;
- this.mainOutputTag = mainOutputTag;
- this.outputTags = Sets.newHashSet();
-
- outputTags.add(mainOutputTag);
- for (TupleTag<?> sideOutputTag : sideOutputTags) {
- outputTags.add(sideOutputTag);
- }
-
- this.stepContext = stepContext;
- this.addCounterMutator = addCounterMutator;
- this.windowFn = windowFn;
- // Called last, after every field above has been assigned.
- super.setupDelegateAggregators();
- }
-
- //////////////////////////////////////////////////////////////////////////////
-
- @Override
- public PipelineOptions getPipelineOptions() {
- return options;
- }
-
- /**
- * Builds a {@link WindowedValue} for {@code output}.
- *
- * <p>A {@code null} timestamp is replaced by {@link BoundedWindow#TIMESTAMP_MIN_VALUE}. When
- * {@code windows} is {@code null} the windows are assigned by {@code windowFn}, using an
- * assign context that exposes only the caller-supplied timestamp (if any); any exception
- * raised during that assignment is wrapped with {@code UserCodeException.wrap}.
- */
- <T, W extends BoundedWindow> WindowedValue<T> makeWindowedValue(
- T output, Instant timestamp, Collection<W> windows, PaneInfo pane) {
- // Remember what the caller passed, before the default below is applied.
- final Instant inputTimestamp = timestamp;
-
- if (timestamp == null) {
- timestamp = BoundedWindow.TIMESTAMP_MIN_VALUE;
- }
-
- if (windows == null) {
- // NOTE(review): windowFn is not null-checked here; if it is null the resulting
- // NullPointerException is caught below and wrapped - confirm that is intended.
- try {
- // The windowFn can never succeed at accessing the element, so its type does not
- // matter here
- @SuppressWarnings("unchecked")
- WindowFn<Object, W> objectWindowFn = (WindowFn<Object, W>) windowFn;
- windows = objectWindowFn.assignWindows(objectWindowFn.new AssignContext() {
- @Override
- public Object element() {
- throw new UnsupportedOperationException(
- "WindowFn attempted to access input element when none was available");
- }
-
- @Override
- public Instant timestamp() {
- if (inputTimestamp == null) {
- throw new UnsupportedOperationException(
- "WindowFn attempted to access input timestamp when none was available");
- }
- return inputTimestamp;
- }
-
- @Override
- public Collection<? extends BoundedWindow> windows() {
- throw new UnsupportedOperationException(
- "WindowFn attempted to access input windows when none were available");
- }
- });
- } catch (Exception e) {
- throw UserCodeException.wrap(e);
- }
- }
-
- return WindowedValue.of(output, timestamp, windows, pane);
- }
-
- /**
- * Reads {@code view} for the side input window that the view's window function maps
- * {@code mainInputWindow} to.
- *
- * @throws IllegalArgumentException if the side input reader does not contain {@code view}
- */
- public <T> T sideInput(PCollectionView<T> view, BoundedWindow mainInputWindow) {
- if (!sideInputReader.contains(view)) {
- throw new IllegalArgumentException("calling sideInput() with unknown view");
- }
- BoundedWindow sideInputWindow =
- view.getWindowingStrategyInternal().getWindowFn().getSideInputWindow(mainInputWindow);
- return sideInputReader.get(view, sideInputWindow);
- }
-
- /** Builds a windowed value from the parts and emits it to the main output. */
- void outputWindowedValue(
- OutputT output,
- Instant timestamp,
- Collection<? extends BoundedWindow> windows,
- PaneInfo pane) {
- outputWindowedValue(makeWindowedValue(output, timestamp, windows, pane));
- }
-
- /** Emits {@code windowedElem} under the main output tag and notes it on the step context. */
- void outputWindowedValue(WindowedValue<OutputT> windowedElem) {
- outputManager.output(mainOutputTag, windowedElem);
- if (stepContext != null) {
- stepContext.noteOutput(windowedElem);
- }
- }
-
- /** Builds a windowed value from the parts and emits it to the side output {@code tag}. */
- protected <T> void sideOutputWindowedValue(TupleTag<T> tag,
- T output,
- Instant timestamp,
- Collection<? extends BoundedWindow> windows,
- PaneInfo pane) {
- sideOutputWindowedValue(tag, makeWindowedValue(output, timestamp, windows, pane));
- }
-
- /**
- * Emits {@code windowedElem} under {@code tag} and notes it on the step context, registering
- * the tag first if it has not been seen before.
- *
- * @throws IllegalArgumentException if registering the tag would exceed
- * {@link #MAX_SIDE_OUTPUTS}
- */
- protected <T> void sideOutputWindowedValue(TupleTag<T> tag, WindowedValue<T> windowedElem) {
- if (!outputTags.contains(tag)) {
- // This tag wasn't declared nor was it seen before during this execution.
- // Thus, this must be a new, undeclared and unconsumed output.
- // To prevent likely user errors, enforce the limit on the number of side
- // outputs.
- if (outputTags.size() >= MAX_SIDE_OUTPUTS) {
- throw new IllegalArgumentException(
- "the number of side outputs has exceeded a limit of " + MAX_SIDE_OUTPUTS);
- }
- outputTags.add(tag);
- }
-
- outputManager.output(tag, windowedElem);
- if (stepContext != null) {
- stepContext.noteSideOutput(tag, windowedElem);
- }
- }
-
- // Following implementations of output, outputWithTimestamp, and sideOutput
- // are only accessible in DoFn.startBundle and DoFn.finishBundle, and will be shadowed by
- // ProcessContext's versions in DoFn.processElement.
- // Each passes null windows, so the windows are assigned by windowFn in makeWindowedValue.
- @Override
- public void output(OutputT output) {
- outputWindowedValue(output, null, null, PaneInfo.NO_FIRING);
- }
-
- @Override
- public void outputWithTimestamp(OutputT output, Instant timestamp) {
- outputWindowedValue(output, timestamp, null, PaneInfo.NO_FIRING);
- }
-
- @Override
- public <T> void sideOutput(TupleTag<T> tag, T output) {
- Preconditions.checkNotNull(tag, "TupleTag passed to sideOutput cannot be null");
- sideOutputWindowedValue(tag, output, null, null, PaneInfo.NO_FIRING);
- }
-
- @Override
- public <T> void sideOutputWithTimestamp(TupleTag<T> tag, T output, Instant timestamp) {
- Preconditions.checkNotNull(tag, "TupleTag passed to sideOutputWithTimestamp cannot be null");
- sideOutputWindowedValue(tag, output, timestamp, null, PaneInfo.NO_FIRING);
- }
-
- /**
- * Builds the internal aggregator name from the step name and {@code userName}, prefixed
- * with {@code "user-"} unless the DoFn class is annotated with {@link SystemDoFnInternal}.
- */
- private String generateInternalAggregatorName(String userName) {
- boolean system = fn.getClass().isAnnotationPresent(SystemDoFnInternal.class);
- // NOTE(review): stepContext is dereferenced without the null check used by the output
- // methods above - confirm aggregators are never created when it is null.
- return (system ? "" : "user-") + stepContext.getStepName() + "-" + userName;
- }
-
- /** Creates a {@link CounterAggregator} registered under the generated internal name. */
- @Override
- protected <AggInputT, AggOutputT> Aggregator<AggInputT, AggOutputT> createAggregatorInternal(
- String name, CombineFn<AggInputT, ?, AggOutputT> combiner) {
- Preconditions.checkNotNull(combiner,
- "Combiner passed to createAggregator cannot be null");
- return new CounterAggregator<>(generateInternalAggregatorName(name),
- combiner, addCounterMutator);
- }
- }
-
-  /**
-   * Creates the {@code DoFn.ProcessContext} through which the DoFn sees {@code elem}.
-   *
-   * <p>The returned context is bound to this runner's {@code fn} and shared {@code context}.
-   */
-  protected DoFn<InputT, OutputT>.ProcessContext createProcessContext(WindowedValue<InputT> elem) {
-    DoFnProcessContext<InputT, OutputT> processContext =
-        new DoFnProcessContext<>(fn, context, elem);
-    return processContext;
-  }
-
- /**
- * Passes {@code t} to {@link UserCodeException#wrapIf} with the condition
- * {@code !isSystemDoFn()} and throws the result.
- *
- * <p>Note that the body <em>throws</em> the exception rather than returning it, so this
- * method never returns normally. The {@code RuntimeException} return type presumably exists
- * so that callers can write {@code throw wrapUserCodeException(t)} -- confirm against the
- * call sites before changing either side.
- */
- protected RuntimeException wrapUserCodeException(Throwable t) {
- throw UserCodeException.wrapIf(!isSystemDoFn(), t);
- }
-
- /**
- * Returns whether the runtime class of {@code fn} carries the {@link SystemDoFnInternal}
- * annotation.
- */
- private boolean isSystemDoFn() {
- return fn.getClass().isAnnotationPresent(SystemDoFnInternal.class);
- }
-
-  /**
-   * A concrete implementation of {@code DoFn.ProcessContext} used for
-   * running a {@link DoFn} over a single element.
-   *
-   * <p>Element-level accessors read from the wrapped {@link WindowedValue}; outputs, side
-   * inputs and aggregators are forwarded to the shared {@code DoFnContext}.
-   *
-   * @param <InputT> the type of the DoFn's (main) input elements
-   * @param <OutputT> the type of the DoFn's (main) output elements
-   */
-  static class DoFnProcessContext<InputT, OutputT>
-      extends DoFn<InputT, OutputT>.ProcessContext {
-
-    final DoFn<InputT, OutputT> fn;
-    final DoFnContext<InputT, OutputT> context;
-    final WindowedValue<InputT> windowedValue;
-
-    public DoFnProcessContext(DoFn<InputT, OutputT> fn,
-                              DoFnContext<InputT, OutputT> context,
-                              WindowedValue<InputT> windowedValue) {
-      // ProcessContext is an inner class of DoFn, so the enclosing DoFn instance is supplied
-      // through the qualified superclass constructor call.
-      fn.super();
-      this.fn = fn;
-      this.context = context;
-      this.windowedValue = windowedValue;
-    }
-
-    @Override
-    public PipelineOptions getPipelineOptions() {
-      return context.getPipelineOptions();
-    }
-
-    /** Returns the value of the element currently being processed. */
-    @Override
-    public InputT element() {
-      return windowedValue.getValue();
-    }
-
-    /**
-     * Reads {@code view} for the single window of the current element.
-     *
-     * <p>An element that is in no window is treated as being in {@link GlobalWindow#INSTANCE}
-     * when the context's window function is {@link GlobalWindows}.
-     *
-     * @throws NullPointerException if {@code view} is null
-     * @throws IllegalStateException if the element is in several windows, or in none while
-     *     the window function is not {@link GlobalWindows}
-     */
-    @Override
-    public <T> T sideInput(PCollectionView<T> view) {
-      Preconditions.checkNotNull(view, "View passed to sideInput cannot be null");
-      Iterator<? extends BoundedWindow> windowIter = windows().iterator();
-      BoundedWindow window;
-      if (!windowIter.hasNext()) {
-        if (context.windowFn instanceof GlobalWindows) {
-          // TODO: Remove this once GroupByKeyOnly no longer outputs elements
-          // without windows
-          window = GlobalWindow.INSTANCE;
-        } else {
-          throw new IllegalStateException(
-              "sideInput called when main input element is not in any windows");
-        }
-      } else {
-        window = windowIter.next();
-        if (windowIter.hasNext()) {
-          throw new IllegalStateException(
-              "sideInput called when main input element is in multiple windows");
-        }
-      }
-      return context.sideInput(view, window);
-    }
-
-    /**
-     * Returns the only window of the current element.
-     *
-     * @throws UnsupportedOperationException if the DoFn does not implement
-     *     {@link RequiresWindowAccess}
-     */
-    @Override
-    public BoundedWindow window() {
-      if (!(fn instanceof RequiresWindowAccess)) {
-        // The message names the marker interface that is actually checked above.
-        throw new UnsupportedOperationException(
-            "window() is only available in the context of a DoFn marked as "
-                + "RequiresWindowAccess.");
-      }
-      return Iterables.getOnlyElement(windows());
-    }
-
-    @Override
-    public PaneInfo pane() {
-      return windowedValue.getPane();
-    }
-
-    /** Emits {@code output} with the timestamp, windows and pane of the current element. */
-    @Override
-    public void output(OutputT output) {
-      context.outputWindowedValue(windowedValue.withValue(output));
-    }
-
-    /**
-     * Emits {@code output} with an explicit {@code timestamp}, keeping the windows and pane
-     * of the current element.
-     *
-     * @throws IllegalArgumentException if {@code timestamp} is earlier than the input
-     *     timestamp minus the DoFn's allowed skew
-     */
-    @Override
-    public void outputWithTimestamp(OutputT output, Instant timestamp) {
-      checkTimestamp(timestamp);
-      context.outputWindowedValue(output, timestamp,
-          windowedValue.getWindows(), windowedValue.getPane());
-    }
-
-    /** Forwards a fully specified output to the shared context without further checks. */
-    void outputWindowedValue(
-        OutputT output,
-        Instant timestamp,
-        Collection<? extends BoundedWindow> windows,
-        PaneInfo pane) {
-      context.outputWindowedValue(output, timestamp, windows, pane);
-    }
-
-    /**
-     * Emits {@code output} to the side output {@code tag} with the timestamp, windows and
-     * pane of the current element.
-     *
-     * @throws NullPointerException if {@code tag} is null
-     */
-    @Override
-    public <T> void sideOutput(TupleTag<T> tag, T output) {
-      Preconditions.checkNotNull(tag, "Tag passed to sideOutput cannot be null");
-      context.sideOutputWindowedValue(tag, windowedValue.withValue(output));
-    }
-
-    /**
-     * Emits {@code output} to the side output {@code tag} with an explicit {@code timestamp},
-     * keeping the windows and pane of the current element.
-     *
-     * @throws NullPointerException if {@code tag} is null
-     * @throws IllegalArgumentException if {@code timestamp} is earlier than the input
-     *     timestamp minus the DoFn's allowed skew
-     */
-    @Override
-    public <T> void sideOutputWithTimestamp(TupleTag<T> tag, T output, Instant timestamp) {
-      Preconditions.checkNotNull(tag, "Tag passed to sideOutputWithTimestamp cannot be null");
-      checkTimestamp(timestamp);
-      context.sideOutputWindowedValue(
-          tag, output, timestamp, windowedValue.getWindows(), windowedValue.getPane());
-    }
-
-    @Override
-    public Instant timestamp() {
-      return windowedValue.getTimestamp();
-    }
-
-    /** Returns every window the current element belongs to. */
-    public Collection<? extends BoundedWindow> windows() {
-      return windowedValue.getWindows();
-    }
-
-    /**
-     * Rejects output timestamps that lie further in the past than the DoFn's allowed skew
-     * permits, measured from the timestamp of the current input element.
-     */
-    private void checkTimestamp(Instant timestamp) {
-      if (timestamp.isBefore(windowedValue.getTimestamp().minus(fn.getAllowedTimestampSkew()))) {
-        throw new IllegalArgumentException(String.format(
-            "Cannot output with timestamp %s. Output timestamps must be no earlier than the "
-            + "timestamp of the current input (%s) minus the allowed skew (%s). See the "
-            + "DoFn#getAllowedTimestampSkew() Javadoc for details on changing the allowed skew.",
-            timestamp, windowedValue.getTimestamp(),
-            PeriodFormat.getDefault().print(fn.getAllowedTimestampSkew().toPeriod())));
-      }
-    }
-
-    /**
-     * Returns a {@link WindowingInternals} view whose element-level accessors read from the
-     * current element and whose remaining operations are forwarded to the shared context and
-     * its step context.
-     */
-    @Override
-    public WindowingInternals<InputT, OutputT> windowingInternals() {
-      return new WindowingInternals<InputT, OutputT>() {
-        @Override
-        public void outputWindowedValue(OutputT output, Instant timestamp,
-            Collection<? extends BoundedWindow> windows, PaneInfo pane) {
-          context.outputWindowedValue(output, timestamp, windows, pane);
-        }
-
-        @Override
-        public Collection<? extends BoundedWindow> windows() {
-          return windowedValue.getWindows();
-        }
-
-        @Override
-        public PaneInfo pane() {
-          return windowedValue.getPane();
-        }
-
-        @Override
-        public TimerInternals timerInternals() {
-          return context.stepContext.timerInternals();
-        }
-
-        @Override
-        public <T> void writePCollectionViewData(
-            TupleTag<?> tag,
-            Iterable<WindowedValue<T>> data,
-            Coder<T> elemCoder) throws IOException {
-          @SuppressWarnings("unchecked")
-          Coder<BoundedWindow> windowCoder = (Coder<BoundedWindow>) context.windowFn.windowCoder();
-
-          // window() is the enclosing context's accessor, so it is subject to the
-          // RequiresWindowAccess check and the single-window requirement.
-          context.stepContext.writePCollectionViewData(
-              tag, data, IterableCoder.of(WindowedValue.getFullCoder(elemCoder, windowCoder)),
-              window(), windowCoder);
-        }
-
-        @Override
-        public StateInternals<?> stateInternals() {
-          return context.stepContext.stateInternals();
-        }
-
-        @Override
-        public <T> T sideInput(PCollectionView<T> view, BoundedWindow mainInputWindow) {
-          return context.sideInput(view, mainInputWindow);
-        }
-      };
-    }
-
-    @Override
-    protected <AggregatorInputT, AggregatorOutputT> Aggregator<AggregatorInputT, AggregatorOutputT>
-        createAggregatorInternal(
-            String name, CombineFn<AggregatorInputT, ?, AggregatorOutputT> combiner) {
-      return context.createAggregatorInternal(name, combiner);
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DoFnRunners.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DoFnRunners.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DoFnRunners.java
deleted file mode 100644
index d56b36e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/DoFnRunners.java
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.DoFnRunner.ReduceFnExecutor;
-import com.google.cloud.dataflow.sdk.util.ExecutionContext.StepContext;
-import com.google.cloud.dataflow.sdk.util.common.CounterSet;
-import com.google.cloud.dataflow.sdk.util.common.CounterSet.AddCounterMutator;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-
-import java.util.List;
-
-/**
- * Factory methods for obtaining {@link DoFnRunner} instances.
- */
-public class DoFnRunners {
-  /**
-   * Describes how output receivers are reached and how elements are handed to them.
-   */
-  public interface OutputManager {
-    /**
-     * Sends one element to the receiver identified by the given {@link TupleTag}.
-     */
-    <T> void output(TupleTag<T> tag, WindowedValue<T> output);
-  }
-
-  /**
-   * Creates the general-purpose {@link DoFnRunner}, suitable for most {@link DoFn DoFns}.
-   *
-   * <p>It invokes {@link DoFn#processElement} for each input.
-   */
-  public static <InputT, OutputT> DoFnRunner<InputT, OutputT> simpleRunner(
-      PipelineOptions options,
-      DoFn<InputT, OutputT> fn,
-      SideInputReader sideInputReader,
-      OutputManager outputManager,
-      TupleTag<OutputT> mainOutputTag,
-      List<TupleTag<?>> sideOutputTags,
-      StepContext stepContext,
-      CounterSet.AddCounterMutator addCounterMutator,
-      WindowingStrategy<?, ?> windowingStrategy) {
-    DoFnRunner<InputT, OutputT> runner =
-        new SimpleDoFnRunner<>(
-            options,
-            fn,
-            sideInputReader,
-            outputManager,
-            mainOutputTag,
-            sideOutputTags,
-            stepContext,
-            addCounterMutator,
-            windowingStrategy);
-    return runner;
-  }
-
-  /**
-   * Creates a {@link DoFnRunner} that handles late data dropping.
-   *
-   * <p>Elements from expired windows are dropped before they reach the underlying
-   * {@link DoFn}. The returned runner wraps a {@link #simpleRunner} built around
-   * {@code reduceFnExecutor.asDoFn()}.
-   */
-  public static <K, InputT, OutputT, W extends BoundedWindow>
-      DoFnRunner<KeyedWorkItem<K, InputT>, KV<K, OutputT>> lateDataDroppingRunner(
-          PipelineOptions options,
-          ReduceFnExecutor<K, InputT, OutputT, W> reduceFnExecutor,
-          SideInputReader sideInputReader,
-          OutputManager outputManager,
-          TupleTag<KV<K, OutputT>> mainOutputTag,
-          List<TupleTag<?>> sideOutputTags,
-          StepContext stepContext,
-          CounterSet.AddCounterMutator addCounterMutator,
-          WindowingStrategy<?, W> windowingStrategy) {
-    DoFnRunner<KeyedWorkItem<K, InputT>, KV<K, OutputT>> delegate =
-        simpleRunner(
-            options,
-            reduceFnExecutor.asDoFn(),
-            sideInputReader,
-            outputManager,
-            mainOutputTag,
-            sideOutputTags,
-            stepContext,
-            addCounterMutator,
-            windowingStrategy);
-    return new LateDataDroppingDoFnRunner<>(
-        delegate,
-        windowingStrategy,
-        stepContext.timerInternals(),
-        reduceFnExecutor.getDroppedDueToLatenessAggregator());
-  }
-
-  /**
-   * Picks the runner matching the kind of {@code doFn}: a {@link #lateDataDroppingRunner}
-   * when it is a {@link ReduceFnExecutor}, and a {@link #simpleRunner} otherwise.
-   */
-  public static <InputT, OutputT> DoFnRunner<InputT, OutputT> createDefault(
-      PipelineOptions options,
-      DoFn<InputT, OutputT> doFn,
-      SideInputReader sideInputReader,
-      OutputManager outputManager,
-      TupleTag<OutputT> mainOutputTag,
-      List<TupleTag<?>> sideOutputTags,
-      StepContext stepContext,
-      AddCounterMutator addCounterMutator,
-      WindowingStrategy<?, ?> windowingStrategy) {
-    if (!(doFn instanceof ReduceFnExecutor)) {
-      return simpleRunner(
-          options,
-          doFn,
-          sideInputReader,
-          outputManager,
-          mainOutputTag,
-          sideOutputTags,
-          stepContext,
-          addCounterMutator,
-          windowingStrategy);
-    }
-
-    // The static types cannot express that InputT/OutputT line up with the executor's key
-    // and value types, hence the raw types and the unchecked cast below.
-    @SuppressWarnings("rawtypes")
-    ReduceFnExecutor executor = (ReduceFnExecutor) doFn;
-    @SuppressWarnings({"unchecked", "cast", "rawtypes"})
-    DoFnRunner<InputT, OutputT> lateDropping = (DoFnRunner<InputT, OutputT>) lateDataDroppingRunner(
-        options,
-        executor,
-        sideInputReader,
-        outputManager,
-        (TupleTag) mainOutputTag,
-        sideOutputTags,
-        stepContext,
-        addCounterMutator,
-        (WindowingStrategy) windowingStrategy);
-    return lateDropping;
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ExecutableTrigger.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ExecutableTrigger.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ExecutableTrigger.java
deleted file mode 100644
index 22a3762..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ExecutableTrigger.java
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Trigger;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Trigger.OnceTrigger;
-import com.google.common.base.Preconditions;
-
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * A wrapper around a trigger used during execution. While an actual trigger may appear multiple
- * times (both in the same trigger expression and in other trigger expressions), the
- * {@code ExecutableTrigger} wrapped around them forms a tree (only one occurrence).
- *
- * @param <W> {@link BoundedWindow} subclass used to represent the windows used.
- */
-public class ExecutableTrigger<W extends BoundedWindow> implements Serializable {
-
- /** Store the index assigned to this trigger. */
- private final int triggerIndex;
- private final int firstIndexAfterSubtree;
- private final List<ExecutableTrigger<W>> subTriggers = new ArrayList<>();
- private final Trigger<W> trigger;
-
- public static <W extends BoundedWindow> ExecutableTrigger<W> create(Trigger<W> trigger) {
- return create(trigger, 0);
- }
-
- private static <W extends BoundedWindow> ExecutableTrigger<W> create(
- Trigger<W> trigger, int nextUnusedIndex) {
- if (trigger instanceof OnceTrigger) {
- return new ExecutableOnceTrigger<W>((OnceTrigger<W>) trigger, nextUnusedIndex);
- } else {
- return new ExecutableTrigger<W>(trigger, nextUnusedIndex);
- }
- }
-
- public static <W extends BoundedWindow> ExecutableTrigger<W> createForOnceTrigger(
- OnceTrigger<W> trigger, int nextUnusedIndex) {
- return new ExecutableOnceTrigger<W>(trigger, nextUnusedIndex);
- }
-
- private ExecutableTrigger(Trigger<W> trigger, int nextUnusedIndex) {
- this.trigger = Preconditions.checkNotNull(trigger, "trigger must not be null");
- this.triggerIndex = nextUnusedIndex++;
-
- if (trigger.subTriggers() != null) {
- for (Trigger<W> subTrigger : trigger.subTriggers()) {
- ExecutableTrigger<W> subExecutable = create(subTrigger, nextUnusedIndex);
- subTriggers.add(subExecutable);
- nextUnusedIndex = subExecutable.firstIndexAfterSubtree;
- }
- }
- firstIndexAfterSubtree = nextUnusedIndex;
- }
-
- public List<ExecutableTrigger<W>> subTriggers() {
- return subTriggers;
- }
-
- @Override
- public String toString() {
- return trigger.toString();
- }
-
- /**
- * Return the underlying trigger specification corresponding to this {@code ExecutableTrigger}.
- */
- public Trigger<W> getSpec() {
- return trigger;
- }
-
- public int getTriggerIndex() {
- return triggerIndex;
- }
-
- public final int getFirstIndexAfterSubtree() {
- return firstIndexAfterSubtree;
- }
-
- public boolean isCompatible(ExecutableTrigger<W> other) {
- return trigger.isCompatible(other.trigger);
- }
-
- public ExecutableTrigger<W> getSubTriggerContaining(int index) {
- Preconditions.checkNotNull(subTriggers);
- Preconditions.checkState(index > triggerIndex && index < firstIndexAfterSubtree,
- "Cannot find sub-trigger containing index not in this tree.");
- ExecutableTrigger<W> previous = null;
- for (ExecutableTrigger<W> subTrigger : subTriggers) {
- if (index < subTrigger.triggerIndex) {
- return previous;
- }
- previous = subTrigger;
- }
- return previous;
- }
-
- /**
- * Invoke the {@link Trigger#onElement} method for this trigger, ensuring that the bits are
- * properly updated if the trigger finishes.
- */
- public void invokeOnElement(Trigger<W>.OnElementContext c) throws Exception {
- trigger.onElement(c.forTrigger(this));
- }
-
- /**
- * Invoke the {@link Trigger#onMerge} method for this trigger, ensuring that the bits are properly
- * updated.
- */
- public void invokeOnMerge(Trigger<W>.OnMergeContext c) throws Exception {
- Trigger<W>.OnMergeContext subContext = c.forTrigger(this);
- trigger.onMerge(subContext);
- }
-
- public boolean invokeShouldFire(Trigger<W>.TriggerContext c) throws Exception {
- return trigger.shouldFire(c.forTrigger(this));
- }
-
- public void invokeOnFire(Trigger<W>.TriggerContext c) throws Exception {
- trigger.onFire(c.forTrigger(this));
- }
-
- /**
- * Invoke clear for the current this trigger.
- */
- public void invokeClear(Trigger<W>.TriggerContext c) throws Exception {
- trigger.clear(c.forTrigger(this));
- }
-
- /**
- * {@link ExecutableTrigger} that enforces the fact that the trigger should always FIRE_AND_FINISH
- * and never just FIRE.
- */
- private static class ExecutableOnceTrigger<W extends BoundedWindow> extends ExecutableTrigger<W> {
-
- public ExecutableOnceTrigger(OnceTrigger<W> trigger, int nextUnusedIndex) {
- super(trigger, nextUnusedIndex);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ExecutionContext.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ExecutionContext.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ExecutionContext.java
deleted file mode 100644
index cff5b95..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ExecutionContext.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.common.worker.StateSampler;
-import com.google.cloud.dataflow.sdk.util.state.StateInternals;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-
-import java.io.IOException;
-import java.util.Collection;
-
-/**
- * Context for the execution that is currently in progress. It is guaranteed to exist while
- * processing takes place, but it need not survive from one batch of work to the next.
- */
-public interface ExecutionContext {
-  /**
-   * Returns the {@link StepContext} associated with the given step.
-   */
-  StepContext getOrCreateStepContext(
-      String stepName, String transformName, StateSampler stateSampler);
-
-  /**
-   * Returns a collection view of all of the {@link StepContext}s.
-   */
-  Collection<? extends StepContext> getAllStepContexts();
-
-  /**
-   * Hook for implementations, called whenever
-   * {@link com.google.cloud.dataflow.sdk.transforms.DoFn.Context#output}
-   * is called.
-   */
-  void noteOutput(WindowedValue<?> output);
-
-  /**
-   * Hook for implementations, called whenever
-   * {@link com.google.cloud.dataflow.sdk.transforms.DoFn.Context#sideOutput}
-   * is called.
-   */
-  void noteSideOutput(TupleTag<?> tag, WindowedValue<?> output);
-
-  /**
-   * Per-step, per-key context used for retrieving state.
-   */
-  interface StepContext {
-
-    /**
-     * The name of the step.
-     */
-    String getStepName();
-
-    /**
-     * The name of the transform for the step.
-     */
-    String getTransformName();
-
-    /**
-     * Hook for implementations, called whenever
-     * {@link com.google.cloud.dataflow.sdk.transforms.DoFn.Context#output}
-     * is called.
-     */
-    void noteOutput(WindowedValue<?> output);
-
-    /**
-     * Hook for implementations, called whenever
-     * {@link com.google.cloud.dataflow.sdk.transforms.DoFn.Context#sideOutput}
-     * is called.
-     */
-    void noteSideOutput(TupleTag<?> tag, WindowedValue<?> output);
-
-    /**
-     * Writes the given {@code PCollectionView} data to a globally accessible location.
-     *
-     * @throws IOException if the data cannot be written
-     */
-    <T, W extends BoundedWindow> void writePCollectionViewData(
-        TupleTag<?> tag,
-        Iterable<WindowedValue<T>> data,
-        Coder<Iterable<WindowedValue<T>>> dataCoder,
-        W window,
-        Coder<W> windowCoder)
-        throws IOException;
-
-    /**
-     * Returns the {@link StateInternals} exposed by this step context.
-     */
-    StateInternals<?> stateInternals();
-
-    /**
-     * Returns the {@link TimerInternals} exposed by this step context.
-     */
-    TimerInternals timerInternals();
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ExposedByteArrayInputStream.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ExposedByteArrayInputStream.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ExposedByteArrayInputStream.java
deleted file mode 100644
index dff5fd1..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ExposedByteArrayInputStream.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-
-/**
- * A {@link ByteArrayInputStream} whose backing buffer can be handed out without copying it.
- */
-public class ExposedByteArrayInputStream extends ByteArrayInputStream {
-
-  public ExposedByteArrayInputStream(byte[] buf) {
-    super(buf);
-  }
-
-  /**
-   * Consumes and returns every byte that has not been read yet.
-   *
-   * <p>If nothing has been read and the stream covers the whole backing array, that array
-   * itself is returned; otherwise the remaining bytes are copied into a new array. Either
-   * way the stream is exhausted afterwards.
-   *
-   * @throws IOException declared because the copying path reads through
-   *     {@link java.io.InputStream#read(byte[])}
-   */
-  public byte[] readAll() throws IOException {
-    boolean untouchedWholeBuffer = pos == 0 && count == buf.length;
-    if (!untouchedWholeBuffer) {
-      byte[] remaining = new byte[count - pos];
-      super.read(remaining);
-      return remaining;
-    }
-    // Mark everything as consumed and expose the backing array directly.
-    pos = count;
-    return buf;
-  }
-
-  /** Closes the stream; an {@link IOException} is not expected and is rethrown unchecked. */
-  @Override
-  public void close() {
-    try {
-      super.close();
-    } catch (IOException exn) {
-      throw new RuntimeException("Unexpected IOException closing ByteArrayInputStream", exn);
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ExposedByteArrayOutputStream.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ExposedByteArrayOutputStream.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ExposedByteArrayOutputStream.java
deleted file mode 100644
index d8e4d50..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ExposedByteArrayOutputStream.java
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-
-/**
- * {@link ByteArrayOutputStream} special cased to treat writes of a single byte-array specially.
- * When calling {@link #toByteArray()} after writing only one {@code byte[]} using
- * {@link #writeAndOwn(byte[])}, it will return that array directly.
- *
- * <p>Invariant maintained by this class: {@code swappedBuffer} is non-null exactly while the
- * stream content is a caller-provided array, in which case {@code isFallback} is false and
- * {@code count} is positive.
- */
-public class ExposedByteArrayOutputStream extends ByteArrayOutputStream {
-
-  /** The stream's own buffer, parked here while a caller-provided array serves as content. */
-  private byte[] swappedBuffer;
-
-  /**
-   * If true, this stream doesn't allow direct access to the passed in byte-array. It behaves just
-   * like a normal {@link ByteArrayOutputStream}.
-   *
-   * <p>It is set to true after any write operations other than the first call to
-   * {@link #writeAndOwn(byte[])}.
-   */
-  private boolean isFallback = false;
-
-  /**
-   * Fall back to the behavior of a normal {@link ByteArrayOutputStream}.
-   */
-  private void fallback() {
-    isFallback = true;
-    if (swappedBuffer != null) {
-      // swappedBuffer != null means buf is actually provided by the caller of writeAndOwn(),
-      // while swappedBuffer is the original buffer.
-      // Recover the buffer and copy the bytes from buf.
-      byte[] tempBuffer = buf;
-      count = 0;
-      buf = swappedBuffer;
-      super.write(tempBuffer, 0, tempBuffer.length);
-      swappedBuffer = null;
-    }
-  }
-
-  /**
-   * Write {@code b} to the stream and take the ownership of {@code b}.
-   * If the stream is empty, {@code b} itself will be used as the content of the stream and
-   * no content copy will be involved.
-   * <p><i>Note: After passing any byte array to this method, it must not be modified again.</i>
-   *
-   * @param b the bytes to append; an empty array is ignored
-   * @throws IOException declared for the copying path, which goes through
-   *     {@link java.io.OutputStream#write(byte[])}
-   */
-  public void writeAndOwn(byte[] b) throws IOException {
-    if (b.length == 0) {
-      return;
-    }
-    if (count == 0) {
-      // Optimized first-time whole write.
-      // The original buffer will be swapped to swappedBuffer, while the input b is used as buf.
-      swappedBuffer = buf;
-      buf = b;
-      count = b.length;
-      // A zero-length write() can raise the fallback flag while the stream is still empty.
-      // Clear it here: otherwise reset() would take its fallback path, keep the caller's
-      // array as buf, and the next write would copy the stale bytes back into the stream.
-      isFallback = false;
-    } else {
-      fallback();
-      super.write(b);
-    }
-  }
-
-  @Override
-  public void write(byte[] b, int off, int len) {
-    fallback();
-    super.write(b, off, len);
-  }
-
-  @Override
-  public void write(int b) {
-    fallback();
-    super.write(b);
-  }
-
-  /**
-   * Returns the stream content: the owned array itself while it is the only thing written,
-   * and a copy otherwise.
-   */
-  @Override
-  public byte[] toByteArray() {
-    // Note: count == buf.length is not a correct criteria to "return buf;", because the internal
-    // buf may be reused after reset().
-    if (!isFallback && count > 0) {
-      return buf;
-    } else {
-      return super.toByteArray();
-    }
-  }
-
-  /**
-   * Discards the content. If a caller-provided array is currently serving as content, the
-   * stream's own buffer is put back in place so that array is never written into.
-   */
-  @Override
-  public void reset() {
-    if (count == 0) {
-      return;
-    }
-    count = 0;
-    if (isFallback) {
-      isFallback = false;
-    } else {
-      buf = swappedBuffer;
-      swappedBuffer = null;
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/FileIOChannelFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/FileIOChannelFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/FileIOChannelFactory.java
deleted file mode 100644
index 77d0b83..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/FileIOChannelFactory.java
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.common.base.Predicate;
-import com.google.common.base.Predicates;
-import com.google.common.collect.Iterables;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.nio.channels.Channels;
-import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
-import java.nio.file.FileSystems;
-import java.nio.file.Files;
-import java.nio.file.NoSuchFileException;
-import java.nio.file.PathMatcher;
-import java.nio.file.Paths;
-import java.util.Collection;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.regex.Matcher;
-
-/**
- * Implements IOChannelFactory for local files.
- */
-public class FileIOChannelFactory implements IOChannelFactory {
- private static final Logger LOG = LoggerFactory.getLogger(FileIOChannelFactory.class);
-
- // This implementation only allows for wildcards in the file name.
- // The directory portion must exist as-is.
- @Override
- public Collection<String> match(String spec) throws IOException {
- File file = new File(spec);
-
- File parent = file.getAbsoluteFile().getParentFile();
- if (!parent.exists()) {
- throw new IOException("Unable to find parent directory of " + spec);
- }
-
- // Method getAbsolutePath() on Windows platform may return something like
- // "c:\temp\file.txt". FileSystem.getPathMatcher() call below will treat
- // '\' (backslash) as an escape character, instead of a directory
- // separator. Replacing backslash with double-backslash solves the problem.
- // We perform the replacement on all platforms, even those that allow
- // backslash as a part of the filename, because Globs.toRegexPattern will
- // eat one backslash.
- String pathToMatch = file.getAbsolutePath().replaceAll(Matcher.quoteReplacement("\\"),
- Matcher.quoteReplacement("\\\\"));
-
- final PathMatcher matcher = FileSystems.getDefault().getPathMatcher("glob:" + pathToMatch);
-
- Iterable<File> files = com.google.common.io.Files.fileTreeTraverser().preOrderTraversal(parent);
- Iterable<File> matchedFiles = Iterables.filter(files,
- Predicates.and(
- com.google.common.io.Files.isFile(),
- new Predicate<File>() {
- @Override
- public boolean apply(File input) {
- return matcher.matches(input.toPath());
- }
- }));
-
- List<String> result = new LinkedList<>();
- for (File match : matchedFiles) {
- result.add(match.getPath());
- }
-
- return result;
- }
-
- @Override
- public ReadableByteChannel open(String spec) throws IOException {
- LOG.debug("opening file {}", spec);
- @SuppressWarnings("resource") // The caller is responsible for closing the channel.
- FileInputStream inputStream = new FileInputStream(spec);
- // Use this method for creating the channel (rather than new FileChannel) so that we get
- // regular FileNotFoundException. Closing the underyling channel will close the inputStream.
- return inputStream.getChannel();
- }
-
- @Override
- public WritableByteChannel create(String spec, String mimeType)
- throws IOException {
- LOG.debug("creating file {}", spec);
- File file = new File(spec);
- if (file.getAbsoluteFile().getParentFile() != null
- && !file.getAbsoluteFile().getParentFile().exists()
- && !file.getAbsoluteFile().getParentFile().mkdirs()) {
- throw new IOException("Unable to create parent directories for '" + spec + "'");
- }
- return Channels.newChannel(
- new BufferedOutputStream(new FileOutputStream(file)));
- }
-
- @Override
- public long getSizeBytes(String spec) throws IOException {
- try {
- return Files.size(FileSystems.getDefault().getPath(spec));
- } catch (NoSuchFileException e) {
- throw new FileNotFoundException(e.getReason());
- }
- }
-
- @Override
- public boolean isReadSeekEfficient(String spec) throws IOException {
- return true;
- }
-
- @Override
- public String resolve(String path, String other) throws IOException {
- return Paths.get(path).resolve(other).toString();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/FinishedTriggers.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/FinishedTriggers.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/FinishedTriggers.java
deleted file mode 100644
index e75be23..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/FinishedTriggers.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-/**
- * A mutable set which tracks whether any particular {@link ExecutableTrigger} is
- * finished.
- */
-public interface FinishedTriggers {
- /**
- * Returns {@code true} if the trigger is finished.
- */
- public boolean isFinished(ExecutableTrigger<?> trigger);
-
- /**
- * Sets the fact that the trigger is finished.
- */
- public void setFinished(ExecutableTrigger<?> trigger, boolean value);
-
- /**
- * Sets the trigger and all of its subtriggers to unfinished.
- */
- public void clearRecursively(ExecutableTrigger<?> trigger);
-
- /**
- * Create an independent copy of this mutable {@link FinishedTriggers}.
- */
- public FinishedTriggers copy();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/FinishedTriggersBitSet.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/FinishedTriggersBitSet.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/FinishedTriggersBitSet.java
deleted file mode 100644
index 09f7af7..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/FinishedTriggersBitSet.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import java.util.BitSet;
-
-/**
- * A {@link FinishedTriggers} implementation based on an underlying {@link BitSet}.
- */
-public class FinishedTriggersBitSet implements FinishedTriggers {
-
- private final BitSet bitSet;
-
- private FinishedTriggersBitSet(BitSet bitSet) {
- this.bitSet = bitSet;
- }
-
- public static FinishedTriggersBitSet emptyWithCapacity(int capacity) {
- return new FinishedTriggersBitSet(new BitSet(capacity));
- }
-
- public static FinishedTriggersBitSet fromBitSet(BitSet bitSet) {
- return new FinishedTriggersBitSet(bitSet);
- }
-
- /**
- * Returns the underlying {@link BitSet} for this {@link FinishedTriggersBitSet}.
- */
- public BitSet getBitSet() {
- return bitSet;
- }
-
- @Override
- public boolean isFinished(ExecutableTrigger<?> trigger) {
- return bitSet.get(trigger.getTriggerIndex());
- }
-
- @Override
- public void setFinished(ExecutableTrigger<?> trigger, boolean value) {
- bitSet.set(trigger.getTriggerIndex(), value);
- }
-
- @Override
- public void clearRecursively(ExecutableTrigger<?> trigger) {
- bitSet.clear(trigger.getTriggerIndex(), trigger.getFirstIndexAfterSubtree());
- }
-
- @Override
- public FinishedTriggersBitSet copy() {
- return new FinishedTriggersBitSet((BitSet) bitSet.clone());
- }
-}
-
-
[04/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateContexts.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateContexts.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateContexts.java
deleted file mode 100644
index e301d43..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateContexts.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.WindowingInternals;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-
-import javax.annotation.Nullable;
-
-/**
- * Factory that produces {@link StateContext} based on different inputs.
- */
-public class StateContexts {
- private static final StateContext<BoundedWindow> NULL_CONTEXT =
- new StateContext<BoundedWindow>() {
- @Override
- public PipelineOptions getPipelineOptions() {
- throw new IllegalArgumentException("cannot call getPipelineOptions() in a null context");
- }
-
- @Override
- public <T> T sideInput(PCollectionView<T> view) {
- throw new IllegalArgumentException("cannot call sideInput() in a null context");
- }
-
- @Override
- public BoundedWindow window() {
- throw new IllegalArgumentException("cannot call window() in a null context");
- }};
-
- /**
- * Returns a fake {@link StateContext}.
- */
- @SuppressWarnings("unchecked")
- public static <W extends BoundedWindow> StateContext<W> nullContext() {
- return (StateContext<W>) NULL_CONTEXT;
- }
-
- /**
- * Returns a {@link StateContext} that only contains the state window.
- */
- public static <W extends BoundedWindow> StateContext<W> windowOnly(final W window) {
- return new StateContext<W>() {
- @Override
- public PipelineOptions getPipelineOptions() {
- throw new IllegalArgumentException(
- "cannot call getPipelineOptions() in a window only context");
- }
- @Override
- public <T> T sideInput(PCollectionView<T> view) {
- throw new IllegalArgumentException("cannot call sideInput() in a window only context");
- }
- @Override
- public W window() {
- return window;
- }
- };
- }
-
- /**
- * Returns a {@link StateContext} from {@code PipelineOptions}, {@link WindowingInternals},
- * and the state window.
- */
- public static <W extends BoundedWindow> StateContext<W> createFromComponents(
- @Nullable final PipelineOptions options,
- final WindowingInternals<?, ?> windowingInternals,
- final W window) {
- @SuppressWarnings("unchecked")
- StateContext<W> typedNullContext = (StateContext<W>) NULL_CONTEXT;
- if (options == null) {
- return typedNullContext;
- } else {
- return new StateContext<W>() {
-
- @Override
- public PipelineOptions getPipelineOptions() {
- return options;
- }
-
- @Override
- public <T> T sideInput(PCollectionView<T> view) {
- return windowingInternals.sideInput(view, window);
- }
-
- @Override
- public W window() {
- return window;
- }
- };
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateInternals.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateInternals.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateInternals.java
deleted file mode 100644
index b31afb4..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateInternals.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
-
-/**
- * {@code StateInternals} describes the functionality a runner needs to provide for the
- * State API to be supported.
- *
- * <p>The SDK will only use this after elements have been partitioned by key. For instance, after a
- * {@link GroupByKey} operation. The runner implementation must ensure that any writes using
- * {@link StateInternals} are implicitly scoped to the key being processed and the specific step
- * accessing state.
- *
- * <p>The runner implementation must also ensure that any writes to the associated state objects
- * are persisted together with the completion status of the processing that produced these
- * writes.
- *
- * <p>This is a low-level API intended for use by the Dataflow SDK. It should not be
- * used directly, and is highly likely to change.
- */
-@Experimental(Kind.STATE)
-public interface StateInternals<K> {
-
- /** The key for this {@link StateInternals}. */
- K getKey();
-
- /**
- * Return the state associated with {@code address} in the specified {@code namespace}.
- */
- <T extends State> T state(StateNamespace namespace, StateTag<? super K, T> address);
-
- /**
- * Return the state associated with {@code address} in the specified {@code namespace}
- * with the {@link StateContext}.
- */
- <T extends State> T state(
- StateNamespace namespace, StateTag<? super K, T> address, StateContext<?> c);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateMerging.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateMerging.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateMerging.java
deleted file mode 100644
index 0b33ea9..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateMerging.java
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.common.base.Preconditions;
-
-import org.joda.time.Instant;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.Map;
-
-/**
- * Helpers for merging state.
- */
-public class StateMerging {
- /**
- * Clear all state in {@code address} in all windows under merge (even result windows)
- * in {@code context}.
- */
- public static <K, StateT extends State, W extends BoundedWindow> void clear(
- MergingStateAccessor<K, W> context, StateTag<? super K, StateT> address) {
- for (StateT state : context.accessInEachMergingWindow(address).values()) {
- state.clear();
- }
- }
-
- /**
- * Prefetch all bag state in {@code address} across all windows under merge in
- * {@code context}, except for the bag state in the final state address window which we can
- * blindly append to.
- */
- public static <K, T, W extends BoundedWindow> void prefetchBags(
- MergingStateAccessor<K, W> context, StateTag<? super K, BagState<T>> address) {
- Map<W, BagState<T>> map = context.accessInEachMergingWindow(address);
- if (map.isEmpty()) {
- // Nothing to prefetch.
- return;
- }
- BagState<T> result = context.access(address);
- // Prefetch everything except what's already in result.
- for (BagState<T> source : map.values()) {
- if (!source.equals(result)) {
- source.readLater();
- }
- }
- }
-
- /**
- * Merge all bag state in {@code address} across all windows under merge.
- */
- public static <K, T, W extends BoundedWindow> void mergeBags(
- MergingStateAccessor<K, W> context, StateTag<? super K, BagState<T>> address) {
- mergeBags(context.accessInEachMergingWindow(address).values(), context.access(address));
- }
-
- /**
- * Merge all bag state in {@code sources} (which may include {@code result}) into {@code result}.
- */
- public static <T, W extends BoundedWindow> void mergeBags(
- Collection<BagState<T>> sources, BagState<T> result) {
- if (sources.isEmpty()) {
- // Nothing to merge.
- return;
- }
- // Prefetch everything except what's already in result.
- List<ReadableState<Iterable<T>>> futures = new ArrayList<>(sources.size());
- for (BagState<T> source : sources) {
- if (!source.equals(result)) {
- source.readLater();
- futures.add(source);
- }
- }
- if (futures.isEmpty()) {
- // Result already holds all the values.
- return;
- }
- // Transfer from sources to result.
- for (ReadableState<Iterable<T>> future : futures) {
- for (T element : future.read()) {
- result.add(element);
- }
- }
- // Clear sources except for result.
- for (BagState<T> source : sources) {
- if (!source.equals(result)) {
- source.clear();
- }
- }
- }
-
- /**
- * Prefetch all combining value state for {@code address} across all merging windows in {@code
- * context}.
- */
- public static <K, StateT extends CombiningState<?, ?>, W extends BoundedWindow> void
- prefetchCombiningValues(MergingStateAccessor<K, W> context,
- StateTag<? super K, StateT> address) {
- for (StateT state : context.accessInEachMergingWindow(address).values()) {
- state.readLater();
- }
- }
-
- /**
- * Merge all value state in {@code address} across all merging windows in {@code context}.
- */
- public static <K, InputT, AccumT, OutputT, W extends BoundedWindow> void mergeCombiningValues(
- MergingStateAccessor<K, W> context,
- StateTag<? super K, AccumulatorCombiningState<InputT, AccumT, OutputT>> address) {
- mergeCombiningValues(
- context.accessInEachMergingWindow(address).values(), context.access(address));
- }
-
- /**
- * Merge all value state from {@code sources} (which may include {@code result}) into
- * {@code result}.
- */
- public static <InputT, AccumT, OutputT, W extends BoundedWindow> void mergeCombiningValues(
- Collection<AccumulatorCombiningState<InputT, AccumT, OutputT>> sources,
- AccumulatorCombiningState<InputT, AccumT, OutputT> result) {
- if (sources.isEmpty()) {
- // Nothing to merge.
- return;
- }
- if (sources.size() == 1 && sources.contains(result)) {
- // Result already holds combined value.
- return;
- }
- // Prefetch.
- List<ReadableState<AccumT>> futures = new ArrayList<>(sources.size());
- for (AccumulatorCombiningState<InputT, AccumT, OutputT> source : sources) {
- source.readLater();
- }
- // Read.
- List<AccumT> accumulators = new ArrayList<>(futures.size());
- for (AccumulatorCombiningState<InputT, AccumT, OutputT> source : sources) {
- accumulators.add(source.getAccum());
- }
- // Merge (possibly update and return one of the existing accumulators).
- AccumT merged = result.mergeAccumulators(accumulators);
- // Clear sources.
- for (AccumulatorCombiningState<InputT, AccumT, OutputT> source : sources) {
- source.clear();
- }
- // Update result.
- result.addAccum(merged);
- }
-
- /**
- * Prefetch all watermark state for {@code address} across all merging windows in
- * {@code context}.
- */
- public static <K, W extends BoundedWindow> void prefetchWatermarks(
- MergingStateAccessor<K, W> context,
- StateTag<? super K, WatermarkHoldState<W>> address) {
- Map<W, WatermarkHoldState<W>> map = context.accessInEachMergingWindow(address);
- WatermarkHoldState<W> result = context.access(address);
- if (map.isEmpty()) {
- // Nothing to prefetch.
- return;
- }
- if (map.size() == 1 && map.values().contains(result)
- && result.getOutputTimeFn().dependsOnlyOnEarliestInputTimestamp()) {
- // Nothing to change.
- return;
- }
- if (result.getOutputTimeFn().dependsOnlyOnWindow()) {
- // No need to read existing holds.
- return;
- }
- // Prefetch.
- for (WatermarkHoldState<W> source : map.values()) {
- source.readLater();
- }
- }
-
- /**
- * Merge all watermark state in {@code address} across all merging windows in {@code context},
- * where the final merge result window is {@code mergeResult}.
- */
- public static <K, W extends BoundedWindow> void mergeWatermarks(
- MergingStateAccessor<K, W> context,
- StateTag<? super K, WatermarkHoldState<W>> address,
- W mergeResult) {
- mergeWatermarks(
- context.accessInEachMergingWindow(address).values(), context.access(address), mergeResult);
- }
-
- /**
- * Merge all watermark state in {@code sources} (which must include {@code result} if non-empty)
- * into {@code result}, where the final merge result window is {@code mergeResult}.
- */
- public static <W extends BoundedWindow> void mergeWatermarks(
- Collection<WatermarkHoldState<W>> sources, WatermarkHoldState<W> result,
- W resultWindow) {
- if (sources.isEmpty()) {
- // Nothing to merge.
- return;
- }
- if (sources.size() == 1 && sources.contains(result)
- && result.getOutputTimeFn().dependsOnlyOnEarliestInputTimestamp()) {
- // Nothing to merge.
- return;
- }
- if (result.getOutputTimeFn().dependsOnlyOnWindow()) {
- // Clear sources.
- for (WatermarkHoldState<W> source : sources) {
- source.clear();
- }
- // Update directly from window-derived hold.
- Instant hold = result.getOutputTimeFn().assignOutputTime(
- BoundedWindow.TIMESTAMP_MIN_VALUE, resultWindow);
- Preconditions.checkState(hold.isAfter(BoundedWindow.TIMESTAMP_MIN_VALUE));
- result.add(hold);
- } else {
- // Prefetch.
- List<ReadableState<Instant>> futures = new ArrayList<>(sources.size());
- for (WatermarkHoldState<W> source : sources) {
- futures.add(source);
- }
- // Read.
- List<Instant> outputTimesToMerge = new ArrayList<>(sources.size());
- for (ReadableState<Instant> future : futures) {
- Instant sourceOutputTime = future.read();
- if (sourceOutputTime != null) {
- outputTimesToMerge.add(sourceOutputTime);
- }
- }
- // Clear sources.
- for (WatermarkHoldState<W> source : sources) {
- source.clear();
- }
- if (!outputTimesToMerge.isEmpty()) {
- // Merge and update.
- result.add(result.getOutputTimeFn().merge(resultWindow, outputTimesToMerge));
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateNamespace.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateNamespace.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateNamespace.java
deleted file mode 100644
index f972e31..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateNamespace.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-import java.io.IOException;
-
-/**
- * A namespace used for scoping state stored with {@link StateInternals}.
- *
- * <p>Instances of {@code StateNamespace} are guaranteed to have a {@link #hashCode} and
- * {@link #equals} that uniquely identify the namespace.
- */
-public interface StateNamespace {
-
- /**
- * Return a {@link String} representation of the key. It is guaranteed that this
- * {@code String} will uniquely identify the key.
- *
- * <p>This will encode the actual namespace as a {@code String}. It is
- * preferable to use the {@code StateNamespace} object when possible.
- *
- * <p>The string produced by the standard implementations will not contain a '+' character. This
- * enables adding a '+' between the actual namespace and other information, if needed, to separate
- * the two.
- */
- String stringKey();
-
- /**
- * Append the string representation of this key to the {@link Appendable}.
- */
- void appendTo(Appendable sb) throws IOException;
-
- /**
- * Return an {@code Object} to use as a key in a cache.
- *
- * <p>Different namespaces may use the same key in order to be treated as a unit in the cache.
- * The {@code Object}'s {@code hashCode} and {@code equals} methods will be used to determine
- * equality.
- */
- Object getCacheKey();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateNamespaceForTest.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateNamespaceForTest.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateNamespaceForTest.java
deleted file mode 100644
index 09b86d6..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateNamespaceForTest.java
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-import java.io.IOException;
-import java.util.Objects;
-
-/**
- * A simple {@link StateNamespace} used for testing.
- */
-public class StateNamespaceForTest implements StateNamespace {
- private String key;
-
- public StateNamespaceForTest(String key) {
- this.key = key;
- }
-
- @Override
- public String stringKey() {
- return key;
- }
-
- @Override
- public Object getCacheKey() {
- return key;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj) {
- return true;
- }
-
- if (!(obj instanceof StateNamespaceForTest)) {
- return false;
- }
-
- return Objects.equals(this.key, ((StateNamespaceForTest) obj).key);
- }
-
- @Override
- public int hashCode() {
- return key.hashCode();
- }
-
- @Override
- public void appendTo(Appendable sb) throws IOException {
- sb.append(key);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateNamespaces.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateNamespaces.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateNamespaces.java
deleted file mode 100644
index 8fee995..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateNamespaces.java
+++ /dev/null
@@ -1,277 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.CoderUtils;
-import com.google.common.base.Splitter;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Objects;
-
-/**
- * Factory methods for creating the {@link StateNamespace StateNamespaces}.
- */
-public class StateNamespaces {
-
- private enum Namespace {
- GLOBAL,
- WINDOW,
- WINDOW_AND_TRIGGER;
- }
-
- public static StateNamespace global() {
- return new GlobalNamespace();
- }
-
- public static <W extends BoundedWindow> StateNamespace window(Coder<W> windowCoder, W window) {
- return new WindowNamespace<>(windowCoder, window);
- }
-
- public static <W extends BoundedWindow>
- StateNamespace windowAndTrigger(Coder<W> windowCoder, W window, int triggerIdx) {
- return new WindowAndTriggerNamespace<>(windowCoder, window, triggerIdx);
- }
-
- private StateNamespaces() {}
-
- /**
- * {@link StateNamespace} that is global to the current key being processed.
- */
- public static class GlobalNamespace implements StateNamespace {
-
- private static final String GLOBAL_STRING = "/";
-
- @Override
- public String stringKey() {
- return GLOBAL_STRING;
- }
-
- @Override
- public Object getCacheKey() {
- return GLOBAL_STRING;
- }
-
- @Override
- public boolean equals(Object obj) {
- return obj == this || obj instanceof GlobalNamespace;
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(Namespace.GLOBAL);
- }
-
- @Override
- public String toString() {
- return "Global";
- }
-
- @Override
- public void appendTo(Appendable sb) throws IOException {
- sb.append(GLOBAL_STRING);
- }
- }
-
- /**
- * {@link StateNamespace} that is scoped to a specific window.
- */
- public static class WindowNamespace<W extends BoundedWindow> implements StateNamespace {
-
- private static final String WINDOW_FORMAT = "/%s/";
-
- private Coder<W> windowCoder;
- private W window;
-
- private WindowNamespace(Coder<W> windowCoder, W window) {
- this.windowCoder = windowCoder;
- this.window = window;
- }
-
- public W getWindow() {
- return window;
- }
-
- @Override
- public String stringKey() {
- try {
- return String.format(WINDOW_FORMAT, CoderUtils.encodeToBase64(windowCoder, window));
- } catch (CoderException e) {
- throw new RuntimeException("Unable to generate string key from window " + window, e);
- }
- }
-
- @Override
- public void appendTo(Appendable sb) throws IOException {
- sb.append('/').append(CoderUtils.encodeToBase64(windowCoder, window)).append('/');
- }
-
- /**
- * State in the same window will all be evicted together.
- */
- @Override
- public Object getCacheKey() {
- return window;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj == this) {
- return true;
- }
-
- if (!(obj instanceof WindowNamespace)) {
- return false;
- }
-
- WindowNamespace<?> that = (WindowNamespace<?>) obj;
- return Objects.equals(this.window, that.window);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(Namespace.WINDOW, window);
- }
-
- @Override
- public String toString() {
- return "Window(" + window + ")";
- }
- }
-
- /**
- * {@link StateNamespace} that is scoped to a particular window and trigger index.
- */
- public static class WindowAndTriggerNamespace<W extends BoundedWindow>
- implements StateNamespace {
-
- private static final String WINDOW_AND_TRIGGER_FORMAT = "/%s/%s/";
-
- private static final int TRIGGER_RADIX = 36;
- private Coder<W> windowCoder;
- private W window;
- private int triggerIndex;
-
- private WindowAndTriggerNamespace(Coder<W> windowCoder, W window, int triggerIndex) {
- this.windowCoder = windowCoder;
- this.window = window;
- this.triggerIndex = triggerIndex;
- }
-
- public W getWindow() {
- return window;
- }
-
- public int getTriggerIndex() {
- return triggerIndex;
- }
-
- @Override
- public String stringKey() {
- try {
- return String.format(WINDOW_AND_TRIGGER_FORMAT,
- CoderUtils.encodeToBase64(windowCoder, window),
- // Use base 36 so that can address 36 triggers in a single byte and still be human
- // readable.
- Integer.toString(triggerIndex, TRIGGER_RADIX).toUpperCase());
- } catch (CoderException e) {
- throw new RuntimeException("Unable to generate string key from window " + window, e);
- }
- }
-
- @Override
- public void appendTo(Appendable sb) throws IOException {
- sb.append('/').append(CoderUtils.encodeToBase64(windowCoder, window));
- sb.append('/').append(Integer.toString(triggerIndex, TRIGGER_RADIX).toUpperCase());
- sb.append('/');
- }
-
- /**
- * State in the same window will all be evicted together.
- */
- @Override
- public Object getCacheKey() {
- return window;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj == this) {
- return true;
- }
-
- if (!(obj instanceof WindowAndTriggerNamespace)) {
- return false;
- }
-
- WindowAndTriggerNamespace<?> that = (WindowAndTriggerNamespace<?>) obj;
- return this.triggerIndex == that.triggerIndex
- && Objects.equals(this.window, that.window);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(Namespace.WINDOW_AND_TRIGGER, window, triggerIndex);
- }
-
- @Override
- public String toString() {
- return "WindowAndTrigger(" + window + "," + triggerIndex + ")";
- }
- }
-
- private static final Splitter SLASH_SPLITTER = Splitter.on('/');
-
- /**
- * Convert a {@code stringKey} produced using {@link StateNamespace#stringKey}
- * on one of the namespaces produced by this class into the original
- * {@link StateNamespace}.
- */
- public static <W extends BoundedWindow> StateNamespace fromString(
- String stringKey, Coder<W> windowCoder) {
- if (!stringKey.startsWith("/") || !stringKey.endsWith("/")) {
- throw new RuntimeException("Invalid namespace string: '" + stringKey + "'");
- }
-
- if (GlobalNamespace.GLOBAL_STRING.equals(stringKey)) {
- return global();
- }
-
- List<String> parts = SLASH_SPLITTER.splitToList(stringKey);
- if (parts.size() != 3 && parts.size() != 4) {
- throw new RuntimeException("Invalid namespace string: '" + stringKey + "'");
- }
- // Ends should be empty (we start and end with /)
- if (!parts.get(0).isEmpty() || !parts.get(parts.size() - 1).isEmpty()) {
- throw new RuntimeException("Invalid namespace string: '" + stringKey + "'");
- }
-
- try {
- W window = CoderUtils.decodeFromBase64(windowCoder, parts.get(1));
- if (parts.size() > 3) {
- int index = Integer.parseInt(parts.get(2), WindowAndTriggerNamespace.TRIGGER_RADIX);
- return windowAndTrigger(windowCoder, window, index);
- } else {
- return window(windowCoder, window);
- }
- } catch (Exception e) {
- throw new RuntimeException("Invalid namespace string: '" + stringKey + "'", e);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateTable.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateTable.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateTable.java
deleted file mode 100644
index edd1dae..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateTable.java
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-import com.google.cloud.dataflow.sdk.util.state.StateTag.StateBinder;
-import com.google.common.base.Supplier;
-import com.google.common.collect.Table;
-import com.google.common.collect.Tables;
-
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-
-/**
- * Table mapping {@code StateNamespace} and {@code StateTag<?>} to a {@code State} instance.
- */
-public abstract class StateTable<K> {
-
- private final Table<StateNamespace, StateTag<? super K, ?>, State> stateTable =
- Tables.newCustomTable(new HashMap<StateNamespace, Map<StateTag<? super K, ?>, State>>(),
- new Supplier<Map<StateTag<? super K, ?>, State>>() {
- @Override
- public Map<StateTag<? super K, ?>, State> get() {
- return new HashMap<>();
- }
- });
-
- /**
- * Gets the {@link State} in the specified {@link StateNamespace} with the specified {@link
- * StateTag}, binding it using the {@link #binderForNamespace} if it is not
- * already present in this {@link StateTable}.
- */
- public <StateT extends State> StateT get(
- StateNamespace namespace, StateTag<? super K, StateT> tag, StateContext<?> c) {
- State storage = stateTable.get(namespace, tag);
- if (storage != null) {
- @SuppressWarnings("unchecked")
- StateT typedStorage = (StateT) storage;
- return typedStorage;
- }
-
- StateT typedStorage = tag.bind(binderForNamespace(namespace, c));
- stateTable.put(namespace, tag, typedStorage);
- return typedStorage;
- }
-
- public void clearNamespace(StateNamespace namespace) {
- stateTable.rowKeySet().remove(namespace);
- }
-
- public void clear() {
- stateTable.clear();
- }
-
- public Iterable<State> values() {
- return stateTable.values();
- }
-
- public boolean isNamespaceInUse(StateNamespace namespace) {
- return stateTable.containsRow(namespace);
- }
-
- public Map<StateTag<? super K, ?>, State> getTagsInUse(StateNamespace namespace) {
- return stateTable.row(namespace);
- }
-
- public Set<StateNamespace> getNamespacesInUse() {
- return stateTable.rowKeySet();
- }
-
- /**
- * Provide the {@code StateBinder} to use for creating {@code Storage} instances
- * in the specified {@code namespace}.
- */
- protected abstract StateBinder<K> binderForNamespace(StateNamespace namespace, StateContext<?> c);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateTag.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateTag.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateTag.java
deleted file mode 100644
index c87bdb7..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateTag.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.transforms.Combine.KeyedCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.KeyedCombineFnWithContext;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.OutputTimeFn;
-
-import java.io.IOException;
-import java.io.Serializable;
-
-/**
- * An address for persistent state. This includes a unique identifier for the location, the
- * information necessary to encode the value, and details about the intended access pattern.
- *
- * <p>State can be thought of as a sparse table, with each {@code StateTag} defining a column
- * that has cells of type {@code StateT}.
- *
- * <p>Currently, this can only be used in a step immediately following a {@link GroupByKey}.
- *
- * @param <K> The type of key that must be used with the state tag. Contravariant: methods should
- * accept values of type {@code KeyedStateTag<? super K, StateT>}.
- * @param <StateT> The type of state being tagged.
- */
-@Experimental(Kind.STATE)
-public interface StateTag<K, StateT extends State> extends Serializable {
-
- /**
- * Visitor for binding a {@link StateTag} and to the associated {@link State}.
- *
- * @param <K> the type of key this binder embodies.
- */
- public interface StateBinder<K> {
- <T> ValueState<T> bindValue(StateTag<? super K, ValueState<T>> address, Coder<T> coder);
-
- <T> BagState<T> bindBag(StateTag<? super K, BagState<T>> address, Coder<T> elemCoder);
-
- <InputT, AccumT, OutputT> AccumulatorCombiningState<InputT, AccumT, OutputT>
- bindCombiningValue(
- StateTag<? super K, AccumulatorCombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder, CombineFn<InputT, AccumT, OutputT> combineFn);
-
- <InputT, AccumT, OutputT> AccumulatorCombiningState<InputT, AccumT, OutputT>
- bindKeyedCombiningValue(
- StateTag<? super K, AccumulatorCombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder, KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn);
-
- <InputT, AccumT, OutputT> AccumulatorCombiningState<InputT, AccumT, OutputT>
- bindKeyedCombiningValueWithContext(
- StateTag<? super K, AccumulatorCombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- KeyedCombineFnWithContext<? super K, InputT, AccumT, OutputT> combineFn);
-
- /**
- * Bind to a watermark {@link StateTag}.
- *
- * <p>This accepts the {@link OutputTimeFn} that dictates how watermark hold timestamps
- * added to the returned {@link WatermarkHoldState} are to be combined.
- */
- <W extends BoundedWindow> WatermarkHoldState<W> bindWatermark(
- StateTag<? super K, WatermarkHoldState<W>> address,
- OutputTimeFn<? super W> outputTimeFn);
- }
-
- /** Append the UTF-8 encoding of this tag to the given {@link Appendable}. */
- void appendTo(Appendable sb) throws IOException;
-
- /**
- * Returns the user-provided name of this state cell.
- */
- String getId();
-
- /**
- * Use the {@code binder} to create an instance of {@code StateT} appropriate for this address.
- */
- StateT bind(StateBinder<? extends K> binder);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateTags.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateTags.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateTags.java
deleted file mode 100644
index ec9a78f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateTags.java
+++ /dev/null
@@ -1,579 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.transforms.Combine.KeyedCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.KeyedCombineFnWithContext;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.OutputTimeFn;
-import com.google.common.base.MoreObjects;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.Objects;
-
-/**
- * Static utility methods for creating {@link StateTag} instances.
- */
-@Experimental(Kind.STATE)
-public class StateTags {
-
- private static final CoderRegistry STANDARD_REGISTRY = new CoderRegistry();
-
- static {
- STANDARD_REGISTRY.registerStandardCoders();
- }
-
- private enum StateKind {
- SYSTEM('s'),
- USER('u');
-
- private char prefix;
-
- StateKind(char prefix) {
- this.prefix = prefix;
- }
- }
-
- private StateTags() { }
-
- private interface SystemStateTag<K, StateT extends State> {
- StateTag<K, StateT> asKind(StateKind kind);
- }
-
- /**
- * Create a simple state tag for values of type {@code T}.
- */
- public static <T> StateTag<Object, ValueState<T>> value(String id, Coder<T> valueCoder) {
- return new ValueStateTag<>(new StructuredId(id), valueCoder);
- }
-
- /**
- * Create a state tag for values that use a {@link CombineFn} to automatically merge
- * multiple {@code InputT}s into a single {@code OutputT}.
- */
- public static <InputT, AccumT, OutputT>
- StateTag<Object, AccumulatorCombiningState<InputT, AccumT, OutputT>>
- combiningValue(
- String id, Coder<AccumT> accumCoder, CombineFn<InputT, AccumT, OutputT> combineFn) {
- return combiningValueInternal(id, accumCoder, combineFn);
- }
-
- /**
- * Create a state tag for values that use a {@link KeyedCombineFn} to automatically merge
- * multiple {@code InputT}s into a single {@code OutputT}. The key provided to the
- * {@link KeyedCombineFn} comes from the keyed {@link StateAccessor}.
- */
- public static <K, InputT, AccumT,
- OutputT> StateTag<K, AccumulatorCombiningState<InputT, AccumT, OutputT>>
- keyedCombiningValue(String id, Coder<AccumT> accumCoder,
- KeyedCombineFn<K, InputT, AccumT, OutputT> combineFn) {
- return keyedCombiningValueInternal(id, accumCoder, combineFn);
- }
-
- /**
- * Create a state tag for values that use a {@link KeyedCombineFnWithContext} to automatically
- * merge multiple {@code InputT}s into a single {@code OutputT}. The key provided to the
- * {@link KeyedCombineFn} comes from the keyed {@link StateAccessor}, the context provided comes
- * from the {@link StateContext}.
- */
- public static <K, InputT, AccumT, OutputT>
- StateTag<K, AccumulatorCombiningState<InputT, AccumT, OutputT>>
- keyedCombiningValueWithContext(
- String id,
- Coder<AccumT> accumCoder,
- KeyedCombineFnWithContext<K, InputT, AccumT, OutputT> combineFn) {
- return new KeyedCombiningValueWithContextStateTag<K, InputT, AccumT, OutputT>(
- new StructuredId(id),
- accumCoder,
- combineFn);
- }
-
- /**
- * Create a state tag for values that use a {@link CombineFn} to automatically merge
- * multiple {@code InputT}s into a single {@code OutputT}.
- *
- * <p>This determines the {@code Coder<AccumT>} from the given {@code Coder<InputT>}, and
- * should only be used to initialize static values.
- */
- public static <InputT, AccumT, OutputT>
- StateTag<Object, AccumulatorCombiningState<InputT, AccumT, OutputT>>
- combiningValueFromInputInternal(
- String id, Coder<InputT> inputCoder, CombineFn<InputT, AccumT, OutputT> combineFn) {
- try {
- Coder<AccumT> accumCoder = combineFn.getAccumulatorCoder(STANDARD_REGISTRY, inputCoder);
- return combiningValueInternal(id, accumCoder, combineFn);
- } catch (CannotProvideCoderException e) {
- throw new IllegalArgumentException(
- "Unable to determine accumulator coder for " + combineFn.getClass().getSimpleName()
- + " from " + inputCoder, e);
- }
- }
-
- private static <InputT, AccumT,
- OutputT> StateTag<Object, AccumulatorCombiningState<InputT, AccumT, OutputT>>
- combiningValueInternal(
- String id, Coder<AccumT> accumCoder, CombineFn<InputT, AccumT, OutputT> combineFn) {
- return
- new CombiningValueStateTag<InputT, AccumT, OutputT>(
- new StructuredId(id), accumCoder, combineFn);
- }
-
- private static <K, InputT, AccumT, OutputT>
- StateTag<K, AccumulatorCombiningState<InputT, AccumT, OutputT>> keyedCombiningValueInternal(
- String id,
- Coder<AccumT> accumCoder,
- KeyedCombineFn<K, InputT, AccumT, OutputT> combineFn) {
- return new KeyedCombiningValueStateTag<K, InputT, AccumT, OutputT>(
- new StructuredId(id), accumCoder, combineFn);
- }
-
- /**
- * Create a state tag that is optimized for adding values frequently, and
- * occasionally retrieving all the values that have been added.
- */
- public static <T> StateTag<Object, BagState<T>> bag(String id, Coder<T> elemCoder) {
- return new BagStateTag<T>(new StructuredId(id), elemCoder);
- }
-
- /**
- * Create a state tag for holding the watermark.
- */
- public static <W extends BoundedWindow> StateTag<Object, WatermarkHoldState<W>>
- watermarkStateInternal(String id, OutputTimeFn<? super W> outputTimeFn) {
- return new WatermarkStateTagInternal<W>(new StructuredId(id), outputTimeFn);
- }
-
- /**
- * Convert an arbitrary {@link StateTag} to a system-internal tag that is guaranteed not to
- * collide with any user tags.
- */
- public static <K, StateT extends State> StateTag<K, StateT> makeSystemTagInternal(
- StateTag<K, StateT> tag) {
- if (!(tag instanceof SystemStateTag)) {
- throw new IllegalArgumentException("Expected subclass of StateTagBase, got " + tag);
- }
- // Checked above
- @SuppressWarnings("unchecked")
- SystemStateTag<K, StateT> typedTag = (SystemStateTag<K, StateT>) tag;
- return typedTag.asKind(StateKind.SYSTEM);
- }
-
- public static <K, InputT, AccumT, OutputT> StateTag<Object, BagState<AccumT>>
- convertToBagTagInternal(
- StateTag<? super K, AccumulatorCombiningState<InputT, AccumT, OutputT>> combiningTag) {
- if (combiningTag instanceof KeyedCombiningValueStateTag) {
- // Checked above; conversion to a bag tag depends on the provided tag being one of those
- // created via the factory methods in this class.
- @SuppressWarnings("unchecked")
- KeyedCombiningValueStateTag<K, InputT, AccumT, OutputT> typedTag =
- (KeyedCombiningValueStateTag<K, InputT, AccumT, OutputT>) combiningTag;
- return typedTag.asBagTag();
- } else if (combiningTag instanceof KeyedCombiningValueWithContextStateTag) {
- @SuppressWarnings("unchecked")
- KeyedCombiningValueWithContextStateTag<K, InputT, AccumT, OutputT> typedTag =
- (KeyedCombiningValueWithContextStateTag<K, InputT, AccumT, OutputT>) combiningTag;
- return typedTag.asBagTag();
- } else {
- throw new IllegalArgumentException("Unexpected StateTag " + combiningTag);
- }
- }
-
- private static class StructuredId implements Serializable {
- private final StateKind kind;
- private final String rawId;
-
- private StructuredId(String rawId) {
- this(StateKind.USER, rawId);
- }
-
- private StructuredId(StateKind kind, String rawId) {
- this.kind = kind;
- this.rawId = rawId;
- }
-
- public StructuredId asKind(StateKind kind) {
- return new StructuredId(kind, rawId);
- }
-
- public void appendTo(Appendable sb) throws IOException {
- sb.append(kind.prefix).append(rawId);
- }
-
- public String getRawId() {
- return rawId;
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(getClass())
- .add("id", rawId)
- .add("kind", kind)
- .toString();
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj == this) {
- return true;
- }
-
- if (!(obj instanceof StructuredId)) {
- return false;
- }
-
- StructuredId that = (StructuredId) obj;
- return Objects.equals(this.kind, that.kind)
- && Objects.equals(this.rawId, that.rawId);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(kind, rawId);
- }
- }
-
- /**
- * A base class that just manages the structured ids.
- */
- private abstract static class StateTagBase<K, StateT extends State>
- implements StateTag<K, StateT>, SystemStateTag<K, StateT> {
-
- protected final StructuredId id;
-
- protected StateTagBase(StructuredId id) {
- this.id = id;
- }
-
- @Override
- public String getId() {
- return id.getRawId();
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(getClass())
- .add("id", id)
- .toString();
- }
-
- @Override
- public void appendTo(Appendable sb) throws IOException {
- id.appendTo(sb);
- }
-
- @Override
- public abstract StateTag<K, StateT> asKind(StateKind kind);
- }
-
- /**
- * A value state cell for values of type {@code T}.
- *
- * @param <T> the type of value being stored
- */
- private static class ValueStateTag<T> extends StateTagBase<Object, ValueState<T>>
- implements StateTag<Object, ValueState<T>> {
-
- private final Coder<T> coder;
-
- private ValueStateTag(StructuredId id, Coder<T> coder) {
- super(id);
- this.coder = coder;
- }
-
- @Override
- public ValueState<T> bind(StateBinder<? extends Object> visitor) {
- return visitor.bindValue(this, coder);
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj == this) {
- return true;
- }
-
- if (!(obj instanceof ValueStateTag)) {
- return false;
- }
-
- ValueStateTag<?> that = (ValueStateTag<?>) obj;
- return Objects.equals(this.id, that.id)
- && Objects.equals(this.coder, that.coder);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(getClass(), id, coder);
- }
-
- @Override
- public StateTag<Object, ValueState<T>> asKind(StateKind kind) {
- return new ValueStateTag<T>(id.asKind(kind), coder);
- }
- }
-
- /**
- * A state cell for values that are combined according to a {@link CombineFn}.
- *
- * @param <InputT> the type of input values
- * @param <AccumT> type of mutable accumulator values
- * @param <OutputT> type of output values
- */
- private static class CombiningValueStateTag<InputT, AccumT, OutputT>
- extends KeyedCombiningValueStateTag<Object, InputT, AccumT, OutputT>
- implements StateTag<Object, AccumulatorCombiningState<InputT, AccumT, OutputT>>,
- SystemStateTag<Object, AccumulatorCombiningState<InputT, AccumT, OutputT>> {
-
- private final Coder<AccumT> accumCoder;
- private final CombineFn<InputT, AccumT, OutputT> combineFn;
-
- private CombiningValueStateTag(
- StructuredId id,
- Coder<AccumT> accumCoder, CombineFn<InputT, AccumT, OutputT> combineFn) {
- super(id, accumCoder, combineFn.asKeyedFn());
- this.combineFn = combineFn;
- this.accumCoder = accumCoder;
- }
-
- @Override
- public StateTag<Object, AccumulatorCombiningState<InputT, AccumT, OutputT>>
- asKind(StateKind kind) {
- return new CombiningValueStateTag<InputT, AccumT, OutputT>(
- id.asKind(kind), accumCoder, combineFn);
- }
- }
-
- /**
- * A state cell for values that are combined according to a {@link KeyedCombineFnWithContext}.
- *
- * @param <K> the type of keys
- * @param <InputT> the type of input values
- * @param <AccumT> type of mutable accumulator values
- * @param <OutputT> type of output values
- */
- private static class KeyedCombiningValueWithContextStateTag<K, InputT, AccumT, OutputT>
- extends StateTagBase<K, AccumulatorCombiningState<InputT, AccumT, OutputT>>
- implements SystemStateTag<K, AccumulatorCombiningState<InputT, AccumT, OutputT>> {
-
- private final Coder<AccumT> accumCoder;
- private final KeyedCombineFnWithContext<K, InputT, AccumT, OutputT> combineFn;
-
- protected KeyedCombiningValueWithContextStateTag(
- StructuredId id,
- Coder<AccumT> accumCoder,
- KeyedCombineFnWithContext<K, InputT, AccumT, OutputT> combineFn) {
- super(id);
- this.combineFn = combineFn;
- this.accumCoder = accumCoder;
- }
-
- @Override
- public AccumulatorCombiningState<InputT, AccumT, OutputT> bind(
- StateBinder<? extends K> visitor) {
- return visitor.bindKeyedCombiningValueWithContext(this, accumCoder, combineFn);
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj == this) {
- return true;
- }
-
- if (!(obj instanceof KeyedCombiningValueWithContextStateTag)) {
- return false;
- }
-
- KeyedCombiningValueWithContextStateTag<?, ?, ?, ?> that =
- (KeyedCombiningValueWithContextStateTag<?, ?, ?, ?>) obj;
- return Objects.equals(this.id, that.id)
- && Objects.equals(this.accumCoder, that.accumCoder);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(getClass(), id, accumCoder);
- }
-
- @Override
- public StateTag<K, AccumulatorCombiningState<InputT, AccumT, OutputT>> asKind(
- StateKind kind) {
- return new KeyedCombiningValueWithContextStateTag<>(
- id.asKind(kind), accumCoder, combineFn);
- }
-
- private StateTag<Object, BagState<AccumT>> asBagTag() {
- return new BagStateTag<AccumT>(id, accumCoder);
- }
- }
-
- /**
- * A state cell for values that are combined according to a {@link KeyedCombineFn}.
- *
- * @param <K> the type of keys
- * @param <InputT> the type of input values
- * @param <AccumT> type of mutable accumulator values
- * @param <OutputT> type of output values
- */
- private static class KeyedCombiningValueStateTag<K, InputT, AccumT, OutputT>
- extends StateTagBase<K, AccumulatorCombiningState<InputT, AccumT, OutputT>>
- implements SystemStateTag<K, AccumulatorCombiningState<InputT, AccumT, OutputT>> {
-
- private final Coder<AccumT> accumCoder;
- private final KeyedCombineFn<K, InputT, AccumT, OutputT> keyedCombineFn;
-
- protected KeyedCombiningValueStateTag(
- StructuredId id,
- Coder<AccumT> accumCoder, KeyedCombineFn<K, InputT, AccumT, OutputT> keyedCombineFn) {
- super(id);
- this.keyedCombineFn = keyedCombineFn;
- this.accumCoder = accumCoder;
- }
-
- @Override
- public AccumulatorCombiningState<InputT, AccumT, OutputT> bind(
- StateBinder<? extends K> visitor) {
- return visitor.bindKeyedCombiningValue(this, accumCoder, keyedCombineFn);
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj == this) {
- return true;
- }
-
- if (!(obj instanceof CombiningValueStateTag)) {
- return false;
- }
-
- KeyedCombiningValueStateTag<?, ?, ?, ?> that = (KeyedCombiningValueStateTag<?, ?, ?, ?>) obj;
- return Objects.equals(this.id, that.id)
- && Objects.equals(this.accumCoder, that.accumCoder);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(getClass(), id, accumCoder);
- }
-
- @Override
- public StateTag<K, AccumulatorCombiningState<InputT, AccumT, OutputT>> asKind(
- StateKind kind) {
- return new KeyedCombiningValueStateTag<>(id.asKind(kind), accumCoder, keyedCombineFn);
- }
-
- private StateTag<Object, BagState<AccumT>> asBagTag() {
- return new BagStateTag<AccumT>(id, accumCoder);
- }
- }
-
- /**
- * A state cell optimized for bag-like access patterns (frequent additions, occasional reads
- * of all the values).
- *
- * @param <T> the type of value in the bag
- */
- private static class BagStateTag<T> extends StateTagBase<Object, BagState<T>>
- implements StateTag<Object, BagState<T>>{
-
- private final Coder<T> elemCoder;
-
- private BagStateTag(StructuredId id, Coder<T> elemCoder) {
- super(id);
- this.elemCoder = elemCoder;
- }
-
- @Override
- public BagState<T> bind(StateBinder<? extends Object> visitor) {
- return visitor.bindBag(this, elemCoder);
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj == this) {
- return true;
- }
-
- if (!(obj instanceof BagStateTag)) {
- return false;
- }
-
- BagStateTag<?> that = (BagStateTag<?>) obj;
- return Objects.equals(this.id, that.id)
- && Objects.equals(this.elemCoder, that.elemCoder);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(getClass(), id, elemCoder);
- }
-
- @Override
- public StateTag<Object, BagState<T>> asKind(StateKind kind) {
- return new BagStateTag<>(id.asKind(kind), elemCoder);
- }
- }
-
- private static class WatermarkStateTagInternal<W extends BoundedWindow>
- extends StateTagBase<Object, WatermarkHoldState<W>> {
-
- /**
- * When multiple output times are added to hold the watermark, this determines how they are
- * combined, and also the behavior when merging windows. Does not contribute to equality/hash
- * since we have at most one watermark hold tag per computation.
- */
- private final OutputTimeFn<? super W> outputTimeFn;
-
- private WatermarkStateTagInternal(StructuredId id, OutputTimeFn<? super W> outputTimeFn) {
- super(id);
- this.outputTimeFn = outputTimeFn;
- }
-
- @Override
- public WatermarkHoldState<W> bind(StateBinder<? extends Object> visitor) {
- return visitor.bindWatermark(this, outputTimeFn);
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj == this) {
- return true;
- }
-
- if (!(obj instanceof WatermarkStateTagInternal)) {
- return false;
- }
-
- WatermarkStateTagInternal<?> that = (WatermarkStateTagInternal<?>) obj;
- return Objects.equals(this.id, that.id);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(getClass(), id);
- }
-
- @Override
- public StateTag<Object, WatermarkHoldState<W>> asKind(StateKind kind) {
- return new WatermarkStateTagInternal<W>(id.asKind(kind), outputTimeFn);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/ValueState.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/ValueState.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/ValueState.java
deleted file mode 100644
index 19c12bb..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/ValueState.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
-
-/**
- * State holding a single value.
- *
- * @param <T> The type of values being stored.
- */
-@Experimental(Kind.STATE)
-public interface ValueState<T> extends ReadableState<T>, State {
- /**
- * Set the value of the buffer.
- */
- void write(T input);
-
- @Override
- ValueState<T> readLater();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/WatermarkHoldState.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/WatermarkHoldState.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/WatermarkHoldState.java
deleted file mode 100644
index 8a1adc9..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/WatermarkHoldState.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.OutputTimeFn;
-
-import org.joda.time.Instant;
-
-/**
- * A {@link State} accepting and aggregating output timestamps, which determines
- * the time to which the output watermark must be held.
- *
- * <p><b><i>For internal use only. This API may change at any time.</i></b>
- */
-@Experimental(Kind.STATE)
-public interface WatermarkHoldState<W extends BoundedWindow>
- extends CombiningState<Instant, Instant> {
- /**
- * Return the {@link OutputTimeFn} which will be used to determine a watermark hold time given
- * an element timestamp, and to combine watermarks from windows which are about to be merged.
- */
- OutputTimeFn<? super W> getOutputTimeFn();
-
- @Override
- WatermarkHoldState<W> readLater();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/KV.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/KV.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/KV.java
deleted file mode 100644
index 23cee07..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/KV.java
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.values;
-
-import com.google.cloud.dataflow.sdk.transforms.Combine;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.SerializableComparator;
-import com.google.common.base.MoreObjects;
-
-import java.io.Serializable;
-import java.util.Arrays;
-import java.util.Objects;
-
-/**
- * An immutable key/value pair.
- *
- * <p>Various {@link PTransform PTransforms} like {@link GroupByKey} and {@link Combine#perKey}
- * operate on {@link PCollection PCollections} of {@link KV KVs}.
- *
- * @param <K> the type of the key
- * @param <V> the type of the value
- */
-public class KV<K, V> implements Serializable {
- /** Returns a {@link KV} with the given key and value. */
- public static <K, V> KV<K, V> of(K key, V value) {
- return new KV<>(key, value);
- }
-
- /** Returns the key of this {@link KV}. */
- public K getKey() {
- return key;
- }
-
- /** Returns the value of this {@link KV}. */
- public V getValue() {
- return value;
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- final K key;
- final V value;
-
- private KV(K key, V value) {
- this.key = key;
- this.value = value;
- }
-
- @Override
- public boolean equals(Object other) {
- if (this == other) {
- return true;
- }
- if (!(other instanceof KV)) {
- return false;
- }
- KV<?, ?> otherKv = (KV<?, ?>) other;
- // Arrays are very common as values and keys, so deepEquals is mandatory
- return Objects.deepEquals(this.key, otherKv.key)
- && Objects.deepEquals(this.value, otherKv.value);
- }
-
- /**
- * A {@link Comparator} that orders {@link KV KVs} by the natural ordering of their keys.
- *
- * <p>A {@code null} key is less than any non-{@code null} key.
- */
- public static class OrderByKey<K extends Comparable<? super K>, V> implements
- SerializableComparator<KV<K, V>> {
- @Override
- public int compare(KV<K, V> a, KV<K, V> b) {
- if (a.key == null) {
- return b.key == null ? 0 : -1;
- } else if (b.key == null) {
- return 1;
- } else {
- return a.key.compareTo(b.key);
- }
- }
- }
-
- /**
- * A {@link Comparator} that orders {@link KV KVs} by the natural ordering of their values.
- *
- * <p>A {@code null} value is less than any non-{@code null} value.
- */
- public static class OrderByValue<K, V extends Comparable<? super V>>
- implements SerializableComparator<KV<K, V>> {
- @Override
- public int compare(KV<K, V> a, KV<K, V> b) {
- if (a.value == null) {
- return b.value == null ? 0 : -1;
- } else if (b.value == null) {
- return 1;
- } else {
- return a.value.compareTo(b.value);
- }
- }
- }
-
- @Override
- public int hashCode() {
- // Objects.deepEquals requires Arrays.deepHashCode for correctness
- return Arrays.deepHashCode(new Object[]{key, value});
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(this)
- .addValue(key)
- .addValue(value)
- .toString();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PBegin.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PBegin.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PBegin.java
deleted file mode 100644
index 23ac3ae..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PBegin.java
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.values;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.TextIO.Read;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-
-import java.util.Collection;
-import java.util.Collections;
-
-/**
- * {@link PBegin} is the "input" to a root {@link PTransform}, such as {@link Read Read} or
- * {@link Create}.
- *
- * <p>Typically created by calling {@link Pipeline#begin} on a Pipeline.
- */
-public class PBegin implements PInput {
- /**
- * Returns a {@link PBegin} in the given {@link Pipeline}.
- */
- public static PBegin in(Pipeline pipeline) {
- return new PBegin(pipeline);
- }
-
- /**
- * Like {@link #apply(String, PTransform)} but defaulting to the name
- * of the {@link PTransform}.
- */
- public <OutputT extends POutput> OutputT apply(
- PTransform<? super PBegin, OutputT> t) {
- return Pipeline.applyTransform(this, t);
- }
-
- /**
- * Applies the given {@link PTransform} to this input {@link PBegin},
- * using {@code name} to identify this specific application of the transform.
- * This name is used in various places, including the monitoring UI, logging,
- * and to stably identify this application node in the job graph.
- */
- public <OutputT extends POutput> OutputT apply(
- String name, PTransform<? super PBegin, OutputT> t) {
- return Pipeline.applyTransform(name, this, t);
- }
-
- @Override
- public Pipeline getPipeline() {
- return pipeline;
- }
-
- @Override
- public Collection<? extends PValue> expand() {
- // A PBegin contains no PValues.
- return Collections.emptyList();
- }
-
- @Override
- public void finishSpecifying() {
- // Nothing more to be done.
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * Constructs a {@link PBegin} in the given {@link Pipeline}.
- */
- protected PBegin(Pipeline pipeline) {
- this.pipeline = pipeline;
- }
-
- private final Pipeline pipeline;
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PCollection.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PCollection.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PCollection.java
deleted file mode 100644
index 6fffddf..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PCollection.java
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.values;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.io.PubsubIO;
-import com.google.cloud.dataflow.sdk.io.Read;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-
-/**
- * A {@link PCollection PCollection<T>} is an immutable collection of values of type
- * {@code T}. A {@link PCollection} can contain either a bounded or unbounded
- * number of elements. Bounded and unbounded {@link PCollection PCollections} are produced
- * as the output of {@link PTransform PTransforms}
- * (including root PTransforms like {@link Read} and {@link Create}), and can
- * be passed as the inputs of other PTransforms.
- *
- * <p>Some root transforms produce bounded {@code PCollections} and others
- * produce unbounded ones. For example, {@link TextIO.Read} reads a static set
- * of files, so it produces a bounded {@link PCollection}.
- * {@link PubsubIO.Read}, on the other hand, receives a potentially infinite stream
- * of Pubsub messages, so it produces an unbounded {@link PCollection}.
- *
- * <p>Each element in a {@link PCollection} may have an associated implicit
- * timestamp. Readers assign timestamps to elements when they create
- * {@link PCollection PCollections}, and other {@link PTransform PTransforms} propagate these
- * timestamps from their input to their output. For example, {@link PubsubIO.Read}
- * assigns pubsub message timestamps to elements, and {@link TextIO.Read} assigns
- * the default value {@link BoundedWindow#TIMESTAMP_MIN_VALUE} to elements. User code can
- * explicitly assign timestamps to elements with
- * {@link com.google.cloud.dataflow.sdk.transforms.DoFn.Context#outputWithTimestamp}.
- *
- * <p>Additionally, a {@link PCollection} has an associated
- * {@link WindowFn} and each element is assigned to a set of windows.
- * By default, the windowing function is {@link GlobalWindows}
- * and all elements are assigned into a single default window.
- * This default can be overridden with the {@link Window}
- * {@link PTransform}.
- *
- * <p>See the individual {@link PTransform} subclasses for specific information
- * on how they propagate timestamps and windowing.
- *
- * @param <T> the type of the elements of this {@link PCollection}
- */
-public class PCollection<T> extends TypedPValue<T> {
-
- /**
- * The enumeration of cases for whether a {@link PCollection} is bounded.
- */
- public enum IsBounded {
- /**
- * Indicates that a {@link PCollection} contains bounded data elements, such as
- * {@link PCollection PCollections} from {@link TextIO}, {@link BigQueryIO},
- * {@link Create}, etc.
- */
- BOUNDED,
- /**
- * Indicates that a {@link PCollection} contains unbounded data elements, such as
- * {@link PCollection PCollections} from {@link PubsubIO}.
- */
- UNBOUNDED;
-
- /**
- * Returns the composed IsBounded property.
- *
- * <p>The composed property is {@link #BOUNDED} only if all components are {@link #BOUNDED}.
- * Otherwise, it is {@link #UNBOUNDED}.
- */
- public IsBounded and(IsBounded that) {
- if (this == BOUNDED && that == BOUNDED) {
- return BOUNDED;
- } else {
- return UNBOUNDED;
- }
- }
- }
-
- /**
- * Returns the name of this {@link PCollection}.
- *
- * <p>By default, the name of a {@link PCollection} is based on the name of the
- * {@link PTransform} that produces it. It can be specified explicitly by
- * calling {@link #setName}.
- *
- * @throws IllegalStateException if the name hasn't been set yet
- */
- @Override
- public String getName() {
- return super.getName();
- }
-
- /**
- * Sets the name of this {@link PCollection}. Returns {@code this}.
- *
- * @throws IllegalStateException if this {@link PCollection} has already been
- * finalized and may no longer be set.
- * Once {@link #apply} has been called, this will be the case.
- */
- @Override
- public PCollection<T> setName(String name) {
- super.setName(name);
- return this;
- }
-
- /**
- * Returns the {@link Coder} used by this {@link PCollection} to encode and decode
- * the values stored in it.
- *
- * @throws IllegalStateException if the {@link Coder} hasn't been set, and
- * couldn't be inferred.
- */
- @Override
- public Coder<T> getCoder() {
- return super.getCoder();
- }
-
- /**
- * Sets the {@link Coder} used by this {@link PCollection} to encode and decode the
- * values stored in it. Returns {@code this}.
- *
- * @throws IllegalStateException if this {@link PCollection} has already
- * been finalized and may no longer be set.
- * Once {@link #apply} has been called, this will be the case.
- */
- @Override
- public PCollection<T> setCoder(Coder<T> coder) {
- super.setCoder(coder);
- return this;
- }
-
- /**
- * Like {@link #apply(String, PTransform)} but defaulting to the name
- * of the {@link PTransform}.
- *
- * @return the output of the applied {@link PTransform}
- */
- public <OutputT extends POutput> OutputT apply(PTransform<? super PCollection<T>, OutputT> t) {
- return Pipeline.applyTransform(this, t);
- }
-
- /**
- * Applies the given {@link PTransform} to this input {@link PCollection},
- * using {@code name} to identify this specific application of the transform.
- * This name is used in various places, including the monitoring UI, logging,
- * and to stably identify this application node in the job graph.
- *
- * @return the output of the applied {@link PTransform}
- */
- public <OutputT extends POutput> OutputT apply(
- String name, PTransform<? super PCollection<T>, OutputT> t) {
- return Pipeline.applyTransform(name, this, t);
- }
-
- /**
- * Returns the {@link WindowingStrategy} of this {@link PCollection}.
- */
- public WindowingStrategy<?, ?> getWindowingStrategy() {
- return windowingStrategy;
- }
-
- public IsBounded isBounded() {
- return isBounded;
- }
-
- /////////////////////////////////////////////////////////////////////////////
- // Internal details below here.
-
- /**
- * {@link WindowingStrategy} that will be used for merging windows and triggering output in this
- * {@link PCollection} and subsequent {@link PCollection PCollections} produced from this one.
- *
- * <p>By default, no merging is performed.
- */
- private WindowingStrategy<?, ?> windowingStrategy;
-
- private IsBounded isBounded;
-
- private PCollection(Pipeline p) {
- super(p);
- }
-
- /**
- * Sets the {@link TypeDescriptor TypeDescriptor<T>} for this
- * {@link PCollection PCollection<T>}. This may allow the enclosing
- * {@link PCollectionTuple}, {@link PCollectionList}, or {@code PTransform<?, PCollection<T>>},
- * etc., to provide more detailed reflective information.
- */
- @Override
- public PCollection<T> setTypeDescriptorInternal(TypeDescriptor<T> typeDescriptor) {
- super.setTypeDescriptorInternal(typeDescriptor);
- return this;
- }
-
- /**
- * Sets the {@link WindowingStrategy} of this {@link PCollection}.
- *
- * <p>For use by primitive transformations only.
- */
- public PCollection<T> setWindowingStrategyInternal(WindowingStrategy<?, ?> windowingStrategy) {
- this.windowingStrategy = windowingStrategy;
- return this;
- }
-
- /**
- * Sets the {@link PCollection.IsBounded} of this {@link PCollection}.
- *
- * <p>For use by internal transformations only.
- */
- public PCollection<T> setIsBoundedInternal(IsBounded isBounded) {
- this.isBounded = isBounded;
- return this;
- }
-
- /**
- * Creates and returns a new {@link PCollection} for a primitive output.
- *
- * <p>For use by primitive transformations only.
- */
- public static <T> PCollection<T> createPrimitiveOutputInternal(
- Pipeline pipeline,
- WindowingStrategy<?, ?> windowingStrategy,
- IsBounded isBounded) {
- return new PCollection<T>(pipeline)
- .setWindowingStrategyInternal(windowingStrategy)
- .setIsBoundedInternal(isBounded);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PCollectionList.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PCollectionList.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PCollectionList.java
deleted file mode 100644
index b99af02..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PCollectionList.java
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.values;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.Flatten;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.Partition;
-import com.google.common.collect.ImmutableList;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-
-/**
- * A {@link PCollectionList PCollectionList<T>} is an immutable list of homogeneously
- * typed {@link PCollection PCollection<T>s}. A {@link PCollectionList} is used, for
- * instance, as the input to
- * {@link Flatten} or the output of {@link Partition}.
- *
- * <p>PCollectionLists can be created and accessed as follows:
- * <pre> {@code
- * PCollection<String> pc1 = ...;
- * PCollection<String> pc2 = ...;
- * PCollection<String> pc3 = ...;
- *
- * // Create a PCollectionList with three PCollections:
- * PCollectionList<String> pcs = PCollectionList.of(pc1).and(pc2).and(pc3);
- *
- * // Create an empty PCollectionList:
- * Pipeline p = ...;
- * PCollectionList<String> pcs2 = PCollectionList.<String>empty(p);
- *
- * // Get PCollections out of a PCollectionList, by index (origin 0):
- * PCollection<String> pcX = pcs.get(1);
- * PCollection<String> pcY = pcs.get(0);
- * PCollection<String> pcZ = pcs.get(2);
- *
- * // Get a list of all PCollections in a PCollectionList:
- * List<PCollection<String>> allPcs = pcs.getAll();
- * } </pre>
- *
- * @param <T> the type of the elements of all the {@link PCollection PCollections} in this list
- */
-public class PCollectionList<T> implements PInput, POutput {
- /**
- * Returns an empty {@link PCollectionList} that is part of the given {@link Pipeline}.
- *
- * <p>Longer {@link PCollectionList PCollectionLists} can be created by calling
- * {@link #and} on the result.
- */
- public static <T> PCollectionList<T> empty(Pipeline pipeline) {
- return new PCollectionList<>(pipeline);
- }
-
- /**
- * Returns a singleton {@link PCollectionList} containing the given {@link PCollection}.
- *
- * <p>Longer {@link PCollectionList PCollectionLists} can be created by calling
- * {@link #and} on the result.
- */
- public static <T> PCollectionList<T> of(PCollection<T> pc) {
- return new PCollectionList<T>(pc.getPipeline()).and(pc);
- }
-
- /**
- * Returns a {@link PCollectionList} containing the given {@link PCollection PCollections},
- * in order.
- *
- * <p>The argument list cannot be empty.
- *
- * <p>All the {@link PCollection PCollections} in the resulting {@link PCollectionList} must be
- * part of the same {@link Pipeline}.
- *
- * <p>Longer PCollectionLists can be created by calling
- * {@link #and} on the result.
- */
- public static <T> PCollectionList<T> of(Iterable<PCollection<T>> pcs) {
- Iterator<PCollection<T>> pcsIter = pcs.iterator();
- if (!pcsIter.hasNext()) {
- throw new IllegalArgumentException(
- "must either have a non-empty list of PCollections, " +
- "or must first call empty(Pipeline)");
- }
- return new PCollectionList<T>(pcsIter.next().getPipeline()).and(pcs);
- }
-
- /**
- * Returns a new {@link PCollectionList} that has all the {@link PCollection PCollections} of
- * this {@link PCollectionList} plus the given {@link PCollection} appended to the end.
- *
- * <p>All the {@link PCollection PCollections} in the resulting {@link PCollectionList} must be
- * part of the same {@link Pipeline}.
- */
- public PCollectionList<T> and(PCollection<T> pc) {
- if (pc.getPipeline() != pipeline) {
- throw new IllegalArgumentException(
- "PCollections come from different Pipelines");
- }
- return new PCollectionList<>(pipeline,
- new ImmutableList.Builder<PCollection<T>>()
- .addAll(pcollections)
- .add(pc)
- .build());
- }
-
- /**
- * Returns a new {@link PCollectionList} that has all the {@link PCollection PCollections} of
- * this {@link PCollectionList} plus the given {@link PCollection PCollections} appended to the
- * end, in order.
- *
- * <p>All the {@link PCollection PCollections} in the resulting {@link PCollectionList} must be
- * part of the same {@link Pipeline}.
- */
- public PCollectionList<T> and(Iterable<PCollection<T>> pcs) {
- List<PCollection<T>> copy = new ArrayList<>(pcollections);
- for (PCollection<T> pc : pcs) {
- if (pc.getPipeline() != pipeline) {
- throw new IllegalArgumentException(
- "PCollections come from different Pipelines");
- }
- copy.add(pc);
- }
- return new PCollectionList<>(pipeline, copy);
- }
-
- /**
- * Returns the number of {@link PCollection PCollections} in this {@link PCollectionList}.
- */
- public int size() {
- return pcollections.size();
- }
-
- /**
- * Returns the {@link PCollection} at the given index (origin zero).
- *
- * @throws IndexOutOfBoundsException if the index is out of the range
- * {@code [0..size()-1]}.
- */
- public PCollection<T> get(int index) {
- return pcollections.get(index);
- }
-
- /**
- * Returns an immutable List of all the {@link PCollection PCollections} in this
- * {@link PCollectionList}.
- */
- public List<PCollection<T>> getAll() {
- return pcollections;
- }
-
- /**
- * Like {@link #apply(String, PTransform)} but defaulting to the name
- * of the {@code PTransform}.
- */
- public <OutputT extends POutput> OutputT apply(
- PTransform<PCollectionList<T>, OutputT> t) {
- return Pipeline.applyTransform(this, t);
- }
-
- /**
- * Applies the given {@link PTransform} to this input {@link PCollectionList},
- * using {@code name} to identify this specific application of the transform.
- * This name is used in various places, including the monitoring UI, logging,
- * and to stably identify this application node in the job graph.
- *
- * @return the output of the applied {@link PTransform}
- */
- public <OutputT extends POutput> OutputT apply(
- String name, PTransform<PCollectionList<T>, OutputT> t) {
- return Pipeline.applyTransform(name, this, t);
- }
-
- /////////////////////////////////////////////////////////////////////////////
- // Internal details below here.
-
- final Pipeline pipeline;
- final List<PCollection<T>> pcollections;
-
- PCollectionList(Pipeline pipeline) {
- this(pipeline, new ArrayList<PCollection<T>>());
- }
-
- PCollectionList(Pipeline pipeline, List<PCollection<T>> pcollections) {
- this.pipeline = pipeline;
- this.pcollections = Collections.unmodifiableList(pcollections);
- }
-
- @Override
- public Pipeline getPipeline() {
- return pipeline;
- }
-
- @Override
- public Collection<? extends PValue> expand() {
- return pcollections;
- }
-
- @Override
- public void recordAsOutput(AppliedPTransform<?, ?, ?> transform) {
- int i = 0;
- for (PCollection<T> pc : pcollections) {
- pc.recordAsOutput(transform, "out" + i);
- i++;
- }
- }
-
- @Override
- public void finishSpecifying() {
- for (PCollection<T> pc : pcollections) {
- pc.finishSpecifying();
- }
- }
-
- @Override
- public void finishSpecifyingOutput() {
- for (PCollection<T> pc : pcollections) {
- pc.finishSpecifyingOutput();
- }
- }
-}
[30/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessExecutionContext.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessExecutionContext.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessExecutionContext.java
deleted file mode 100644
index 43cd9eb..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessExecutionContext.java
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.runners.inprocess.InMemoryWatermarkManager.TimerUpdate;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InMemoryWatermarkManager.TransformWatermarks;
-import com.google.cloud.dataflow.sdk.util.BaseExecutionContext;
-import com.google.cloud.dataflow.sdk.util.ExecutionContext;
-import com.google.cloud.dataflow.sdk.util.TimerInternals;
-import com.google.cloud.dataflow.sdk.util.common.worker.StateSampler;
-import com.google.cloud.dataflow.sdk.util.state.CopyOnAccessInMemoryStateInternals;
-
-/**
- * Execution Context for the {@link InProcessPipelineRunner}.
- *
- * This implementation is not thread safe. A new {@link InProcessExecutionContext} must be created
- * for each thread that requires it.
- */
-class InProcessExecutionContext
- extends BaseExecutionContext<InProcessExecutionContext.InProcessStepContext> {
- private final Clock clock;
- private final Object key;
- private final CopyOnAccessInMemoryStateInternals<Object> existingState;
- private final TransformWatermarks watermarks;
-
- public InProcessExecutionContext(Clock clock, Object key,
- CopyOnAccessInMemoryStateInternals<Object> existingState, TransformWatermarks watermarks) {
- this.clock = clock;
- this.key = key;
- this.existingState = existingState;
- this.watermarks = watermarks;
- }
-
- @Override
- protected InProcessStepContext createStepContext(
- String stepName, String transformName, StateSampler stateSampler) {
- return new InProcessStepContext(this, stepName, transformName);
- }
-
- /**
- * Step Context for the {@link InProcessPipelineRunner}.
- */
- public class InProcessStepContext
- extends com.google.cloud.dataflow.sdk.util.BaseExecutionContext.StepContext {
- private CopyOnAccessInMemoryStateInternals<Object> stateInternals;
- private InProcessTimerInternals timerInternals;
-
- public InProcessStepContext(
- ExecutionContext executionContext, String stepName, String transformName) {
- super(executionContext, stepName, transformName);
- }
-
- @Override
- public CopyOnAccessInMemoryStateInternals<Object> stateInternals() {
- if (stateInternals == null) {
- stateInternals = CopyOnAccessInMemoryStateInternals.withUnderlying(key, existingState);
- }
- return stateInternals;
- }
-
- @Override
- public InProcessTimerInternals timerInternals() {
- if (timerInternals == null) {
- timerInternals =
- InProcessTimerInternals.create(clock, watermarks, TimerUpdate.builder(key));
- }
- return timerInternals;
- }
-
- /**
- * Commits the state of this step, and returns the committed state. If the step has not
- * accessed any state, returns {@code null}.
- */
- public CopyOnAccessInMemoryStateInternals<?> commitState() {
- if (stateInternals != null) {
- return stateInternals.commit();
- }
- return null;
- }
-
- /**
- * Gets the timer update of the {@link TimerInternals} of this {@link InProcessStepContext},
- * which is empty if the {@link TimerInternals} were never accessed.
- */
- public TimerUpdate getTimerUpdate() {
- if (timerInternals == null) {
- return TimerUpdate.empty();
- }
- return timerInternals.getTimerUpdate();
- }
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessExecutor.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessExecutor.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessExecutor.java
deleted file mode 100644
index 7b60bca..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessExecutor.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-
-import java.util.Collection;
-
-/**
- * An executor that schedules and executes {@link AppliedPTransform AppliedPTransforms} for both
- * source and intermediate {@link PTransform PTransforms}.
- */
-interface InProcessExecutor {
- /**
- * Starts this executor. The provided collection is the collection of root transforms to
- * initially schedule.
- *
- * @param rootTransforms the root transforms to initially schedule
- */
- void start(Collection<AppliedPTransform<?, ?, ?>> rootTransforms);
-
- /**
- * Blocks until the job being executed enters a terminal state. A job is completed after all
- * root {@link AppliedPTransform AppliedPTransforms} have completed, and all
- * {@link CommittedBundle Bundles} have been consumed. Jobs may also terminate abnormally.
- *
- * @throws Throwable whenever an executor thread throws anything, transfers the throwable to the
- * waiting thread and rethrows it
- */
- void awaitCompletion() throws Throwable;
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessPipelineOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessPipelineOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessPipelineOptions.java
deleted file mode 100644
index 5ee0e88..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessPipelineOptions.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.options.ApplicationNameOptions;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.Hidden;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.Validation.Required;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-
-import com.fasterxml.jackson.annotation.JsonIgnore;
-
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-
-/**
- * Options that can be used to configure the {@link InProcessPipelineRunner}.
- */
-public interface InProcessPipelineOptions extends PipelineOptions, ApplicationNameOptions {
- /**
- * Gets the {@link ExecutorServiceFactory} to use to create instances of {@link ExecutorService}
- * to execute {@link PTransform PTransforms}.
- *
- * <p>Note that {@link ExecutorService ExecutorServices} returned by the factory must ensure that
- * it cannot enter a state in which it will not schedule additional pending work unless currently
- * scheduled work completes, as this may cause the {@link Pipeline} to cease processing.
- *
- * <p>Defaults to a {@link CachedThreadPoolExecutorServiceFactory}, which produces instances of
- * {@link Executors#newCachedThreadPool()}.
- */
- @JsonIgnore
- @Required
- @Hidden
- @Default.InstanceFactory(CachedThreadPoolExecutorServiceFactory.class)
- ExecutorServiceFactory getExecutorServiceFactory();
-
- void setExecutorServiceFactory(ExecutorServiceFactory executorService);
-
- /**
- * Gets the {@link Clock} used by this pipeline. The clock is used in place of accessing the
- * system time when time values are required by the evaluator.
- */
- @Default.InstanceFactory(NanosOffsetClock.Factory.class)
- @JsonIgnore
- @Required
- @Hidden
- @Description(
- "The processing time source used by the pipeline. When the current time is "
- + "needed by the evaluator, the result of clock#now() is used.")
- Clock getClock();
-
- void setClock(Clock clock);
-
- @Default.Boolean(false)
- @Description(
- "If the pipeline should shut down producers which have reached the maximum "
- + "representable watermark. If this is set to true, a pipeline in which all PTransforms "
- + "have reached the maximum watermark will be shut down, even if there are unbounded "
- + "sources that could produce additional (late) data. By default, if the pipeline "
- + "contains any unbounded PCollections, it will run until explicitly shut down.")
- boolean isShutdownUnboundedProducersWithMaxWatermark();
-
- void setShutdownUnboundedProducersWithMaxWatermark(boolean shutdown);
-
- @Default.Boolean(true)
- @Description(
- "If the pipeline should block awaiting completion of the pipeline. If set to true, "
- + "a call to Pipeline#run() will block until all PTransforms are complete. Otherwise, "
- + "the Pipeline will execute asynchronously. If set to false, the completion of the "
- + "pipeline can be awaited on by use of InProcessPipelineResult#awaitCompletion().")
- boolean isBlockOnRun();
-
- void setBlockOnRun(boolean b);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessPipelineRunner.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessPipelineRunner.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessPipelineRunner.java
deleted file mode 100644
index a1c8756..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessPipelineRunner.java
+++ /dev/null
@@ -1,343 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.Pipeline.PipelineExecutionException;
-import com.google.cloud.dataflow.sdk.PipelineResult;
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.runners.AggregatorPipelineExtractor;
-import com.google.cloud.dataflow.sdk.runners.AggregatorRetrievalException;
-import com.google.cloud.dataflow.sdk.runners.AggregatorValues;
-import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
-import com.google.cloud.dataflow.sdk.runners.inprocess.GroupByKeyEvaluatorFactory.InProcessGroupByKey;
-import com.google.cloud.dataflow.sdk.runners.inprocess.GroupByKeyEvaluatorFactory.InProcessGroupByKeyOnly;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.View.CreatePCollectionView;
-import com.google.cloud.dataflow.sdk.util.InstanceBuilder;
-import com.google.cloud.dataflow.sdk.util.MapAggregatorValues;
-import com.google.cloud.dataflow.sdk.util.TimerInternals.TimerData;
-import com.google.cloud.dataflow.sdk.util.UserCodeException;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.util.common.Counter;
-import com.google.cloud.dataflow.sdk.util.common.CounterSet;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.POutput;
-import com.google.cloud.dataflow.sdk.values.PValue;
-import com.google.common.base.Throwables;
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.ImmutableSet;
-
-import org.joda.time.Instant;
-
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.ExecutorService;
-
-import javax.annotation.Nullable;
-
-/**
- * An In-Memory implementation of the Dataflow Programming Model. Supports Unbounded
- * {@link PCollection PCollections}.
- */
-@Experimental
-public class InProcessPipelineRunner
- extends PipelineRunner<InProcessPipelineRunner.InProcessPipelineResult> {
- /**
- * The default set of transform overrides to use in the {@link InProcessPipelineRunner}.
- *
- * <p>A transform override must have a single-argument constructor that takes an instance of the
- * type of transform it is overriding.
- */
- @SuppressWarnings("rawtypes")
- private static Map<Class<? extends PTransform>, Class<? extends PTransform>>
- defaultTransformOverrides =
- ImmutableMap.<Class<? extends PTransform>, Class<? extends PTransform>>builder()
- .put(Create.Values.class, InProcessCreate.class)
- .put(GroupByKey.class, InProcessGroupByKey.class)
- .put(
- CreatePCollectionView.class,
- ViewEvaluatorFactory.InProcessCreatePCollectionView.class)
- .build();
-
- /**
- * Part of a {@link PCollection}. Elements are output to a bundle, which will cause them to be
- * executed by {@link PTransform PTransforms} that consume the {@link PCollection} this bundle is
- * a part of at a later point. This is an uncommitted bundle and can have elements added to it.
- *
- * @param <T> the type of elements that can be added to this bundle
- */
- public static interface UncommittedBundle<T> {
- /**
- * Returns the PCollection that the elements of this {@link UncommittedBundle} belong to.
- */
- PCollection<T> getPCollection();
-
- /**
- * Outputs an element to this bundle.
- *
- * @param element the element to add to this bundle
- * @return this bundle
- */
- UncommittedBundle<T> add(WindowedValue<T> element);
-
- /**
- * Commits this {@link UncommittedBundle}, returning an immutable {@link CommittedBundle}
- * containing all of the elements that were added to it. The {@link #add(WindowedValue)} method
- * will throw an {@link IllegalStateException} if called after a call to commit.
- * @param synchronizedProcessingTime the synchronized processing time at which this bundle was
- * committed
- */
- CommittedBundle<T> commit(Instant synchronizedProcessingTime);
- }
-
- /**
- * Part of a {@link PCollection}. Elements are output to an {@link UncommittedBundle}, which will
- * eventually committed. Committed elements are executed by the {@link PTransform PTransforms}
- * that consume the {@link PCollection} this bundle is
- * a part of at a later point.
- * @param <T> the type of elements contained within this bundle
- */
- public static interface CommittedBundle<T> {
- /**
- * Returns the PCollection that the elements of this bundle belong to.
- */
- PCollection<T> getPCollection();
-
- /**
- * Returns whether this bundle is keyed. A bundle that is part of a {@link PCollection} that
- * occurs after a {@link GroupByKey} is keyed by the result of the last {@link GroupByKey}.
- */
- boolean isKeyed();
-
- /**
- * Returns the (possibly null) key that was output in the most recent {@link GroupByKey} in the
- * execution of this bundle.
- */
- @Nullable
- Object getKey();
-
- /**
- * Returns an {@link Iterable} containing all of the elements that have been added to this
- * {@link CommittedBundle}.
- */
- Iterable<WindowedValue<T>> getElements();
-
- /**
- * Returns the processing time output watermark at the time the producing {@link PTransform}
- * committed this bundle. Downstream synchronized processing time watermarks cannot progress
- * past this point before consuming this bundle.
- *
- * <p>This value is no greater than the earliest incomplete processing time or synchronized
- * processing time {@link TimerData timer} at the time this bundle was committed, including any
- * timers that fired to produce this bundle.
- */
- Instant getSynchronizedProcessingOutputWatermark();
- }
-
- /**
- * A {@link PCollectionViewWriter} is responsible for writing contents of a {@link PCollection} to
- * a storage mechanism that can be read from while constructing a {@link PCollectionView}.
- * @param <ElemT> the type of elements the input {@link PCollection} contains.
- * @param <ViewT> the type of the PCollectionView this writer writes to.
- */
- public static interface PCollectionViewWriter<ElemT, ViewT> {
- void add(Iterable<WindowedValue<ElemT>> values);
- }
-
- ////////////////////////////////////////////////////////////////////////////////////////////////
- private final InProcessPipelineOptions options;
-
- public static InProcessPipelineRunner fromOptions(PipelineOptions options) {
- return new InProcessPipelineRunner(options.as(InProcessPipelineOptions.class));
- }
-
- private InProcessPipelineRunner(InProcessPipelineOptions options) {
- this.options = options;
- }
-
- /**
- * Returns the {@link PipelineOptions} used to create this {@link InProcessPipelineRunner}.
- */
- public InProcessPipelineOptions getPipelineOptions() {
- return options;
- }
-
- @Override
- public <OutputT extends POutput, InputT extends PInput> OutputT apply(
- PTransform<InputT, OutputT> transform, InputT input) {
- Class<?> overrideClass = defaultTransformOverrides.get(transform.getClass());
- if (overrideClass != null) {
- // It is the responsibility of whoever constructs overrides to ensure this is type safe.
- @SuppressWarnings("unchecked")
- Class<PTransform<InputT, OutputT>> transformClass =
- (Class<PTransform<InputT, OutputT>>) transform.getClass();
-
- @SuppressWarnings("unchecked")
- Class<PTransform<InputT, OutputT>> customTransformClass =
- (Class<PTransform<InputT, OutputT>>) overrideClass;
-
- PTransform<InputT, OutputT> customTransform =
- InstanceBuilder.ofType(customTransformClass)
- .withArg(transformClass, transform)
- .build();
-
- // This overrides the contents of the apply method without changing the TransformTreeNode that
- // is generated by the PCollection application.
- return super.apply(customTransform, input);
- } else {
- return super.apply(transform, input);
- }
- }
-
- @Override
- public InProcessPipelineResult run(Pipeline pipeline) {
- ConsumerTrackingPipelineVisitor consumerTrackingVisitor = new ConsumerTrackingPipelineVisitor();
- pipeline.traverseTopologically(consumerTrackingVisitor);
- for (PValue unfinalized : consumerTrackingVisitor.getUnfinalizedPValues()) {
- unfinalized.finishSpecifying();
- }
- @SuppressWarnings("rawtypes")
- KeyedPValueTrackingVisitor keyedPValueVisitor =
- KeyedPValueTrackingVisitor.create(
- ImmutableSet.<Class<? extends PTransform>>of(
- GroupByKey.class, InProcessGroupByKeyOnly.class));
- pipeline.traverseTopologically(keyedPValueVisitor);
-
- InProcessEvaluationContext context =
- InProcessEvaluationContext.create(
- getPipelineOptions(),
- consumerTrackingVisitor.getRootTransforms(),
- consumerTrackingVisitor.getValueToConsumers(),
- consumerTrackingVisitor.getStepNames(),
- consumerTrackingVisitor.getViews());
-
- // independent executor service for each run
- ExecutorService executorService =
- context.getPipelineOptions().getExecutorServiceFactory().create();
- InProcessExecutor executor =
- ExecutorServiceParallelExecutor.create(
- executorService,
- consumerTrackingVisitor.getValueToConsumers(),
- keyedPValueVisitor.getKeyedPValues(),
- TransformEvaluatorRegistry.defaultRegistry(),
- context);
- executor.start(consumerTrackingVisitor.getRootTransforms());
-
- Map<Aggregator<?, ?>, Collection<PTransform<?, ?>>> aggregatorSteps =
- new AggregatorPipelineExtractor(pipeline).getAggregatorSteps();
- InProcessPipelineResult result =
- new InProcessPipelineResult(executor, context, aggregatorSteps);
- if (options.isBlockOnRun()) {
- try {
- result.awaitCompletion();
- } catch (UserCodeException userException) {
- throw new PipelineExecutionException(userException.getCause());
- } catch (Throwable t) {
- Throwables.propagate(t);
- }
- }
- return result;
- }
-
- /**
- * The result of running a {@link Pipeline} with the {@link InProcessPipelineRunner}.
- *
- * Throws {@link UnsupportedOperationException} for all methods.
- */
- public static class InProcessPipelineResult implements PipelineResult {
- private final InProcessExecutor executor;
- private final InProcessEvaluationContext evaluationContext;
- private final Map<Aggregator<?, ?>, Collection<PTransform<?, ?>>> aggregatorSteps;
- private State state;
-
- private InProcessPipelineResult(
- InProcessExecutor executor,
- InProcessEvaluationContext evaluationContext,
- Map<Aggregator<?, ?>, Collection<PTransform<?, ?>>> aggregatorSteps) {
- this.executor = executor;
- this.evaluationContext = evaluationContext;
- this.aggregatorSteps = aggregatorSteps;
- // Only ever constructed after the executor has started.
- this.state = State.RUNNING;
- }
-
- @Override
- public State getState() {
- return state;
- }
-
- @Override
- public <T> AggregatorValues<T> getAggregatorValues(Aggregator<?, T> aggregator)
- throws AggregatorRetrievalException {
- CounterSet counters = evaluationContext.getCounters();
- Collection<PTransform<?, ?>> steps = aggregatorSteps.get(aggregator);
- Map<String, T> stepValues = new HashMap<>();
- for (AppliedPTransform<?, ?, ?> transform : evaluationContext.getSteps()) {
- if (steps.contains(transform.getTransform())) {
- String stepName =
- String.format(
- "user-%s-%s", evaluationContext.getStepName(transform), aggregator.getName());
- Counter<T> counter = (Counter<T>) counters.getExistingCounter(stepName);
- if (counter != null) {
- stepValues.put(transform.getFullName(), counter.getAggregate());
- }
- }
- }
- return new MapAggregatorValues<>(stepValues);
- }
-
- /**
- * Blocks until the {@link Pipeline} execution represented by this
- * {@link InProcessPipelineResult} is complete, returning the terminal state.
- *
- * <p>If the pipeline terminates abnormally by throwing an exception, this will rethrow the
- * exception. Future calls to {@link #getState()} will return
- * {@link com.google.cloud.dataflow.sdk.PipelineResult.State#FAILED}.
- *
- * <p>NOTE: if the {@link Pipeline} contains an {@link IsBounded#UNBOUNDED unbounded}
- * {@link PCollection}, and the {@link PipelineRunner} was created with
- * {@link InProcessPipelineOptions#isShutdownUnboundedProducersWithMaxWatermark()} set to false,
- * this method will never return.
- *
- * See also {@link InProcessExecutor#awaitCompletion()}.
- */
- public State awaitCompletion() throws Throwable {
- if (!state.isTerminal()) {
- try {
- executor.awaitCompletion();
- state = State.DONE;
- } catch (InterruptedException e) {
- Thread.currentThread().interrupt();
- throw e;
- } catch (Throwable t) {
- state = State.FAILED;
- throw t;
- }
- }
- return state;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessSideInputContainer.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessSideInputContainer.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessSideInputContainer.java
deleted file mode 100644
index 37c9fcf..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessSideInputContainer.java
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo;
-import com.google.cloud.dataflow.sdk.util.PCollectionViewWindow;
-import com.google.cloud.dataflow.sdk.util.SideInputReader;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.common.base.MoreObjects;
-import com.google.common.base.Throwables;
-import com.google.common.cache.CacheBuilder;
-import com.google.common.cache.CacheLoader;
-import com.google.common.cache.LoadingCache;
-import com.google.common.collect.ImmutableSet;
-import com.google.common.collect.Sets;
-import com.google.common.util.concurrent.SettableFuture;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Set;
-import java.util.concurrent.ExecutionException;
-
-import javax.annotation.Nullable;
-
-/**
- * An in-process container for {@link PCollectionView PCollectionViews}, which provides methods for
- * constructing {@link SideInputReader SideInputReaders} which block until a side input is
- * available and writing to a {@link PCollectionView}.
- */
-class InProcessSideInputContainer {
- private final InProcessEvaluationContext evaluationContext;
- private final Collection<PCollectionView<?>> containedViews;
- private final LoadingCache<PCollectionViewWindow<?>,
- SettableFuture<Iterable<? extends WindowedValue<?>>>> viewByWindows;
-
- /**
- * Create a new {@link InProcessSideInputContainer} with the provided views and the provided
- * context.
- */
- public static InProcessSideInputContainer create(
- InProcessEvaluationContext context, Collection<PCollectionView<?>> containedViews) {
- CacheLoader<PCollectionViewWindow<?>, SettableFuture<Iterable<? extends WindowedValue<?>>>>
- loader = new CacheLoader<PCollectionViewWindow<?>,
- SettableFuture<Iterable<? extends WindowedValue<?>>>>() {
- @Override
- public SettableFuture<Iterable<? extends WindowedValue<?>>> load(
- PCollectionViewWindow<?> view) {
- return SettableFuture.create();
- }
- };
- LoadingCache<PCollectionViewWindow<?>, SettableFuture<Iterable<? extends WindowedValue<?>>>>
- viewByWindows = CacheBuilder.newBuilder().build(loader);
- return new InProcessSideInputContainer(context, containedViews, viewByWindows);
- }
-
- private InProcessSideInputContainer(InProcessEvaluationContext context,
- Collection<PCollectionView<?>> containedViews,
- LoadingCache<PCollectionViewWindow<?>, SettableFuture<Iterable<? extends WindowedValue<?>>>>
- viewByWindows) {
- this.evaluationContext = context;
- this.containedViews = ImmutableSet.copyOf(containedViews);
- this.viewByWindows = viewByWindows;
- }
-
- /**
- * Return a view of this {@link InProcessSideInputContainer} that contains only the views in
- * the provided argument. The returned {@link InProcessSideInputContainer} is unmodifiable without
- * casting, but will change as this {@link InProcessSideInputContainer} is modified.
- */
- public SideInputReader createReaderForViews(Collection<PCollectionView<?>> newContainedViews) {
- if (!containedViews.containsAll(newContainedViews)) {
- Set<PCollectionView<?>> currentlyContained = ImmutableSet.copyOf(containedViews);
- Set<PCollectionView<?>> newRequested = ImmutableSet.copyOf(newContainedViews);
- throw new IllegalArgumentException("Can't create a SideInputReader with unknown views "
- + Sets.difference(newRequested, currentlyContained));
- }
- return new SideInputContainerSideInputReader(newContainedViews);
- }
-
- /**
- * Write the provided values to the provided view.
- *
- * <p>The windowed values are first exploded, then for each window the pane is determined. For
- * each window, if the pane is later than the current pane stored within this container, write
- * all of the values to the container as the new values of the {@link PCollectionView}.
- *
- * <p>The provided iterable is expected to contain only a single window and pane.
- */
- public void write(PCollectionView<?> view, Iterable<? extends WindowedValue<?>> values) {
- Map<BoundedWindow, Collection<WindowedValue<?>>> valuesPerWindow =
- indexValuesByWindow(values);
- for (Map.Entry<BoundedWindow, Collection<WindowedValue<?>>> windowValues :
- valuesPerWindow.entrySet()) {
- updatePCollectionViewWindowValues(view, windowValues.getKey(), windowValues.getValue());
- }
- }
-
- /**
- * Index the provided values by all {@link BoundedWindow windows} in which they appear.
- */
- private Map<BoundedWindow, Collection<WindowedValue<?>>> indexValuesByWindow(
- Iterable<? extends WindowedValue<?>> values) {
- Map<BoundedWindow, Collection<WindowedValue<?>>> valuesPerWindow = new HashMap<>();
- for (WindowedValue<?> value : values) {
- for (BoundedWindow window : value.getWindows()) {
- Collection<WindowedValue<?>> windowValues = valuesPerWindow.get(window);
- if (windowValues == null) {
- windowValues = new ArrayList<>();
- valuesPerWindow.put(window, windowValues);
- }
- windowValues.add(value);
- }
- }
- return valuesPerWindow;
- }
-
- /**
- * Set the value of the {@link PCollectionView} in the {@link BoundedWindow} to be based on the
- * specified values, if the values are part of a later pane than currently exist within the
- * {@link PCollectionViewWindow}.
- */
- private void updatePCollectionViewWindowValues(
- PCollectionView<?> view, BoundedWindow window, Collection<WindowedValue<?>> windowValues) {
- PCollectionViewWindow<?> windowedView = PCollectionViewWindow.of(view, window);
- SettableFuture<Iterable<? extends WindowedValue<?>>> future = null;
- try {
- future = viewByWindows.get(windowedView);
- if (future.isDone()) {
- Iterator<? extends WindowedValue<?>> existingValues = future.get().iterator();
- PaneInfo newPane = windowValues.iterator().next().getPane();
- // The current value may have no elements, if no elements were produced for the window,
- // but we are recieving late data.
- if (!existingValues.hasNext()
- || newPane.getIndex() > existingValues.next().getPane().getIndex()) {
- viewByWindows.invalidate(windowedView);
- viewByWindows.get(windowedView).set(windowValues);
- }
- } else {
- future.set(windowValues);
- }
- } catch (InterruptedException e) {
- Thread.currentThread().interrupt();
- if (future != null && !future.isDone()) {
- future.set(Collections.<WindowedValue<?>>emptyList());
- }
- } catch (ExecutionException e) {
- Throwables.propagate(e.getCause());
- }
- }
-
- private final class SideInputContainerSideInputReader implements SideInputReader {
- private final Collection<PCollectionView<?>> readerViews;
-
- private SideInputContainerSideInputReader(Collection<PCollectionView<?>> readerViews) {
- this.readerViews = ImmutableSet.copyOf(readerViews);
- }
-
- @Override
- @Nullable
- public <T> T get(final PCollectionView<T> view, final BoundedWindow window) {
- checkArgument(
- readerViews.contains(view), "calling get(PCollectionView) with unknown view: " + view);
- PCollectionViewWindow<T> windowedView = PCollectionViewWindow.of(view, window);
- try {
- final SettableFuture<Iterable<? extends WindowedValue<?>>> future =
- viewByWindows.get(windowedView);
-
- WindowingStrategy<?, ?> windowingStrategy = view.getWindowingStrategyInternal();
- evaluationContext.scheduleAfterOutputWouldBeProduced(
- view, window, windowingStrategy, new Runnable() {
- @Override
- public void run() {
- // The requested window has closed without producing elements, so reflect that in
- // the PCollectionView. If set has already been called, will do nothing.
- future.set(Collections.<WindowedValue<?>>emptyList());
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper("InProcessSideInputContainerEmptyCallback")
- .add("view", view)
- .add("window", window)
- .toString();
- }
- });
- // Safe covariant cast
- @SuppressWarnings("unchecked")
- Iterable<WindowedValue<?>> values = (Iterable<WindowedValue<?>>) future.get();
- return view.fromIterableInternal(values);
- } catch (InterruptedException e) {
- Thread.currentThread().interrupt();
- return null;
- } catch (ExecutionException e) {
- throw new RuntimeException(e);
- }
- }
-
- @Override
- public <T> boolean contains(PCollectionView<T> view) {
- return readerViews.contains(view);
- }
-
- @Override
- public boolean isEmpty() {
- return readerViews.isEmpty();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessTimerInternals.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessTimerInternals.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessTimerInternals.java
deleted file mode 100644
index 06ba7b8..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessTimerInternals.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.runners.inprocess.InMemoryWatermarkManager.TimerUpdate;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InMemoryWatermarkManager.TimerUpdate.TimerUpdateBuilder;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InMemoryWatermarkManager.TransformWatermarks;
-import com.google.cloud.dataflow.sdk.util.TimerInternals;
-
-import org.joda.time.Instant;
-
-import javax.annotation.Nullable;
-
-/**
- * An implementation of {@link TimerInternals} where all relevant data exists in memory.
- */
-public class InProcessTimerInternals implements TimerInternals {
- private final Clock processingTimeClock;
- private final TransformWatermarks watermarks;
- private final TimerUpdateBuilder timerUpdateBuilder;
-
- public static InProcessTimerInternals create(
- Clock clock, TransformWatermarks watermarks, TimerUpdateBuilder timerUpdateBuilder) {
- return new InProcessTimerInternals(clock, watermarks, timerUpdateBuilder);
- }
-
- private InProcessTimerInternals(
- Clock clock, TransformWatermarks watermarks, TimerUpdateBuilder timerUpdateBuilder) {
- this.processingTimeClock = clock;
- this.watermarks = watermarks;
- this.timerUpdateBuilder = timerUpdateBuilder;
- }
-
- @Override
- public void setTimer(TimerData timerKey) {
- timerUpdateBuilder.setTimer(timerKey);
- }
-
- @Override
- public void deleteTimer(TimerData timerKey) {
- timerUpdateBuilder.deletedTimer(timerKey);
- }
-
- public TimerUpdate getTimerUpdate() {
- return timerUpdateBuilder.build();
- }
-
- @Override
- public Instant currentProcessingTime() {
- return processingTimeClock.now();
- }
-
- @Override
- @Nullable
- public Instant currentSynchronizedProcessingTime() {
- return watermarks.getSynchronizedProcessingInputTime();
- }
-
- @Override
- @Nullable
- public Instant currentInputWatermarkTime() {
- return watermarks.getInputWatermark();
- }
-
- @Override
- @Nullable
- public Instant currentOutputWatermarkTime() {
- return watermarks.getOutputWatermark();
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessTransformResult.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessTransformResult.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessTransformResult.java
deleted file mode 100644
index 3f9e94a..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessTransformResult.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.runners.inprocess.InMemoryWatermarkManager.TimerUpdate;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.UncommittedBundle;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.common.CounterSet;
-import com.google.cloud.dataflow.sdk.util.state.CopyOnAccessInMemoryStateInternals;
-
-import org.joda.time.Instant;
-
-import javax.annotation.Nullable;
-
-/**
- * The result of evaluating an {@link AppliedPTransform} with a {@link TransformEvaluator}.
- */
-public interface InProcessTransformResult {
- /**
- * Returns the {@link AppliedPTransform} that produced this result.
- */
- AppliedPTransform<?, ?, ?> getTransform();
-
- /**
- * Returns the {@link UncommittedBundle (uncommitted) Bundles} output by this transform. These
- * will be committed by the evaluation context as part of completing this result.
- */
- Iterable<? extends UncommittedBundle<?>> getOutputBundles();
-
- /**
- * Returns the {@link CounterSet} used by this {@link PTransform}, or null if this transform did
- * not use a {@link CounterSet}.
- */
- @Nullable CounterSet getCounters();
-
- /**
- * Returns the Watermark Hold for the transform at the time this result was produced.
- *
- * <p>If the transform does not set any watermark hold, returns
- * {@link BoundedWindow#TIMESTAMP_MAX_VALUE}.
- */
- Instant getWatermarkHold();
-
- /**
- * Returns the State used by the transform.
- *
- * <p>If this evaluation did not access state, this may return {@code null}.
- */
- CopyOnAccessInMemoryStateInternals<?> getState();
-
- /**
- * Returns the {@link TimerUpdate} that was produced as a result of this evaluation. If the
- * evaluation was triggered due to the delivery of one or more timers, those timers must be added
- * to the builder before it is complete.
- *
- * <p>If this evaluation did not add or remove any timers, returns an empty TimerUpdate.
- */
- TimerUpdate getTimerUpdate();
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/KeyedPValueTrackingVisitor.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/KeyedPValueTrackingVisitor.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/KeyedPValueTrackingVisitor.java
deleted file mode 100644
index 23a8c0f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/KeyedPValueTrackingVisitor.java
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import static com.google.common.base.Preconditions.checkState;
-
-import com.google.cloud.dataflow.sdk.Pipeline.PipelineVisitor;
-import com.google.cloud.dataflow.sdk.runners.TransformTreeNode;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.values.PValue;
-
-import java.util.HashSet;
-import java.util.Set;
-
-/**
- * A pipeline visitor that tracks all keyed {@link PValue PValues}. A {@link PValue} is keyed if it
- * is the result of a {@link PTransform} that produces keyed outputs. A {@link PTransform} that
- * produces keyed outputs is assumed to colocate output elements that share a key.
- *
- * <p>All {@link GroupByKey} transforms, or their runner-specific implementation primitive, produce
- * keyed output.
- */
-// TODO: Handle Key-preserving transforms when appropriate and more aggressively make PTransforms
-// unkeyed
-class KeyedPValueTrackingVisitor implements PipelineVisitor {
- @SuppressWarnings("rawtypes")
- private final Set<Class<? extends PTransform>> producesKeyedOutputs;
- private final Set<PValue> keyedValues;
- private boolean finalized;
-
- public static KeyedPValueTrackingVisitor create(
- @SuppressWarnings("rawtypes") Set<Class<? extends PTransform>> producesKeyedOutputs) {
- return new KeyedPValueTrackingVisitor(producesKeyedOutputs);
- }
-
- private KeyedPValueTrackingVisitor(
- @SuppressWarnings("rawtypes") Set<Class<? extends PTransform>> producesKeyedOutputs) {
- this.producesKeyedOutputs = producesKeyedOutputs;
- this.keyedValues = new HashSet<>();
- }
-
- @Override
- public void enterCompositeTransform(TransformTreeNode node) {
- checkState(
- !finalized,
- "Attempted to use a %s that has already been finalized on a pipeline (visiting node %s)",
- KeyedPValueTrackingVisitor.class.getSimpleName(),
- node);
- }
-
- @Override
- public void leaveCompositeTransform(TransformTreeNode node) {
- checkState(
- !finalized,
- "Attempted to use a %s that has already been finalized on a pipeline (visiting node %s)",
- KeyedPValueTrackingVisitor.class.getSimpleName(),
- node);
- if (node.isRootNode()) {
- finalized = true;
- } else if (producesKeyedOutputs.contains(node.getTransform().getClass())) {
- keyedValues.addAll(node.getExpandedOutputs());
- }
- }
-
- @Override
- public void visitTransform(TransformTreeNode node) {}
-
- @Override
- public void visitValue(PValue value, TransformTreeNode producer) {
- if (producesKeyedOutputs.contains(producer.getTransform().getClass())) {
- keyedValues.addAll(value.expand());
- }
- }
-
- public Set<PValue> getKeyedPValues() {
- checkState(
- finalized, "can't call getKeyedPValues before a Pipeline has been completely traversed");
- return keyedValues;
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/NanosOffsetClock.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/NanosOffsetClock.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/NanosOffsetClock.java
deleted file mode 100644
index 958e26d..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/NanosOffsetClock.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-
-import org.joda.time.Instant;
-
-import java.util.concurrent.TimeUnit;
-
-/**
- * A {@link Clock} that uses {@link System#nanoTime()} to track the progress of time.
- */
-public class NanosOffsetClock implements Clock {
- private final long baseMillis;
- private final long nanosAtBaseMillis;
-
- public static NanosOffsetClock create() {
- return new NanosOffsetClock();
- }
-
- private NanosOffsetClock() {
- baseMillis = System.currentTimeMillis();
- nanosAtBaseMillis = System.nanoTime();
- }
-
- @Override
- public Instant now() {
- return new Instant(
- baseMillis + (TimeUnit.MILLISECONDS.convert(
- System.nanoTime() - nanosAtBaseMillis, TimeUnit.NANOSECONDS)));
- }
-
- /**
- * Creates instances of {@link NanosOffsetClock}.
- */
- public static class Factory implements DefaultValueFactory<Clock> {
- @Override
- public Clock create(PipelineOptions options) {
- return new NanosOffsetClock();
- }
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ParDoInProcessEvaluator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ParDoInProcessEvaluator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ParDoInProcessEvaluator.java
deleted file mode 100644
index 2a21e8c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ParDoInProcessEvaluator.java
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessExecutionContext.InProcessStepContext;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.UncommittedBundle;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.util.DoFnRunner;
-import com.google.cloud.dataflow.sdk.util.DoFnRunners.OutputManager;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.util.common.CounterSet;
-import com.google.cloud.dataflow.sdk.util.state.CopyOnAccessInMemoryStateInternals;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-class ParDoInProcessEvaluator<T> implements TransformEvaluator<T> {
- private final DoFnRunner<T, ?> fnRunner;
- private final AppliedPTransform<PCollection<T>, ?, ?> transform;
- private final CounterSet counters;
- private final Collection<UncommittedBundle<?>> outputBundles;
- private final InProcessStepContext stepContext;
-
- public ParDoInProcessEvaluator(
- DoFnRunner<T, ?> fnRunner,
- AppliedPTransform<PCollection<T>, ?, ?> transform,
- CounterSet counters,
- Collection<UncommittedBundle<?>> outputBundles,
- InProcessStepContext stepContext) {
- this.fnRunner = fnRunner;
- this.transform = transform;
- this.counters = counters;
- this.outputBundles = outputBundles;
- this.stepContext = stepContext;
- }
-
- @Override
- public void processElement(WindowedValue<T> element) {
- fnRunner.processElement(element);
- }
-
- @Override
- public InProcessTransformResult finishBundle() {
- fnRunner.finishBundle();
- StepTransformResult.Builder resultBuilder;
- CopyOnAccessInMemoryStateInternals<?> state = stepContext.commitState();
- if (state != null) {
- resultBuilder =
- StepTransformResult.withHold(transform, state.getEarliestWatermarkHold())
- .withState(state);
- } else {
- resultBuilder = StepTransformResult.withoutHold(transform);
- }
- return resultBuilder
- .addOutput(outputBundles)
- .withTimerUpdate(stepContext.getTimerUpdate())
- .withCounters(counters)
- .build();
- }
-
- static class BundleOutputManager implements OutputManager {
- private final Map<TupleTag<?>, UncommittedBundle<?>> bundles;
- private final Map<TupleTag<?>, List<?>> undeclaredOutputs;
-
- public static BundleOutputManager create(Map<TupleTag<?>, UncommittedBundle<?>> outputBundles) {
- return new BundleOutputManager(outputBundles);
- }
-
- private BundleOutputManager(Map<TupleTag<?>, UncommittedBundle<?>> bundles) {
- this.bundles = bundles;
- undeclaredOutputs = new HashMap<>();
- }
-
- @SuppressWarnings("unchecked")
- @Override
- public <T> void output(TupleTag<T> tag, WindowedValue<T> output) {
- @SuppressWarnings("rawtypes")
- UncommittedBundle bundle = bundles.get(tag);
- if (bundle == null) {
- List undeclaredContents = undeclaredOutputs.get(tag);
- if (undeclaredContents == null) {
- undeclaredContents = new ArrayList<T>();
- undeclaredOutputs.put(tag, undeclaredContents);
- }
- undeclaredContents.add(output);
- } else {
- bundle.add(output);
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ParDoMultiEvaluatorFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ParDoMultiEvaluatorFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ParDoMultiEvaluatorFactory.java
deleted file mode 100644
index 659bdd2..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ParDoMultiEvaluatorFactory.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessExecutionContext.InProcessStepContext;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.UncommittedBundle;
-import com.google.cloud.dataflow.sdk.runners.inprocess.ParDoInProcessEvaluator.BundleOutputManager;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo.BoundMulti;
-import com.google.cloud.dataflow.sdk.util.DoFnRunner;
-import com.google.cloud.dataflow.sdk.util.DoFnRunners;
-import com.google.cloud.dataflow.sdk.util.common.CounterSet;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionTuple;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * The {@link InProcessPipelineRunner} {@link TransformEvaluatorFactory} for the
- * {@link BoundMulti} primitive {@link PTransform}.
- */
-class ParDoMultiEvaluatorFactory implements TransformEvaluatorFactory {
- @Override
- public <T> TransformEvaluator<T> forApplication(
- AppliedPTransform<?, ?, ?> application,
- CommittedBundle<?> inputBundle,
- InProcessEvaluationContext evaluationContext) {
- @SuppressWarnings({"cast", "unchecked", "rawtypes"})
- TransformEvaluator<T> evaluator = (TransformEvaluator<T>) createMultiEvaluator(
- (AppliedPTransform) application, inputBundle, evaluationContext);
- return evaluator;
- }
-
- private static <InT, OuT> ParDoInProcessEvaluator<InT> createMultiEvaluator(
- AppliedPTransform<PCollection<InT>, PCollectionTuple, BoundMulti<InT, OuT>> application,
- CommittedBundle<InT> inputBundle,
- InProcessEvaluationContext evaluationContext) {
- PCollectionTuple output = application.getOutput();
- Map<TupleTag<?>, PCollection<?>> outputs = output.getAll();
- Map<TupleTag<?>, UncommittedBundle<?>> outputBundles = new HashMap<>();
- for (Map.Entry<TupleTag<?>, PCollection<?>> outputEntry : outputs.entrySet()) {
- outputBundles.put(
- outputEntry.getKey(),
- evaluationContext.createBundle(inputBundle, outputEntry.getValue()));
- }
- InProcessExecutionContext executionContext =
- evaluationContext.getExecutionContext(application, inputBundle.getKey());
- String stepName = evaluationContext.getStepName(application);
- InProcessStepContext stepContext =
- executionContext.getOrCreateStepContext(stepName, stepName, null);
-
- CounterSet counters = evaluationContext.createCounterSet();
-
- DoFn<InT, OuT> fn = application.getTransform().getFn();
- DoFnRunner<InT, OuT> runner =
- DoFnRunners.createDefault(
- evaluationContext.getPipelineOptions(),
- fn,
- evaluationContext.createSideInputReader(application.getTransform().getSideInputs()),
- BundleOutputManager.create(outputBundles),
- application.getTransform().getMainOutputTag(),
- application.getTransform().getSideOutputTags().getAll(),
- stepContext,
- counters.getAddCounterMutator(),
- application.getInput().getWindowingStrategy());
-
- runner.startBundle();
-
- return new ParDoInProcessEvaluator<>(
- runner, application, counters, outputBundles.values(), stepContext);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ParDoSingleEvaluatorFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ParDoSingleEvaluatorFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ParDoSingleEvaluatorFactory.java
deleted file mode 100644
index e9bc1f7..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ParDoSingleEvaluatorFactory.java
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessExecutionContext.InProcessStepContext;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.UncommittedBundle;
-import com.google.cloud.dataflow.sdk.runners.inprocess.ParDoInProcessEvaluator.BundleOutputManager;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo.Bound;
-import com.google.cloud.dataflow.sdk.util.DoFnRunner;
-import com.google.cloud.dataflow.sdk.util.DoFnRunners;
-import com.google.cloud.dataflow.sdk.util.common.CounterSet;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-
-import java.util.Collections;
-
-/**
- * The {@link InProcessPipelineRunner} {@link TransformEvaluatorFactory} for the
- * {@link Bound ParDo.Bound} primitive {@link PTransform}.
- */
-class ParDoSingleEvaluatorFactory implements TransformEvaluatorFactory {
- @Override
- public <T> TransformEvaluator<T> forApplication(
- final AppliedPTransform<?, ?, ?> application,
- CommittedBundle<?> inputBundle,
- InProcessEvaluationContext evaluationContext) {
- @SuppressWarnings({"cast", "unchecked", "rawtypes"})
- TransformEvaluator<T> evaluator = (TransformEvaluator<T>) createSingleEvaluator(
- (AppliedPTransform) application, inputBundle, evaluationContext);
- return evaluator;
- }
-
- private static <InputT, OutputT> ParDoInProcessEvaluator<InputT> createSingleEvaluator(
- @SuppressWarnings("rawtypes") AppliedPTransform<PCollection<InputT>, PCollection<OutputT>,
- Bound<InputT, OutputT>> application,
- CommittedBundle<InputT> inputBundle, InProcessEvaluationContext evaluationContext) {
- TupleTag<OutputT> mainOutputTag = new TupleTag<>("out");
- UncommittedBundle<OutputT> outputBundle =
- evaluationContext.createBundle(inputBundle, application.getOutput());
-
- InProcessExecutionContext executionContext =
- evaluationContext.getExecutionContext(application, inputBundle.getKey());
- String stepName = evaluationContext.getStepName(application);
- InProcessStepContext stepContext =
- executionContext.getOrCreateStepContext(stepName, stepName, null);
-
- CounterSet counters = evaluationContext.createCounterSet();
-
- DoFnRunner<InputT, OutputT> runner =
- DoFnRunners.createDefault(
- evaluationContext.getPipelineOptions(),
- application.getTransform().getFn(),
- evaluationContext.createSideInputReader(application.getTransform().getSideInputs()),
- BundleOutputManager.create(
- Collections.<TupleTag<?>, UncommittedBundle<?>>singletonMap(
- mainOutputTag, outputBundle)),
- mainOutputTag,
- Collections.<TupleTag<?>>emptyList(),
- stepContext,
- counters.getAddCounterMutator(),
- application.getInput().getWindowingStrategy());
-
- runner.startBundle();
- return new ParDoInProcessEvaluator<InputT>(
- runner,
- application,
- counters,
- Collections.<UncommittedBundle<?>>singleton(outputBundle),
- stepContext);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/StepAndKey.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/StepAndKey.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/StepAndKey.java
deleted file mode 100644
index 1595572..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/StepAndKey.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.common.base.MoreObjects;
-
-import java.util.Objects;
-
-/**
- * A (Step, Key) pair. This is useful as a map key or cache key for things that are available
- * per-step in a keyed manner (e.g. State).
- */
-final class StepAndKey {
- private final AppliedPTransform<?, ?, ?> step;
- private final Object key;
-
- /**
- * Create a new {@link StepAndKey} with the provided step and key.
- */
- public static StepAndKey of(AppliedPTransform<?, ?, ?> step, Object key) {
- return new StepAndKey(step, key);
- }
-
- private StepAndKey(AppliedPTransform<?, ?, ?> step, Object key) {
- this.step = step;
- this.key = key;
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(StepAndKey.class)
- .add("step", step.getFullName())
- .add("key", key)
- .toString();
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(step, key);
- }
-
- @Override
- public boolean equals(Object other) {
- if (other == this) {
- return true;
- } else if (!(other instanceof StepAndKey)) {
- return false;
- } else {
- StepAndKey that = (StepAndKey) other;
- return Objects.equals(this.step, that.step)
- && Objects.equals(this.key, that.key);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/StepTransformResult.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/StepTransformResult.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/StepTransformResult.java
deleted file mode 100644
index 3c4ee29..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/StepTransformResult.java
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.runners.inprocess.InMemoryWatermarkManager.TimerUpdate;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.UncommittedBundle;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.common.CounterSet;
-import com.google.cloud.dataflow.sdk.util.state.CopyOnAccessInMemoryStateInternals;
-import com.google.common.base.MoreObjects;
-import com.google.common.collect.ImmutableList;
-
-import org.joda.time.Instant;
-
-import java.util.Collection;
-
-/**
- * An immutable {@link InProcessTransformResult}.
- */
-public class StepTransformResult implements InProcessTransformResult {
- private final AppliedPTransform<?, ?, ?> transform;
- private final Iterable<? extends UncommittedBundle<?>> bundles;
- private final CopyOnAccessInMemoryStateInternals<?> state;
- private final TimerUpdate timerUpdate;
- private final CounterSet counters;
- private final Instant watermarkHold;
-
- private StepTransformResult(
- AppliedPTransform<?, ?, ?> transform,
- Iterable<? extends UncommittedBundle<?>> outputBundles,
- CopyOnAccessInMemoryStateInternals<?> state,
- TimerUpdate timerUpdate,
- CounterSet counters,
- Instant watermarkHold) {
- this.transform = transform;
- this.bundles = outputBundles;
- this.state = state;
- this.timerUpdate = timerUpdate;
- this.counters = counters;
- this.watermarkHold = watermarkHold;
- }
-
- @Override
- public Iterable<? extends UncommittedBundle<?>> getOutputBundles() {
- return bundles;
- }
-
- @Override
- public CounterSet getCounters() {
- return counters;
- }
-
- @Override
- public AppliedPTransform<?, ?, ?> getTransform() {
- return transform;
- }
-
- @Override
- public Instant getWatermarkHold() {
- return watermarkHold;
- }
-
- @Override
- public CopyOnAccessInMemoryStateInternals<?> getState() {
- return state;
- }
-
- @Override
- public TimerUpdate getTimerUpdate() {
- return timerUpdate;
- }
-
- public static Builder withHold(AppliedPTransform<?, ?, ?> transform, Instant watermarkHold) {
- return new Builder(transform, watermarkHold);
- }
-
- public static Builder withoutHold(AppliedPTransform<?, ?, ?> transform) {
- return new Builder(transform, BoundedWindow.TIMESTAMP_MAX_VALUE);
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(StepTransformResult.class)
- .add("transform", transform)
- .toString();
- }
-
- /**
- * A builder for creating instances of {@link StepTransformResult}.
- */
- public static class Builder {
- private final AppliedPTransform<?, ?, ?> transform;
- private final ImmutableList.Builder<UncommittedBundle<?>> bundlesBuilder;
- private CopyOnAccessInMemoryStateInternals<?> state;
- private TimerUpdate timerUpdate;
- private CounterSet counters;
- private final Instant watermarkHold;
-
- private Builder(AppliedPTransform<?, ?, ?> transform, Instant watermarkHold) {
- this.transform = transform;
- this.watermarkHold = watermarkHold;
- this.bundlesBuilder = ImmutableList.builder();
- this.timerUpdate = TimerUpdate.builder(null).build();
- }
-
- public StepTransformResult build() {
- return new StepTransformResult(
- transform,
- bundlesBuilder.build(),
- state,
- timerUpdate,
- counters,
- watermarkHold);
- }
-
- public Builder withCounters(CounterSet counters) {
- this.counters = counters;
- return this;
- }
-
- public Builder withState(CopyOnAccessInMemoryStateInternals<?> state) {
- this.state = state;
- return this;
- }
-
- public Builder withTimerUpdate(TimerUpdate timerUpdate) {
- this.timerUpdate = timerUpdate;
- return this;
- }
-
- public Builder addOutput(
- UncommittedBundle<?> outputBundle, UncommittedBundle<?>... outputBundles) {
- bundlesBuilder.add(outputBundle);
- bundlesBuilder.add(outputBundles);
- return this;
- }
-
- public Builder addOutput(Collection<UncommittedBundle<?>> outputBundles) {
- bundlesBuilder.addAll(outputBundles);
- return this;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformEvaluator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformEvaluator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformEvaluator.java
deleted file mode 100644
index 270557d..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformEvaluator.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-
-/**
- * An evaluator of a specific application of a transform. Will be used for at least one
- * {@link CommittedBundle}.
- *
- * @param <InputT> the type of elements that will be passed to {@link #processElement}
- */
-public interface TransformEvaluator<InputT> {
- /**
- * Process an element in the input {@link CommittedBundle}.
- *
- * @param element the element to process
- */
- void processElement(WindowedValue<InputT> element) throws Exception;
-
- /**
- * Finish processing the bundle of this {@link TransformEvaluator}.
- *
- * After {@link #finishBundle()} is called, the {@link TransformEvaluator} will not be reused,
- * and no more elements will be processed.
- *
- * @return an {@link InProcessTransformResult} containing the results of this bundle evaluation.
- */
- InProcessTransformResult finishBundle() throws Exception;
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformEvaluatorFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformEvaluatorFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformEvaluatorFactory.java
deleted file mode 100644
index 860ddfe..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformEvaluatorFactory.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-
-import javax.annotation.Nullable;
-
-/**
- * A factory for creating instances of {@link TransformEvaluator} for the application of a
- * {@link PTransform}.
- */
-public interface TransformEvaluatorFactory {
- /**
- * Create a new {@link TransformEvaluator} for the application of the {@link PTransform}.
- *
- * Any work that must be done before input elements are processed (such as calling
- * {@link DoFn#startBundle(DoFn.Context)}) must be done before the {@link TransformEvaluator} is
- * made available to the caller.
- *
- * @throws Exception whenever constructing the underlying evaluator throws an exception
- */
- <InputT> TransformEvaluator<InputT> forApplication(
- AppliedPTransform<?, ?, ?> application, @Nullable CommittedBundle<?> inputBundle,
- InProcessEvaluationContext evaluationContext) throws Exception;
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformEvaluatorRegistry.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformEvaluatorRegistry.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformEvaluatorRegistry.java
deleted file mode 100644
index 0c8cb7e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformEvaluatorRegistry.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.io.Read;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.Flatten.FlattenPCollectionList;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.common.collect.ImmutableMap;
-
-import java.util.Map;
-
-import javax.annotation.Nullable;
-
-/**
- * A {@link TransformEvaluatorFactory} that delegates to primitive {@link TransformEvaluatorFactory}
- * implementations based on the type of {@link PTransform} of the application.
- */
-class TransformEvaluatorRegistry implements TransformEvaluatorFactory {
- public static TransformEvaluatorRegistry defaultRegistry() {
- @SuppressWarnings("rawtypes")
- ImmutableMap<Class<? extends PTransform>, TransformEvaluatorFactory> primitives =
- ImmutableMap.<Class<? extends PTransform>, TransformEvaluatorFactory>builder()
- .put(Read.Bounded.class, new BoundedReadEvaluatorFactory())
- .put(Read.Unbounded.class, new UnboundedReadEvaluatorFactory())
- .put(ParDo.Bound.class, new ParDoSingleEvaluatorFactory())
- .put(ParDo.BoundMulti.class, new ParDoMultiEvaluatorFactory())
- .put(
- GroupByKeyEvaluatorFactory.InProcessGroupByKeyOnly.class,
- new GroupByKeyEvaluatorFactory())
- .put(FlattenPCollectionList.class, new FlattenEvaluatorFactory())
- .put(ViewEvaluatorFactory.WriteView.class, new ViewEvaluatorFactory())
- .build();
- return new TransformEvaluatorRegistry(primitives);
- }
-
- // the TransformEvaluatorFactories can construct instances of all generic types of transform,
- // so all instances of a primitive can be handled with the same evaluator factory.
- @SuppressWarnings("rawtypes")
- private final Map<Class<? extends PTransform>, TransformEvaluatorFactory> factories;
-
- private TransformEvaluatorRegistry(
- @SuppressWarnings("rawtypes")
- Map<Class<? extends PTransform>, TransformEvaluatorFactory> factories) {
- this.factories = factories;
- }
-
- @Override
- public <InputT> TransformEvaluator<InputT> forApplication(
- AppliedPTransform<?, ?, ?> application,
- @Nullable CommittedBundle<?> inputBundle,
- InProcessEvaluationContext evaluationContext)
- throws Exception {
- TransformEvaluatorFactory factory = factories.get(application.getTransform().getClass());
- return factory.forApplication(application, inputBundle, evaluationContext);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformExecutor.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformExecutor.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformExecutor.java
deleted file mode 100644
index d630749..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformExecutor.java
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.common.base.Throwables;
-
-import java.util.concurrent.Callable;
-
-import javax.annotation.Nullable;
-
-/**
- * A {@link Callable} responsible for constructing a {@link TransformEvaluator} from a
- * {@link TransformEvaluatorFactory} and evaluating it on some bundle of input, and registering
- * the result using a registered {@link CompletionCallback}.
- *
- * <p>A {@link TransformExecutor} that is currently executing also provides access to the thread
- * that it is being executed on.
- */
-class TransformExecutor<T> implements Callable<InProcessTransformResult> {
- public static <T> TransformExecutor<T> create(
- TransformEvaluatorFactory factory,
- InProcessEvaluationContext evaluationContext,
- CommittedBundle<T> inputBundle,
- AppliedPTransform<?, ?, ?> transform,
- CompletionCallback completionCallback,
- TransformExecutorService transformEvaluationState) {
- return new TransformExecutor<>(
- factory,
- evaluationContext,
- inputBundle,
- transform,
- completionCallback,
- transformEvaluationState);
- }
-
- private final TransformEvaluatorFactory evaluatorFactory;
- private final InProcessEvaluationContext evaluationContext;
-
- /** The transform that will be evaluated. */
- private final AppliedPTransform<?, ?, ?> transform;
- /** The inputs this {@link TransformExecutor} will deliver to the transform. */
- private final CommittedBundle<T> inputBundle;
-
- private final CompletionCallback onComplete;
- private final TransformExecutorService transformEvaluationState;
-
- private Thread thread;
-
- private TransformExecutor(
- TransformEvaluatorFactory factory,
- InProcessEvaluationContext evaluationContext,
- CommittedBundle<T> inputBundle,
- AppliedPTransform<?, ?, ?> transform,
- CompletionCallback completionCallback,
- TransformExecutorService transformEvaluationState) {
- this.evaluatorFactory = factory;
- this.evaluationContext = evaluationContext;
-
- this.inputBundle = inputBundle;
- this.transform = transform;
-
- this.onComplete = completionCallback;
-
- this.transformEvaluationState = transformEvaluationState;
- }
-
- @Override
- public InProcessTransformResult call() {
- this.thread = Thread.currentThread();
- try {
- TransformEvaluator<T> evaluator =
- evaluatorFactory.forApplication(transform, inputBundle, evaluationContext);
- if (inputBundle != null) {
- for (WindowedValue<T> value : inputBundle.getElements()) {
- evaluator.processElement(value);
- }
- }
- InProcessTransformResult result = evaluator.finishBundle();
- onComplete.handleResult(inputBundle, result);
- return result;
- } catch (Throwable t) {
- onComplete.handleThrowable(inputBundle, t);
- throw Throwables.propagate(t);
- } finally {
- this.thread = null;
- transformEvaluationState.complete(this);
- }
- }
-
- /**
- * If this {@link TransformExecutor} is currently executing, return the thread it is executing in.
- * Otherwise, return null.
- */
- @Nullable
- public Thread getThread() {
- return this.thread;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformExecutorService.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformExecutorService.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformExecutorService.java
deleted file mode 100644
index 3f00da6..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformExecutorService.java
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-/**
- * Schedules and completes {@link TransformExecutor TransformExecutors}, controlling concurrency as
- * appropriate for the {@link StepAndKey} the executor exists for.
- */
-interface TransformExecutorService {
- /**
- * Schedule the provided work to be eventually executed.
- */
- void schedule(TransformExecutor<?> work);
-
- /**
- * Finish executing the provided work. This may cause additional
- * {@link TransformExecutor TransformExecutors} to be evaluated.
- */
- void complete(TransformExecutor<?> completed);
-}
-
[11/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MergingActiveWindowSet.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MergingActiveWindowSet.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MergingActiveWindowSet.java
deleted file mode 100644
index 96629b1..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MergingActiveWindowSet.java
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.coders.MapCoder;
-import com.google.cloud.dataflow.sdk.coders.SetCoder;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
-import com.google.cloud.dataflow.sdk.util.state.StateInternals;
-import com.google.cloud.dataflow.sdk.util.state.StateNamespaces;
-import com.google.cloud.dataflow.sdk.util.state.StateTag;
-import com.google.cloud.dataflow.sdk.util.state.StateTags;
-import com.google.cloud.dataflow.sdk.util.state.ValueState;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Iterables;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.LinkedHashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import javax.annotation.Nullable;
-
-/**
- * An {@link ActiveWindowSet} for merging {@link WindowFn} implementations.
- *
- * <p>The underlying notion of {@link MergingActiveWindowSet} is that of representing equivalence
- * classes of merged windows as a mapping from the merged "super-window" to a set of
- * <i>state address</i> windows in which some state has been persisted. The mapping need not
- * contain EPHEMERAL windows, because they are created and merged without any persistent state.
- * Each window must be a state address window for at most one window, so the mapping is
- * invertible.
- *
- * <p>The states of a non-expired window are treated as follows:
- *
- * <ul>
- * <li><b>NEW</b>: a NEW has an empty set of associated state address windows.</li>
- * <li><b>ACTIVE</b>: an ACTIVE window will be associated with some nonempty set of state
- * address windows. If the window has not merged, this will necessarily be the singleton set
- * containing just itself, but it is not required that an ACTIVE window be amongst its
- * state address windows.</li>
- * <li><b>MERGED</b>: a MERGED window will be in the set of associated windows for some
- * other window - that window is retrieved via {@link #representative} (this reverse
- * association is implemented in O(1) time).</li>
- * <li><b>EPHEMERAL</b>: EPHEMERAL windows are not persisted but are tracked transiently;
- * an EPHEMERAL window must be registered with this {@link ActiveWindowSet} by a call
- * to {@link #recordMerge} prior to any request for a {@link #representative}.</li>
- * </ul>
- *
- * <p>To illustrate why an ACTIVE window need not be amongst its own state address windows,
- * consider two active windows W1 and W2 that are merged to form W12. Further writes may be
- * applied to either of W1 or W2, since a read of W12 implies reading both of W12 and merging
- * their results. Hence W12 need not have state directly associated with it.
- */
-public class MergingActiveWindowSet<W extends BoundedWindow> implements ActiveWindowSet<W> {
- private final WindowFn<Object, W> windowFn;
- private final Map<W, Set<W>> activeWindowToStateAddressWindows;
-
- /**
- * As above, but only for EPHEMERAL windows. Does not need to be persisted.
- */
- private final Map<W, Set<W>> activeWindowToEphemeralWindows;
-
- /**
- * A map from window to the ACTIVE window it has been merged into.
- *
- * <p>Does not need to be persisted.
- *
- * <ul>
- * <li>Key window may be ACTIVE, MERGED or EPHEMERAL.
- * <li>ACTIVE windows map to themselves.
- * <li>If W1 maps to W2 then W2 is in {@link #activeWindowToStateAddressWindows}.
- * <li>If W1 = W2 then W1 is ACTIVE. If W1 is in the state address window set for W2 then W1 is
- * MERGED. Otherwise W1 is EPHEMERAL.
- * </ul>
- */
- private final Map<W, W> windowToActiveWindow;
-
- /**
- * Deep clone of {@link #activeWindowToStateAddressWindows} as of last commit.
- *
- * <p>Used to avoid writing to state if no changes have been made during the work unit.
- */
- private final Map<W, Set<W>> originalActiveWindowToStateAddressWindows;
-
- /**
- * Handle representing our state in the backend.
- */
- private final ValueState<Map<W, Set<W>>> valueState;
-
- public MergingActiveWindowSet(WindowFn<Object, W> windowFn, StateInternals<?> state) {
- this.windowFn = windowFn;
-
- StateTag<Object, ValueState<Map<W, Set<W>>>> mergeTreeAddr =
- StateTags.makeSystemTagInternal(StateTags.value(
- "tree", MapCoder.of(windowFn.windowCoder(), SetCoder.of(windowFn.windowCoder()))));
- valueState = state.state(StateNamespaces.global(), mergeTreeAddr);
- // Little use trying to prefetch this state since the ReduceFnRunner is stymied until it is
- // available.
- activeWindowToStateAddressWindows = emptyIfNull(valueState.read());
- activeWindowToEphemeralWindows = new HashMap<>();
- originalActiveWindowToStateAddressWindows = deepCopy(activeWindowToStateAddressWindows);
- windowToActiveWindow = invert(activeWindowToStateAddressWindows);
- }
-
- @Override
- public void removeEphemeralWindows() {
- for (Map.Entry<W, Set<W>> entry : activeWindowToEphemeralWindows.entrySet()) {
- for (W ephemeral : entry.getValue()) {
- windowToActiveWindow.remove(ephemeral);
- }
- }
- activeWindowToEphemeralWindows.clear();
- }
-
- @Override
- public void persist() {
- if (activeWindowToStateAddressWindows.isEmpty()) {
- // Force all persistent state to disappear.
- valueState.clear();
- return;
- }
- if (activeWindowToStateAddressWindows.equals(originalActiveWindowToStateAddressWindows)) {
- // No change.
- return;
- }
- // All NEW windows must have been accounted for.
- for (Map.Entry<W, Set<W>> entry : activeWindowToStateAddressWindows.entrySet()) {
- Preconditions.checkState(
- !entry.getValue().isEmpty(), "Cannot persist NEW window %s", entry.getKey());
- }
- // Should be no EPHEMERAL windows.
- Preconditions.checkState(
- activeWindowToEphemeralWindows.isEmpty(), "Unexpected EPHEMERAL windows before persist");
-
- valueState.write(activeWindowToStateAddressWindows);
- // No need to update originalActiveWindowToStateAddressWindows since this object is about to
- // become garbage.
- }
-
- @Override
- @Nullable
- public W representative(W window) {
- return windowToActiveWindow.get(window);
- }
-
- @Override
- public Set<W> getActiveWindows() {
- return activeWindowToStateAddressWindows.keySet();
- }
-
- @Override
- public boolean isActive(W window) {
- return activeWindowToStateAddressWindows.containsKey(window);
- }
-
- @Override
- public void addNew(W window) {
- if (!windowToActiveWindow.containsKey(window)) {
- activeWindowToStateAddressWindows.put(window, new LinkedHashSet<W>());
- }
- }
-
- @Override
- public void addActive(W window) {
- if (!windowToActiveWindow.containsKey(window)) {
- Set<W> stateAddressWindows = new LinkedHashSet<>();
- stateAddressWindows.add(window);
- activeWindowToStateAddressWindows.put(window, stateAddressWindows);
- windowToActiveWindow.put(window, window);
- }
- }
-
- @Override
- public void remove(W window) {
- Set<W> stateAddressWindows = activeWindowToStateAddressWindows.get(window);
- if (stateAddressWindows == null) {
- // Window is no longer active.
- return;
- }
- for (W stateAddressWindow : stateAddressWindows) {
- windowToActiveWindow.remove(stateAddressWindow);
- }
- activeWindowToStateAddressWindows.remove(window);
- Set<W> ephemeralWindows = activeWindowToEphemeralWindows.get(window);
- if (ephemeralWindows != null) {
- for (W ephemeralWindow : ephemeralWindows) {
- windowToActiveWindow.remove(ephemeralWindow);
- }
- activeWindowToEphemeralWindows.remove(window);
- }
- windowToActiveWindow.remove(window);
- }
-
- private class MergeContextImpl extends WindowFn<Object, W>.MergeContext {
- private MergeCallback<W> mergeCallback;
- private final List<Collection<W>> allToBeMerged;
- private final List<Collection<W>> allActiveToBeMerged;
- private final List<W> allMergeResults;
- private final Set<W> seen;
-
- public MergeContextImpl(MergeCallback<W> mergeCallback) {
- windowFn.super();
- this.mergeCallback = mergeCallback;
- allToBeMerged = new ArrayList<>();
- allActiveToBeMerged = new ArrayList<>();
- allMergeResults = new ArrayList<>();
- seen = new HashSet<>();
- }
-
- @Override
- public Collection<W> windows() {
- return activeWindowToStateAddressWindows.keySet();
- }
-
- @Override
- public void merge(Collection<W> toBeMerged, W mergeResult) throws Exception {
- // The arguments have come from userland.
- Preconditions.checkNotNull(toBeMerged);
- Preconditions.checkNotNull(mergeResult);
- List<W> copyOfToBeMerged = new ArrayList<>(toBeMerged.size());
- List<W> activeToBeMerged = new ArrayList<>(toBeMerged.size());
- boolean includesMergeResult = false;
- for (W window : toBeMerged) {
- Preconditions.checkNotNull(window);
- Preconditions.checkState(
- isActive(window), "Expecting merge window %s to be active", window);
- if (window.equals(mergeResult)) {
- includesMergeResult = true;
- }
- boolean notDup = seen.add(window);
- Preconditions.checkState(
- notDup, "Expecting merge window %s to appear in at most one merge set", window);
- copyOfToBeMerged.add(window);
- if (!activeWindowToStateAddressWindows.get(window).isEmpty()) {
- activeToBeMerged.add(window);
- }
- }
- if (!includesMergeResult) {
- Preconditions.checkState(
- !isActive(mergeResult), "Expecting result window %s to be new", mergeResult);
- }
- allToBeMerged.add(copyOfToBeMerged);
- allActiveToBeMerged.add(activeToBeMerged);
- allMergeResults.add(mergeResult);
- }
-
- public void recordMerges() throws Exception {
- for (int i = 0; i < allToBeMerged.size(); i++) {
- mergeCallback.prefetchOnMerge(
- allToBeMerged.get(i), allActiveToBeMerged.get(i), allMergeResults.get(i));
- }
- for (int i = 0; i < allToBeMerged.size(); i++) {
- mergeCallback.onMerge(
- allToBeMerged.get(i), allActiveToBeMerged.get(i), allMergeResults.get(i));
- recordMerge(allToBeMerged.get(i), allMergeResults.get(i));
- }
- allToBeMerged.clear();
- allActiveToBeMerged.clear();
- allMergeResults.clear();
- seen.clear();
- }
- }
-
- @Override
- public void merge(MergeCallback<W> mergeCallback) throws Exception {
- MergeContextImpl context = new MergeContextImpl(mergeCallback);
-
- // See what the window function does with the NEW and already ACTIVE windows.
- // Entering userland.
- windowFn.mergeWindows(context);
-
- // Actually do the merging and invoke the callbacks.
- context.recordMerges();
-
- // Any remaining NEW windows should become implicitly ACTIVE.
- for (Map.Entry<W, Set<W>> entry : activeWindowToStateAddressWindows.entrySet()) {
- if (entry.getValue().isEmpty()) {
- // This window was NEW but since it survived merging must now become ACTIVE.
- W window = entry.getKey();
- entry.getValue().add(window);
- windowToActiveWindow.put(window, window);
- }
- }
- }
-
- /**
- * A {@link WindowFn#mergeWindows} call has determined that {@code toBeMerged} (which must
- * all be ACTIVE}) should be considered equivalent to {@code activeWindow} (which is either a
- * member of {@code toBeMerged} or is a new window). Make the corresponding change in
- * the active window set.
- */
- private void recordMerge(Collection<W> toBeMerged, W mergeResult) throws Exception {
- Set<W> newStateAddressWindows = new LinkedHashSet<>();
- Set<W> existingStateAddressWindows = activeWindowToStateAddressWindows.get(mergeResult);
- if (existingStateAddressWindows != null) {
- // Preserve all the existing state address windows for mergeResult.
- newStateAddressWindows.addAll(existingStateAddressWindows);
- }
-
- Set<W> newEphemeralWindows = new HashSet<>();
- Set<W> existingEphemeralWindows = activeWindowToEphemeralWindows.get(mergeResult);
- if (existingEphemeralWindows != null) {
- // Preserve all the existing EPHEMERAL windows for meregResult.
- newEphemeralWindows.addAll(existingEphemeralWindows);
- }
-
- for (W other : toBeMerged) {
- Set<W> otherStateAddressWindows = activeWindowToStateAddressWindows.get(other);
- Preconditions.checkState(otherStateAddressWindows != null, "Window %s is not ACTIVE", other);
-
- for (W otherStateAddressWindow : otherStateAddressWindows) {
- // Since otherTarget equiv other AND other equiv mergeResult
- // THEN otherTarget equiv mergeResult.
- newStateAddressWindows.add(otherStateAddressWindow);
- windowToActiveWindow.put(otherStateAddressWindow, mergeResult);
- }
- activeWindowToStateAddressWindows.remove(other);
-
- Set<W> otherEphemeralWindows = activeWindowToEphemeralWindows.get(other);
- if (otherEphemeralWindows != null) {
- for (W otherEphemeral : otherEphemeralWindows) {
- // Since otherEphemeral equiv other AND other equiv mergeResult
- // THEN otherEphemeral equiv mergeResult.
- newEphemeralWindows.add(otherEphemeral);
- windowToActiveWindow.put(otherEphemeral, mergeResult);
- }
- }
- activeWindowToEphemeralWindows.remove(other);
-
- // Now other equiv mergeResult.
- if (otherStateAddressWindows.contains(other)) {
- // Other was ACTIVE and is now known to be MERGED.
- } else if (otherStateAddressWindows.isEmpty()) {
- // Other was NEW thus has no state. It is now EPHEMERAL.
- newEphemeralWindows.add(other);
- } else if (other.equals(mergeResult)) {
- // Other was ACTIVE, was never used to store elements, but is still ACTIVE.
- // Leave it as active.
- } else {
- // Other was ACTIVE, was never used to store element, as is no longer considered ACTIVE.
- // It is now EPHEMERAL.
- newEphemeralWindows.add(other);
- }
- windowToActiveWindow.put(other, mergeResult);
- }
-
- if (newStateAddressWindows.isEmpty()) {
- // If stateAddressWindows is empty then toBeMerged must have only contained EPHEMERAL windows.
- // Promote mergeResult to be active now.
- newStateAddressWindows.add(mergeResult);
- }
- windowToActiveWindow.put(mergeResult, mergeResult);
-
- activeWindowToStateAddressWindows.put(mergeResult, newStateAddressWindows);
- if (!newEphemeralWindows.isEmpty()) {
- activeWindowToEphemeralWindows.put(mergeResult, newEphemeralWindows);
- }
-
- merged(mergeResult);
- }
-
- @Override
- public void merged(W window) {
- Set<W> stateAddressWindows = activeWindowToStateAddressWindows.get(window);
- Preconditions.checkState(stateAddressWindows != null, "Window %s is not ACTIVE", window);
- W first = Iterables.getFirst(stateAddressWindows, null);
- stateAddressWindows.clear();
- stateAddressWindows.add(first);
- }
-
- /**
- * Return the state address windows for ACTIVE {@code window} from which all state associated
- * should
- * be read and merged.
- */
- @Override
- public Set<W> readStateAddresses(W window) {
- Set<W> stateAddressWindows = activeWindowToStateAddressWindows.get(window);
- Preconditions.checkState(stateAddressWindows != null, "Window %s is not ACTIVE", window);
- return stateAddressWindows;
- }
-
- /**
- * Return the state address window of ACTIVE {@code window} into which all new state should be
- * written.
- */
- @Override
- public W writeStateAddress(W window) {
- Set<W> stateAddressWindows = activeWindowToStateAddressWindows.get(window);
- Preconditions.checkState(stateAddressWindows != null, "Window %s is not ACTIVE", window);
- W result = Iterables.getFirst(stateAddressWindows, null);
- Preconditions.checkState(result != null, "Window %s is still NEW", window);
- return result;
- }
-
- @Override
- public W mergedWriteStateAddress(Collection<W> toBeMerged, W mergeResult) {
- Set<W> stateAddressWindows = activeWindowToStateAddressWindows.get(mergeResult);
- if (stateAddressWindows != null && !stateAddressWindows.isEmpty()) {
- return Iterables.getFirst(stateAddressWindows, null);
- }
- for (W mergedWindow : toBeMerged) {
- stateAddressWindows = activeWindowToStateAddressWindows.get(mergedWindow);
- if (stateAddressWindows != null && !stateAddressWindows.isEmpty()) {
- return Iterables.getFirst(stateAddressWindows, null);
- }
- }
- return mergeResult;
- }
-
- @VisibleForTesting
- public void checkInvariants() {
- Set<W> knownStateAddressWindows = new HashSet<>();
- for (Map.Entry<W, Set<W>> entry : activeWindowToStateAddressWindows.entrySet()) {
- W active = entry.getKey();
- Preconditions.checkState(!entry.getValue().isEmpty(),
- "Unexpected empty state address window set for ACTIVE window %s", active);
- for (W stateAddressWindow : entry.getValue()) {
- Preconditions.checkState(knownStateAddressWindows.add(stateAddressWindow),
- "%s is in more than one state address window set", stateAddressWindow);
- Preconditions.checkState(active.equals(windowToActiveWindow.get(stateAddressWindow)),
- "%s should have %s as its ACTIVE window", stateAddressWindow, active);
- }
- }
- for (Map.Entry<W, Set<W>> entry : activeWindowToEphemeralWindows.entrySet()) {
- W active = entry.getKey();
- Preconditions.checkState(activeWindowToStateAddressWindows.containsKey(active),
- "%s must be ACTIVE window", active);
- Preconditions.checkState(
- !entry.getValue().isEmpty(), "Unexpected empty EPHEMERAL set for %s", active);
- for (W ephemeralWindow : entry.getValue()) {
- Preconditions.checkState(knownStateAddressWindows.add(ephemeralWindow),
- "%s is EPHEMERAL/state address of more than one ACTIVE window", ephemeralWindow);
- Preconditions.checkState(active.equals(windowToActiveWindow.get(ephemeralWindow)),
- "%s should have %s as its ACTIVE window", ephemeralWindow, active);
- }
- }
- for (Map.Entry<W, W> entry : windowToActiveWindow.entrySet()) {
- Preconditions.checkState(activeWindowToStateAddressWindows.containsKey(entry.getValue()),
- "%s should be ACTIVE since representative for %s", entry.getValue(), entry.getKey());
- }
- }
-
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append("MergingActiveWindowSet {\n");
- for (Map.Entry<W, Set<W>> entry : activeWindowToStateAddressWindows.entrySet()) {
- W active = entry.getKey();
- Set<W> stateAddressWindows = entry.getValue();
- if (stateAddressWindows.isEmpty()) {
- sb.append(" NEW ");
- sb.append(active);
- sb.append('\n');
- } else {
- sb.append(" ACTIVE ");
- sb.append(active);
- sb.append(":\n");
- for (W stateAddressWindow : stateAddressWindows) {
- if (stateAddressWindow.equals(active)) {
- sb.append(" ACTIVE ");
- } else {
- sb.append(" MERGED ");
- }
- sb.append(stateAddressWindow);
- sb.append("\n");
- W active2 = windowToActiveWindow.get(stateAddressWindow);
- Preconditions.checkState(active2.equals(active));
- }
- Set<W> ephemeralWindows = activeWindowToEphemeralWindows.get(active);
- if (ephemeralWindows != null) {
- for (W ephemeralWindow : ephemeralWindows) {
- sb.append(" EPHEMERAL ");
- sb.append(ephemeralWindow);
- sb.append('\n');
- }
- }
- }
- }
- sb.append("}");
- return sb.toString();
- }
-
- // ======================================================================
-
- /**
- * Replace null {@code multimap} with empty map, and replace null entries in {@code multimap} with
- * empty sets.
- */
- private static <W> Map<W, Set<W>> emptyIfNull(@Nullable Map<W, Set<W>> multimap) {
- if (multimap == null) {
- return new HashMap<>();
- } else {
- for (Map.Entry<W, Set<W>> entry : multimap.entrySet()) {
- if (entry.getValue() == null) {
- entry.setValue(new LinkedHashSet<W>());
- }
- }
- return multimap;
- }
- }
-
- /** Return a deep copy of {@code multimap}. */
- private static <W> Map<W, Set<W>> deepCopy(Map<W, Set<W>> multimap) {
- Map<W, Set<W>> newMultimap = new HashMap<>();
- for (Map.Entry<W, Set<W>> entry : multimap.entrySet()) {
- newMultimap.put(entry.getKey(), new LinkedHashSet<>(entry.getValue()));
- }
- return newMultimap;
- }
-
- /** Return inversion of {@code multimap}, which must be invertible. */
- private static <W> Map<W, W> invert(Map<W, Set<W>> multimap) {
- Map<W, W> result = new HashMap<>();
- for (Map.Entry<W, Set<W>> entry : multimap.entrySet()) {
- W active = entry.getKey();
- for (W target : entry.getValue()) {
- W previous = result.put(target, active);
- Preconditions.checkState(previous == null,
- "Window %s has both %s and %s as representatives", target, previous, active);
- }
- }
- return result;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MimeTypes.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MimeTypes.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MimeTypes.java
deleted file mode 100644
index 489d183..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MimeTypes.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-/** Constants representing various mime types. */
-public class MimeTypes {
- public static final String TEXT = "text/plain";
- public static final String BINARY = "application/octet-stream";
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MonitoringUtil.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MonitoringUtil.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MonitoringUtil.java
deleted file mode 100644
index d450187..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MonitoringUtil.java
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import static com.google.cloud.dataflow.sdk.util.TimeUtil.fromCloudTime;
-
-import com.google.api.services.dataflow.Dataflow;
-import com.google.api.services.dataflow.Dataflow.Projects.Jobs.Messages;
-import com.google.api.services.dataflow.model.JobMessage;
-import com.google.api.services.dataflow.model.ListJobMessagesResponse;
-import com.google.cloud.dataflow.sdk.PipelineResult.State;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.common.base.MoreObjects;
-import com.google.common.collect.ImmutableMap;
-
-import org.joda.time.Instant;
-
-import java.io.IOException;
-import java.io.PrintStream;
-import java.io.UnsupportedEncodingException;
-import java.net.URLEncoder;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-import java.util.Map;
-
-import javax.annotation.Nullable;
-
-/**
- * A helper class for monitoring jobs submitted to the service.
- */
-public final class MonitoringUtil {
-
- private static final String GCLOUD_DATAFLOW_PREFIX = "gcloud alpha dataflow";
- private static final String ENDPOINT_OVERRIDE_ENV_VAR =
- "CLOUDSDK_API_ENDPOINT_OVERRIDES_DATAFLOW";
-
- private static final Map<String, State> DATAFLOW_STATE_TO_JOB_STATE =
- ImmutableMap
- .<String, State>builder()
- .put("JOB_STATE_UNKNOWN", State.UNKNOWN)
- .put("JOB_STATE_STOPPED", State.STOPPED)
- .put("JOB_STATE_RUNNING", State.RUNNING)
- .put("JOB_STATE_DONE", State.DONE)
- .put("JOB_STATE_FAILED", State.FAILED)
- .put("JOB_STATE_CANCELLED", State.CANCELLED)
- .put("JOB_STATE_UPDATED", State.UPDATED)
- .build();
-
- private String projectId;
- private Messages messagesClient;
-
- /**
- * An interface that can be used for defining callbacks to receive a list
- * of JobMessages containing monitoring information.
- */
- public interface JobMessagesHandler {
- /** Process the rows. */
- void process(List<JobMessage> messages);
- }
-
- /** A handler that prints monitoring messages to a stream. */
- public static class PrintHandler implements JobMessagesHandler {
- private PrintStream out;
-
- /**
- * Construct the handler.
- *
- * @param stream The stream to write the messages to.
- */
- public PrintHandler(PrintStream stream) {
- out = stream;
- }
-
- @Override
- public void process(List<JobMessage> messages) {
- for (JobMessage message : messages) {
- if (message.getMessageText() == null || message.getMessageText().isEmpty()) {
- continue;
- }
- String importanceString = null;
- if (message.getMessageImportance() == null) {
- continue;
- } else if (message.getMessageImportance().equals("JOB_MESSAGE_ERROR")) {
- importanceString = "Error: ";
- } else if (message.getMessageImportance().equals("JOB_MESSAGE_WARNING")) {
- importanceString = "Warning: ";
- } else if (message.getMessageImportance().equals("JOB_MESSAGE_BASIC")) {
- importanceString = "Basic: ";
- } else if (message.getMessageImportance().equals("JOB_MESSAGE_DETAILED")) {
- importanceString = "Detail: ";
- } else {
- // TODO: Remove filtering here once getJobMessages supports minimum
- // importance.
- continue;
- }
- @Nullable Instant time = TimeUtil.fromCloudTime(message.getTime());
- if (time == null) {
- out.print("UNKNOWN TIMESTAMP: ");
- } else {
- out.print(time + ": ");
- }
- if (importanceString != null) {
- out.print(importanceString);
- }
- out.println(message.getMessageText());
- }
- out.flush();
- }
- }
-
- /** Construct a helper for monitoring. */
- public MonitoringUtil(String projectId, Dataflow dataflow) {
- this(projectId, dataflow.projects().jobs().messages());
- }
-
- // @VisibleForTesting
- MonitoringUtil(String projectId, Messages messagesClient) {
- this.projectId = projectId;
- this.messagesClient = messagesClient;
- }
-
- /**
- * Comparator for sorting rows in increasing order based on timestamp.
- */
- public static class TimeStampComparator implements Comparator<JobMessage> {
- @Override
- public int compare(JobMessage o1, JobMessage o2) {
- @Nullable Instant t1 = fromCloudTime(o1.getTime());
- if (t1 == null) {
- return -1;
- }
- @Nullable Instant t2 = fromCloudTime(o2.getTime());
- if (t2 == null) {
- return 1;
- }
- return t1.compareTo(t2);
- }
- }
-
- /**
- * Return job messages sorted in ascending order by timestamp.
- * @param jobId The id of the job to get the messages for.
- * @param startTimestampMs Return only those messages with a
- * timestamp greater than this value.
- * @return collection of messages
- * @throws IOException
- */
- public ArrayList<JobMessage> getJobMessages(
- String jobId, long startTimestampMs) throws IOException {
- // TODO: Allow filtering messages by importance
- Instant startTimestamp = new Instant(startTimestampMs);
- ArrayList<JobMessage> allMessages = new ArrayList<>();
- String pageToken = null;
- while (true) {
- Messages.List listRequest = messagesClient.list(projectId, jobId);
- if (pageToken != null) {
- listRequest.setPageToken(pageToken);
- }
- ListJobMessagesResponse response = listRequest.execute();
-
- if (response == null || response.getJobMessages() == null) {
- return allMessages;
- }
-
- for (JobMessage m : response.getJobMessages()) {
- @Nullable Instant timestamp = fromCloudTime(m.getTime());
- if (timestamp == null) {
- continue;
- }
- if (timestamp.isAfter(startTimestamp)) {
- allMessages.add(m);
- }
- }
-
- if (response.getNextPageToken() == null) {
- break;
- } else {
- pageToken = response.getNextPageToken();
- }
- }
-
- Collections.sort(allMessages, new TimeStampComparator());
- return allMessages;
- }
-
- public static String getJobMonitoringPageURL(String projectName, String jobId) {
- try {
- // Project name is allowed in place of the project id: the user will be redirected to a URL
- // that has the project name replaced with project id.
- return String.format(
- "https://console.developers.google.com/project/%s/dataflow/job/%s",
- URLEncoder.encode(projectName, "UTF-8"),
- URLEncoder.encode(jobId, "UTF-8"));
- } catch (UnsupportedEncodingException e) {
- // Should never happen.
- throw new AssertionError("UTF-8 encoding is not supported by the environment", e);
- }
- }
-
- public static String getGcloudCancelCommand(DataflowPipelineOptions options, String jobId) {
-
- // If using a different Dataflow API than default, prefix command with an API override.
- String dataflowApiOverridePrefix = "";
- String apiUrl = options.getDataflowClient().getBaseUrl();
- if (!apiUrl.equals(Dataflow.DEFAULT_BASE_URL)) {
- dataflowApiOverridePrefix = String.format("%s=%s ", ENDPOINT_OVERRIDE_ENV_VAR, apiUrl);
- }
-
- // Assemble cancel command from optional prefix and project/job parameters.
- return String.format("%s%s jobs --project=%s cancel %s",
- dataflowApiOverridePrefix, GCLOUD_DATAFLOW_PREFIX, options.getProject(), jobId);
- }
-
- public static State toState(String stateName) {
- return MoreObjects.firstNonNull(DATAFLOW_STATE_TO_JOB_STATE.get(stateName),
- State.UNKNOWN);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MutationDetector.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MutationDetector.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MutationDetector.java
deleted file mode 100644
index 51e65ab..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MutationDetector.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-/**
- * An object for detecting illegal mutations.
- *
- * <p>The {@link AutoCloseable} aspect of this interface allows use in a try-with-resources
- * style, where the implementing class may choose to perform a final mutation check upon
- * {@link #close()}.
- */
-public interface MutationDetector extends AutoCloseable {
- /**
- * @throws IllegalMutationException if illegal mutations are detected.
- */
- void verifyUnmodified();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MutationDetectors.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MutationDetectors.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MutationDetectors.java
deleted file mode 100644
index 412e3eb..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MutationDetectors.java
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.common.base.Throwables;
-
-import java.util.Arrays;
-import java.util.Objects;
-
-/**
- * Static methods for creating and working with {@link MutationDetector}.
- */
-public class MutationDetectors {
-
- private MutationDetectors() {}
-
- /**
- * Creates a new {@code MutationDetector} for the provided {@code value} that uses the provided
- * {@link Coder} to perform deep copies and comparisons by serializing and deserializing values.
- *
- * <p>It is permissible for {@code value} to be {@code null}. Since {@code null} is immutable,
- * the mutation check will always succeed.
- */
- public static <T> MutationDetector forValueWithCoder(T value, Coder<T> coder)
- throws CoderException {
- if (value == null) {
- return noopMutationDetector();
- } else {
- return new CodedValueMutationDetector<>(value, coder);
- }
- }
-
- /**
- * Creates a new {@code MutationDetector} that always succeeds.
- *
- * <p>This is useful, for example, for providing a very efficient mutation detector for a value
- * which is already immutable by design.
- */
- public static MutationDetector noopMutationDetector() {
- return new NoopMutationDetector();
- }
-
- /**
- * A {@link MutationDetector} for {@code null}, which is immutable.
- */
- private static class NoopMutationDetector implements MutationDetector {
-
- @Override
- public void verifyUnmodified() { }
-
- @Override
- public void close() { }
- }
-
- /**
- * Given a value of type {@code T} and a {@link Coder} for that type, provides facilities to save
- * check that the value has not changed.
- *
- * @param <T> the type of values checked for mutation
- */
- private static class CodedValueMutationDetector<T> implements MutationDetector {
-
- private final Coder<T> coder;
-
- /**
- * A saved pointer to an in-memory value provided upon construction, which we will check for
- * forbidden mutations.
- */
- private final T possiblyModifiedObject;
-
- /**
- * A saved encoded copy of the same value as {@link #possiblyModifiedObject}. Naturally, it
- * will not change if {@link #possiblyModifiedObject} is mutated.
- */
- private final byte[] encodedOriginalObject;
-
- /**
- * The object decoded from {@link #encodedOriginalObject}. It will be used during every call to
- * {@link #verifyUnmodified}, which could be called many times throughout the lifetime of this
- * {@link CodedValueMutationDetector}.
- */
- private final T clonedOriginalObject;
-
- /**
- * Create a mutation detector for the provided {@code value}, using the provided {@link Coder}
- * for cloning and checking serialized forms for equality.
- */
- public CodedValueMutationDetector(T value, Coder<T> coder) throws CoderException {
- this.coder = coder;
- this.possiblyModifiedObject = value;
- this.encodedOriginalObject = CoderUtils.encodeToByteArray(coder, value);
- this.clonedOriginalObject = CoderUtils.decodeFromByteArray(coder, encodedOriginalObject);
- }
-
- @Override
- public void verifyUnmodified() {
- try {
- verifyUnmodifiedThrowingCheckedExceptions();
- } catch (CoderException exn) {
- Throwables.propagate(exn);
- }
- }
-
- private void verifyUnmodifiedThrowingCheckedExceptions() throws CoderException {
- // If either object believes they are equal, we trust that and short-circuit deeper checks.
- if (Objects.equals(possiblyModifiedObject, clonedOriginalObject)
- || Objects.equals(clonedOriginalObject, possiblyModifiedObject)) {
- return;
- }
-
- // Since retainedObject is in general an instance of a subclass of T, when it is cloned to
- // clonedObject using a Coder<T>, the two will generally be equivalent viewed as a T, but in
- // general neither retainedObject.equals(clonedObject) nor clonedObject.equals(retainedObject)
- // will hold.
- //
- // For example, CoderUtils.clone(IterableCoder<Integer>, IterableSubclass<Integer>) will
- // produce an ArrayList<Integer> with the same contents as the IterableSubclass, but the
- // latter will quite reasonably not consider itself equivalent to an ArrayList (and vice
- // versa).
- //
- // To enable a reasonable comparison, we clone retainedObject again here, converting it to
- // the same sort of T that the Coder<T> output when it created clonedObject.
- T clonedPossiblyModifiedObject = CoderUtils.clone(coder, possiblyModifiedObject);
-
- // If deepEquals() then we trust the equals implementation.
- // This deliberately allows fields to escape this check.
- if (Objects.deepEquals(clonedPossiblyModifiedObject, clonedOriginalObject)) {
- return;
- }
-
- // If not deepEquals(), the class may just have a poor equals() implementation.
- // So we next try checking their serialized forms. We re-serialize instead of checking
- // encodedObject, because the Coder may treat it differently.
- //
- // For example, an unbounded Iterable will be encoded in an unbounded way, but decoded into an
- // ArrayList, which will then be re-encoded in a bounded format. So we really do need to
- // encode-decode-encode retainedObject.
- if (Arrays.equals(
- CoderUtils.encodeToByteArray(coder, clonedOriginalObject),
- CoderUtils.encodeToByteArray(coder, clonedPossiblyModifiedObject))) {
- return;
- }
-
- // If we got here, then they are not deepEquals() and do not have deepEquals() encodings.
- // Even if there is some conceptual sense in which the objects are equivalent, it has not
- // been adequately expressed in code.
- illegalMutation(clonedOriginalObject, clonedPossiblyModifiedObject);
- }
-
- private void illegalMutation(T previousValue, T newValue) throws CoderException {
- throw new IllegalMutationException(
- String.format("Value %s mutated illegally, new value was %s."
- + " Encoding was %s, now %s.",
- previousValue, newValue,
- CoderUtils.encodeToBase64(coder, previousValue),
- CoderUtils.encodeToBase64(coder, newValue)),
- previousValue, newValue);
- }
-
- @Override
- public void close() {
- verifyUnmodified();
- }
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NonEmptyPanes.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NonEmptyPanes.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NonEmptyPanes.java
deleted file mode 100644
index 1270f01..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NonEmptyPanes.java
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.coders.VarLongCoder;
-import com.google.cloud.dataflow.sdk.transforms.Sum;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy.AccumulationMode;
-import com.google.cloud.dataflow.sdk.util.state.AccumulatorCombiningState;
-import com.google.cloud.dataflow.sdk.util.state.MergingStateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.ReadableState;
-import com.google.cloud.dataflow.sdk.util.state.StateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.StateMerging;
-import com.google.cloud.dataflow.sdk.util.state.StateTag;
-import com.google.cloud.dataflow.sdk.util.state.StateTags;
-
-/**
- * Tracks which windows have non-empty panes. Specifically, which windows have new elements since
- * their last triggering.
- *
- * @param <W> The kind of windows being tracked.
- */
-public abstract class NonEmptyPanes<K, W extends BoundedWindow> {
-
- static <K, W extends BoundedWindow> NonEmptyPanes<K, W> create(
- WindowingStrategy<?, W> strategy, ReduceFn<K, ?, ?, W> reduceFn) {
- if (strategy.getMode() == AccumulationMode.DISCARDING_FIRED_PANES) {
- return new DiscardingModeNonEmptyPanes<>(reduceFn);
- } else {
- return new GeneralNonEmptyPanes<>();
- }
- }
-
- /**
- * Record that some content has been added to the window in {@code context}, and therefore the
- * current pane is not empty.
- */
- public abstract void recordContent(StateAccessor<K> context);
-
- /**
- * Record that the given pane is empty.
- */
- public abstract void clearPane(StateAccessor<K> state);
-
- /**
- * Return true if the current pane for the window in {@code context} is empty.
- */
- public abstract ReadableState<Boolean> isEmpty(StateAccessor<K> context);
-
- /**
- * Prefetch in preparation for merging.
- */
- public abstract void prefetchOnMerge(MergingStateAccessor<K, W> state);
-
- /**
- * Eagerly merge backing state.
- */
- public abstract void onMerge(MergingStateAccessor<K, W> context);
-
- /**
- * An implementation of {@code NonEmptyPanes} optimized for use with discarding mode. Uses the
- * presence of data in the accumulation buffer to record non-empty panes.
- */
- private static class DiscardingModeNonEmptyPanes<K, W extends BoundedWindow>
- extends NonEmptyPanes<K, W> {
-
- private ReduceFn<K, ?, ?, W> reduceFn;
-
- private DiscardingModeNonEmptyPanes(ReduceFn<K, ?, ?, W> reduceFn) {
- this.reduceFn = reduceFn;
- }
-
- @Override
- public ReadableState<Boolean> isEmpty(StateAccessor<K> state) {
- return reduceFn.isEmpty(state);
- }
-
- @Override
- public void recordContent(StateAccessor<K> state) {
- // Nothing to do -- the reduceFn is tracking contents
- }
-
- @Override
- public void clearPane(StateAccessor<K> state) {
- // Nothing to do -- the reduceFn is tracking contents
- }
-
- @Override
- public void prefetchOnMerge(MergingStateAccessor<K, W> state) {
- // Nothing to do -- the reduceFn is tracking contents
- }
-
- @Override
- public void onMerge(MergingStateAccessor<K, W> context) {
- // Nothing to do -- the reduceFn is tracking contents
- }
- }
-
- /**
- * An implementation of {@code NonEmptyPanes} for general use.
- */
- private static class GeneralNonEmptyPanes<K, W extends BoundedWindow>
- extends NonEmptyPanes<K, W> {
-
- private static final StateTag<Object, AccumulatorCombiningState<Long, long[], Long>>
- PANE_ADDITIONS_TAG =
- StateTags.makeSystemTagInternal(StateTags.combiningValueFromInputInternal(
- "count", VarLongCoder.of(), new Sum.SumLongFn()));
-
- @Override
- public void recordContent(StateAccessor<K> state) {
- state.access(PANE_ADDITIONS_TAG).add(1L);
- }
-
- @Override
- public void clearPane(StateAccessor<K> state) {
- state.access(PANE_ADDITIONS_TAG).clear();
- }
-
- @Override
- public ReadableState<Boolean> isEmpty(StateAccessor<K> state) {
- return state.access(PANE_ADDITIONS_TAG).isEmpty();
- }
-
- @Override
- public void prefetchOnMerge(MergingStateAccessor<K, W> state) {
- StateMerging.prefetchCombiningValues(state, PANE_ADDITIONS_TAG);
- }
-
- @Override
- public void onMerge(MergingStateAccessor<K, W> context) {
- StateMerging.mergeCombiningValues(context, PANE_ADDITIONS_TAG);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NonMergingActiveWindowSet.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NonMergingActiveWindowSet.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NonMergingActiveWindowSet.java
deleted file mode 100644
index cb7f9b0..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NonMergingActiveWindowSet.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
-import com.google.common.collect.ImmutableSet;
-
-import java.util.Collection;
-import java.util.Set;
-
-/**
- * Implementation of {@link ActiveWindowSet} used with {@link WindowFn WindowFns} that don't support
- * merging.
- *
- * @param <W> the types of windows being managed
- */
-public class NonMergingActiveWindowSet<W extends BoundedWindow> implements ActiveWindowSet<W> {
- @Override
- public void removeEphemeralWindows() {}
-
- @Override
- public void persist() {}
-
- @Override
- public W representative(W window) {
- // Always represented by itself.
- return window;
- }
-
- @Override
- public Set<W> getActiveWindows() {
- // Only supported when merging.
- throw new java.lang.UnsupportedOperationException();
- }
-
- @Override
- public boolean isActive(W window) {
- // Windows should never disappear, since we don't support merging.
- return true;
- }
-
- @Override
- public void addNew(W window) {}
-
- @Override
- public void addActive(W window) {}
-
- @Override
- public void remove(W window) {}
-
- @Override
- public void merge(MergeCallback<W> mergeCallback) throws Exception {}
-
- @Override
- public void merged(W window) {}
-
- @Override
- public Set<W> readStateAddresses(W window) {
- return ImmutableSet.of(window);
- }
-
- @Override
- public W writeStateAddress(W window) {
- return window;
- }
-
- @Override
- public W mergedWriteStateAddress(Collection<W> toBeMerged, W mergeResult) {
- return mergeResult;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NoopCredentialFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NoopCredentialFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NoopCredentialFactory.java
deleted file mode 100644
index 9ef4c2e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NoopCredentialFactory.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.client.auth.oauth2.Credential;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-
-import java.io.IOException;
-import java.security.GeneralSecurityException;
-
-/**
- * Construct an oauth credential to be used by the SDK and the SDK workers.
- * Always returns a null Credential object.
- */
-public class NoopCredentialFactory implements CredentialFactory {
- public static NoopCredentialFactory fromOptions(PipelineOptions options) {
- return new NoopCredentialFactory();
- }
-
- @Override
- public Credential getCredential() throws IOException, GeneralSecurityException {
- return null;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NoopPathValidator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NoopPathValidator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NoopPathValidator.java
deleted file mode 100644
index 00abbb1..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NoopPathValidator.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-
-/**
- * Noop implementation of {@link PathValidator}. All paths are allowed and returned unchanged.
- */
-public class NoopPathValidator implements PathValidator {
-
- private NoopPathValidator() {
- }
-
- public static PathValidator fromOptions(
- @SuppressWarnings("unused") PipelineOptions options) {
- return new NoopPathValidator();
- }
-
- @Override
- public String validateInputFilePatternSupported(String filepattern) {
- return filepattern;
- }
-
- @Override
- public String validateOutputFilePrefixSupported(String filePrefix) {
- return filePrefix;
- }
-
- @Override
- public String verifyPath(String path) {
- return path;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NullSideInputReader.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NullSideInputReader.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NullSideInputReader.java
deleted file mode 100644
index 0fc2646..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/NullSideInputReader.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.common.collect.Sets;
-
-import java.util.Collections;
-import java.util.Set;
-
-/**
- * A {@link SideInputReader} representing a well-defined set of views, but not storing
- * any values for them. Used to check if a side input is present when the data itself
- * comes from elsewhere.
- */
-public class NullSideInputReader implements SideInputReader {
-
- private Set<PCollectionView<?>> views;
-
- public static NullSideInputReader empty() {
- return new NullSideInputReader(Collections.<PCollectionView<?>>emptySet());
- }
-
- public static NullSideInputReader of(Iterable<? extends PCollectionView<?>> views) {
- return new NullSideInputReader(views);
- }
-
- private NullSideInputReader(Iterable<? extends PCollectionView<?>> views) {
- this.views = Sets.newHashSet(views);
- }
-
- @Override
- public <T> T get(PCollectionView<T> view, BoundedWindow window) {
- throw new IllegalArgumentException("cannot call NullSideInputReader.get()");
- }
-
- @Override
- public boolean isEmpty() {
- return views.isEmpty();
- }
-
- @Override
- public <T> boolean contains(PCollectionView<T> view) {
- return views.contains(view);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/OutputReference.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/OutputReference.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/OutputReference.java
deleted file mode 100644
index 096c996..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/OutputReference.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import static com.google.api.client.util.Preconditions.checkNotNull;
-
-import com.google.api.client.json.GenericJson;
-import com.google.api.client.util.Key;
-
-/**
- * A representation used by {@link com.google.api.services.dataflow.model.Step}s
- * to reference the output of other {@code Step}s.
- */
-public final class OutputReference extends GenericJson {
- @Key("@type")
- public final String type = "OutputReference";
-
- @Key("step_name")
- private final String stepName;
-
- @Key("output_name")
- private final String outputName;
-
- public OutputReference(String stepName, String outputName) {
- this.stepName = checkNotNull(stepName);
- this.outputName = checkNotNull(outputName);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PCollectionViewWindow.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PCollectionViewWindow.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PCollectionViewWindow.java
deleted file mode 100644
index 7cf636e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PCollectionViewWindow.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-
-import java.util.Objects;
-
-/**
- * A pair of a {@link PCollectionView} and a {@link BoundedWindow}, which can
- * be thought of as window "of" the view. This is a value class for use e.g.
- * as a compound cache key.
- *
- * @param <T> the type of the underlying PCollectionView
- */
-public final class PCollectionViewWindow<T> {
-
- private final PCollectionView<T> view;
- private final BoundedWindow window;
-
- private PCollectionViewWindow(PCollectionView<T> view, BoundedWindow window) {
- this.view = view;
- this.window = window;
- }
-
- public static <T> PCollectionViewWindow<T> of(PCollectionView<T> view, BoundedWindow window) {
- return new PCollectionViewWindow<>(view, window);
- }
-
- public PCollectionView<T> getView() {
- return view;
- }
-
- public BoundedWindow getWindow() {
- return window;
- }
-
- @Override
- public boolean equals(Object otherObject) {
- if (!(otherObject instanceof PCollectionViewWindow)) {
- return false;
- }
- @SuppressWarnings("unchecked")
- PCollectionViewWindow<T> other = (PCollectionViewWindow<T>) otherObject;
- return getView().equals(other.getView()) && getWindow().equals(other.getWindow());
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(getView(), getWindow());
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PCollectionViews.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PCollectionViews.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PCollectionViews.java
deleted file mode 100644
index 7e73547..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PCollectionViews.java
+++ /dev/null
@@ -1,426 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.IterableCoder;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.InvalidWindows;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.PValueBase;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.common.base.Function;
-import com.google.common.base.MoreObjects;
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Multimap;
-
-import java.io.IOException;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.NoSuchElementException;
-import java.util.Objects;
-
-import javax.annotation.Nullable;
-
-/**
- * Implementations of {@link PCollectionView} shared across the SDK.
- *
- * <p>For internal use only, subject to change.
- */
-public class PCollectionViews {
-
- /**
- * Returns a {@code PCollectionView<T>} capable of processing elements encoded using the provided
- * {@link Coder} and windowed using the provided * {@link WindowingStrategy}.
- *
- * <p>If {@code hasDefault} is {@code true}, then the view will take on the value
- * {@code defaultValue} for any empty windows.
- */
- public static <T, W extends BoundedWindow> PCollectionView<T> singletonView(
- Pipeline pipeline,
- WindowingStrategy<?, W> windowingStrategy,
- boolean hasDefault,
- T defaultValue,
- Coder<T> valueCoder) {
- return new SingletonPCollectionView<>(
- pipeline, windowingStrategy, hasDefault, defaultValue, valueCoder);
- }
-
- /**
- * Returns a {@code PCollectionView<Iterable<T>>} capable of processing elements encoded using the
- * provided {@link Coder} and windowed using the provided {@link WindowingStrategy}.
- */
- public static <T, W extends BoundedWindow> PCollectionView<Iterable<T>> iterableView(
- Pipeline pipeline,
- WindowingStrategy<?, W> windowingStrategy,
- Coder<T> valueCoder) {
- return new IterablePCollectionView<>(pipeline, windowingStrategy, valueCoder);
- }
-
- /**
- * Returns a {@code PCollectionView<List<T>>} capable of processing elements encoded using the
- * provided {@link Coder} and windowed using the provided {@link WindowingStrategy}.
- */
- public static <T, W extends BoundedWindow> PCollectionView<List<T>> listView(
- Pipeline pipeline,
- WindowingStrategy<?, W> windowingStrategy,
- Coder<T> valueCoder) {
- return new ListPCollectionView<>(pipeline, windowingStrategy, valueCoder);
- }
-
- /**
- * Returns a {@code PCollectionView<Map<K, V>>} capable of processing elements encoded using the
- * provided {@link Coder} and windowed using the provided {@link WindowingStrategy}.
- */
- public static <K, V, W extends BoundedWindow> PCollectionView<Map<K, V>> mapView(
- Pipeline pipeline,
- WindowingStrategy<?, W> windowingStrategy,
- Coder<KV<K, V>> valueCoder) {
-
- return new MapPCollectionView<K, V, W>(pipeline, windowingStrategy, valueCoder);
- }
-
- /**
- * Returns a {@code PCollectionView<Map<K, Iterable<V>>>} capable of processing elements encoded
- * using the provided {@link Coder} and windowed using the provided {@link WindowingStrategy}.
- */
- public static <K, V, W extends BoundedWindow> PCollectionView<Map<K, Iterable<V>>> multimapView(
- Pipeline pipeline,
- WindowingStrategy<?, W> windowingStrategy,
- Coder<KV<K, V>> valueCoder) {
- return new MultimapPCollectionView<K, V, W>(pipeline, windowingStrategy, valueCoder);
- }
-
- /**
- * Implementation of conversion of singleton {@code Iterable<WindowedValue<T>>} to {@code T}.
- *
- * <p>For internal use only.
- *
- * <p>Instantiate via {@link PCollectionViews#singletonView}.
- */
- public static class SingletonPCollectionView<T, W extends BoundedWindow>
- extends PCollectionViewBase<T, T, W> {
- @Nullable private byte[] encodedDefaultValue;
- @Nullable private transient T defaultValue;
- @Nullable private Coder<T> valueCoder;
- private boolean hasDefault;
-
- private SingletonPCollectionView(
- Pipeline pipeline, WindowingStrategy<?, W> windowingStrategy,
- boolean hasDefault, T defaultValue, Coder<T> valueCoder) {
- super(pipeline, windowingStrategy, valueCoder);
- this.hasDefault = hasDefault;
- this.defaultValue = defaultValue;
- this.valueCoder = valueCoder;
- if (hasDefault) {
- try {
- this.encodedDefaultValue = CoderUtils.encodeToByteArray(valueCoder, defaultValue);
- } catch (IOException e) {
- throw new RuntimeException("Unexpected IOException: ", e);
- }
- }
- }
-
- /**
- * Returns the default value that was specified.
- *
- * <p>For internal use only.
- *
- * @throws NoSuchElementException if no default was specified.
- */
- public T getDefaultValue() {
- if (!hasDefault) {
- throw new NoSuchElementException("Empty PCollection accessed as a singleton view.");
- }
- // Lazily decode the default value once
- synchronized (this) {
- if (encodedDefaultValue != null) {
- try {
- defaultValue = CoderUtils.decodeFromByteArray(valueCoder, encodedDefaultValue);
- encodedDefaultValue = null;
- } catch (IOException e) {
- throw new RuntimeException("Unexpected IOException: ", e);
- }
- }
- }
- return defaultValue;
- }
-
- @Override
- protected T fromElements(Iterable<WindowedValue<T>> contents) {
- try {
- return Iterables.getOnlyElement(contents).getValue();
- } catch (NoSuchElementException exc) {
- return getDefaultValue();
- } catch (IllegalArgumentException exc) {
- throw new IllegalArgumentException(
- "PCollection with more than one element "
- + "accessed as a singleton view.");
- }
- }
- }
-
- /**
- * Implementation of conversion {@code Iterable<WindowedValue<T>>} to {@code Iterable<T>}.
- *
- * <p>For internal use only.
- *
- * <p>Instantiate via {@link PCollectionViews#iterableView}.
- */
- public static class IterablePCollectionView<T, W extends BoundedWindow>
- extends PCollectionViewBase<T, Iterable<T>, W> {
- private IterablePCollectionView(
- Pipeline pipeline, WindowingStrategy<?, W> windowingStrategy, Coder<T> valueCoder) {
- super(pipeline, windowingStrategy, valueCoder);
- }
-
- @Override
- protected Iterable<T> fromElements(Iterable<WindowedValue<T>> contents) {
- return Iterables.unmodifiableIterable(
- Iterables.transform(contents, new Function<WindowedValue<T>, T>() {
- @SuppressWarnings("unchecked")
- @Override
- public T apply(WindowedValue<T> input) {
- return input.getValue();
- }
- }));
- }
- }
-
- /**
- * Implementation of conversion {@code Iterable<WindowedValue<T>>} to {@code List<T>}.
- *
- * <p>For internal use only.
- *
- * <p>Instantiate via {@link PCollectionViews#listView}.
- */
- public static class ListPCollectionView<T, W extends BoundedWindow>
- extends PCollectionViewBase<T, List<T>, W> {
- private ListPCollectionView(
- Pipeline pipeline, WindowingStrategy<?, W> windowingStrategy, Coder<T> valueCoder) {
- super(pipeline, windowingStrategy, valueCoder);
- }
-
- @Override
- protected List<T> fromElements(Iterable<WindowedValue<T>> contents) {
- return ImmutableList.copyOf(
- Iterables.transform(contents, new Function<WindowedValue<T>, T>() {
- @SuppressWarnings("unchecked")
- @Override
- public T apply(WindowedValue<T> input) {
- return input.getValue();
- }
- }));
- }
- }
-
- /**
- * Implementation of conversion {@code Iterable<WindowedValue<KV<K, V>>>}
- * to {@code Map<K, Iterable<V>>}.
- *
- * <p>For internal use only.
- */
- public static class MultimapPCollectionView<K, V, W extends BoundedWindow>
- extends PCollectionViewBase<KV<K, V>, Map<K, Iterable<V>>, W> {
- private MultimapPCollectionView(
- Pipeline pipeline,
- WindowingStrategy<?, W> windowingStrategy,
- Coder<KV<K, V>> valueCoder) {
- super(pipeline, windowingStrategy, valueCoder);
- }
-
- @Override
- protected Map<K, Iterable<V>> fromElements(Iterable<WindowedValue<KV<K, V>>> elements) {
- Multimap<K, V> multimap = HashMultimap.create();
- for (WindowedValue<KV<K, V>> elem : elements) {
- KV<K, V> kv = elem.getValue();
- multimap.put(kv.getKey(), kv.getValue());
- }
- // Safe covariant cast that Java cannot express without rawtypes, even with unchecked casts
- @SuppressWarnings({"unchecked", "rawtypes"})
- Map<K, Iterable<V>> resultMap = (Map) multimap.asMap();
- return Collections.unmodifiableMap(resultMap);
- }
- }
-
- /**
- * Implementation of conversion {@code Iterable<WindowedValue<KV<K, V>>} with
- * one value per key to {@code Map<K, V>}.
- *
- * <p>For internal use only.
- */
- public static class MapPCollectionView<K, V, W extends BoundedWindow>
- extends PCollectionViewBase<KV<K, V>, Map<K, V>, W> {
- private MapPCollectionView(
- Pipeline pipeline,
- WindowingStrategy<?, W> windowingStrategy,
- Coder<KV<K, V>> valueCoder) {
- super(pipeline, windowingStrategy, valueCoder);
- }
-
- /**
- * Input iterable must actually be {@code Iterable<WindowedValue<KV<K, V>>>}.
- */
- @Override
- protected Map<K, V> fromElements(Iterable<WindowedValue<KV<K, V>>> elements) {
- Map<K, V> map = new HashMap<>();
- for (WindowedValue<KV<K, V>> elem : elements) {
- KV<K, V> kv = elem.getValue();
- if (map.containsKey(kv.getKey())) {
- throw new IllegalArgumentException("Duplicate values for " + kv.getKey());
- }
- map.put(kv.getKey(), kv.getValue());
- }
- return Collections.unmodifiableMap(map);
- }
- }
-
- /**
- * A base class for {@link PCollectionView} implementations, with additional type parameters
- * that are not visible at pipeline assembly time when the view is used as a side input.
- */
- private abstract static class PCollectionViewBase<ElemT, ViewT, W extends BoundedWindow>
- extends PValueBase
- implements PCollectionView<ViewT> {
- /** A unique tag for the view, typed according to the elements underlying the view. */
- private TupleTag<Iterable<WindowedValue<ElemT>>> tag;
-
- /** The windowing strategy for the PCollection underlying the view. */
- private WindowingStrategy<?, W> windowingStrategy;
-
- /** The coder for the elements underlying the view. */
- private Coder<Iterable<WindowedValue<ElemT>>> coder;
-
- /**
- * Implement this to complete the implementation. It is a conversion function from
- * all of the elements of the underlying {@link PCollection} to the value of the view.
- */
- protected abstract ViewT fromElements(Iterable<WindowedValue<ElemT>> elements);
-
- /**
- * Call this constructor to initialize the fields for which this base class provides
- * boilerplate accessors.
- */
- protected PCollectionViewBase(
- Pipeline pipeline,
- TupleTag<Iterable<WindowedValue<ElemT>>> tag,
- WindowingStrategy<?, W> windowingStrategy,
- Coder<ElemT> valueCoder) {
- super(pipeline);
- if (windowingStrategy.getWindowFn() instanceof InvalidWindows) {
- throw new IllegalArgumentException("WindowFn of PCollectionView cannot be InvalidWindows");
- }
- this.tag = tag;
- this.windowingStrategy = windowingStrategy;
- this.coder =
- IterableCoder.of(WindowedValue.getFullCoder(
- valueCoder, windowingStrategy.getWindowFn().windowCoder()));
- }
-
- /**
- * Call this constructor to initialize the fields for which this base class provides
- * boilerplate accessors, with an auto-generated tag.
- */
- protected PCollectionViewBase(
- Pipeline pipeline,
- WindowingStrategy<?, W> windowingStrategy,
- Coder<ElemT> valueCoder) {
- this(pipeline, new TupleTag<Iterable<WindowedValue<ElemT>>>(), windowingStrategy, valueCoder);
- }
-
- /**
- * For serialization only. Do not use directly. Subclasses should call from their own
- * protected no-argument constructor.
- */
- @SuppressWarnings("unused") // used for serialization
- protected PCollectionViewBase() {
- super();
- }
-
- @Override
- public ViewT fromIterableInternal(Iterable<WindowedValue<?>> elements) {
- // Safe cast: it is required that the rest of the SDK maintain the invariant
- // that a PCollectionView is only provided an iterable for the elements of an
- // appropriately typed PCollection.
- @SuppressWarnings({"rawtypes", "unchecked"})
- Iterable<WindowedValue<ElemT>> typedElements = (Iterable) elements;
- return fromElements(typedElements);
- }
-
- /**
- * Returns a unique {@link TupleTag} identifying this {@link PCollectionView}.
- *
- * <p>For internal use only by runner implementors.
- */
- @Override
- public TupleTag<Iterable<WindowedValue<?>>> getTagInternal() {
- // Safe cast: It is required that the rest of the SDK maintain the invariant that
- // this tag is only used to access the contents of an appropriately typed underlying
- // PCollection
- @SuppressWarnings({"rawtypes", "unchecked"})
- TupleTag<Iterable<WindowedValue<?>>> untypedTag = (TupleTag) tag;
- return untypedTag;
- }
-
- /**
- * Returns the {@link WindowingStrategy} of this {@link PCollectionView}, which should
- * be that of the underlying {@link PCollection}.
- *
- * <p>For internal use only by runner implementors.
- */
- @Override
- public WindowingStrategy<?, ?> getWindowingStrategyInternal() {
- return windowingStrategy;
- }
-
- @Override
- public Coder<Iterable<WindowedValue<?>>> getCoderInternal() {
- // Safe cast: It is required that the rest of the SDK only use this untyped coder
- // for the elements of an appropriately typed underlying PCollection.
- @SuppressWarnings({"rawtypes", "unchecked"})
- Coder<Iterable<WindowedValue<?>>> untypedCoder = (Coder) coder;
- return untypedCoder;
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(tag);
- }
-
- @Override
- public boolean equals(Object other) {
- if (!(other instanceof PCollectionView) || other == null) {
- return false;
- }
- @SuppressWarnings("unchecked")
- PCollectionView<?> otherView = (PCollectionView<?>) other;
- return tag.equals(otherView.getTagInternal());
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(this).add("tag", tag).toString();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PTuple.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PTuple.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PTuple.java
deleted file mode 100644
index 5b87b5c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PTuple.java
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-
-import java.util.Collections;
-import java.util.LinkedHashMap;
-import java.util.Map;
-
-/**
- * A {@code PTuple} is an immutable tuple of
- * heterogeneously-typed values, "keyed" by {@link TupleTag}s.
- *
- * <p>PTuples can be created and accessed like follows:
- * <pre> {@code
- * String v1 = ...;
- * Integer v2 = ...;
- * Iterable<String> v3 = ...;
- *
- * // Create TupleTags for each of the values to put in the
- * // PTuple (the type of the TupleTag enables tracking the
- * // static type of each of the values in the PTuple):
- * TupleTag<String> tag1 = new TupleTag<>();
- * TupleTag<Integer> tag2 = new TupleTag<>();
- * TupleTag<Iterable<String>> tag3 = new TupleTag<>();
- *
- * // Create a PTuple with three values:
- * PTuple povs =
- * PTuple.of(tag1, v1)
- * .and(tag2, v2)
- * .and(tag3, v3);
- *
- * // Create an empty PTuple:
- * Pipeline p = ...;
- * PTuple povs2 = PTuple.empty(p);
- *
- * // Get values out of a PTuple, using the same tags
- * // that were used to put them in:
- * Integer vX = povs.get(tag2);
- * String vY = povs.get(tag1);
- * Iterable<String> vZ = povs.get(tag3);
- *
- * // Get a map of all values in a PTuple:
- * Map<TupleTag<?>, ?> allVs = povs.getAll();
- * } </pre>
- */
-public class PTuple {
- /**
- * Returns an empty PTuple.
- *
- * <p>Longer PTuples can be created by calling
- * {@link #and} on the result.
- */
- public static PTuple empty() {
- return new PTuple();
- }
-
- /**
- * Returns a singleton PTuple containing the given
- * value keyed by the given TupleTag.
- *
- * <p>Longer PTuples can be created by calling
- * {@link #and} on the result.
- */
- public static <V> PTuple of(TupleTag<V> tag, V value) {
- return empty().and(tag, value);
- }
-
- /**
- * Returns a new PTuple that has all the values and
- * tags of this PTuple plus the given value and tag.
- *
- * <p>The given TupleTag should not already be mapped to a
- * value in this PTuple.
- */
- public <V> PTuple and(TupleTag<V> tag, V value) {
- Map<TupleTag<?>, Object> newMap = new LinkedHashMap<TupleTag<?>, Object>();
- newMap.putAll(valueMap);
- newMap.put(tag, value);
- return new PTuple(newMap);
- }
-
- /**
- * Returns whether this PTuple contains a value with
- * the given tag.
- */
- public <V> boolean has(TupleTag<V> tag) {
- return valueMap.containsKey(tag);
- }
-
- /**
- * Returns true if this {@code PTuple} is empty.
- */
- public boolean isEmpty() {
- return valueMap.isEmpty();
- }
-
- /**
- * Returns the value with the given tag in this
- * PTuple. Throws IllegalArgumentException if there is no
- * such value, i.e., {@code !has(tag)}.
- */
- public <V> V get(TupleTag<V> tag) {
- if (!has(tag)) {
- throw new IllegalArgumentException(
- "TupleTag not found in this PTuple");
- }
- @SuppressWarnings("unchecked")
- V value = (V) valueMap.get(tag);
- return value;
- }
-
- /**
- * Returns an immutable Map from TupleTag to corresponding
- * value, for all the members of this PTuple.
- */
- public Map<TupleTag<?>, ?> getAll() {
- return valueMap;
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
- // Internal details below here.
-
- private final Map<TupleTag<?>, ?> valueMap;
-
- @SuppressWarnings("rawtypes")
- private PTuple() {
- this(new LinkedHashMap());
- }
-
- private PTuple(Map<TupleTag<?>, ?> valueMap) {
- this.valueMap = Collections.unmodifiableMap(valueMap);
- }
-
- /**
- * Returns a PTuple with each of the given tags mapping
- * to the corresponding value.
- *
- * <p>For internal use only.
- */
- public static PTuple ofInternal(Map<TupleTag<?>, ?> valueMap) {
- return new PTuple(valueMap);
- }
-}
[06/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ZipFiles.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ZipFiles.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ZipFiles.java
deleted file mode 100644
index 773b65f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ZipFiles.java
+++ /dev/null
@@ -1,294 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.google.common.collect.FluentIterable;
-import com.google.common.collect.Iterators;
-import com.google.common.io.ByteSource;
-import com.google.common.io.CharSource;
-import com.google.common.io.Closer;
-import com.google.common.io.Files;
-
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.nio.charset.Charset;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipFile;
-import java.util.zip.ZipOutputStream;
-
-/**
- * Functions for zipping a directory (including a subdirectory) into a ZIP-file
- * or unzipping it again.
- */
-public final class ZipFiles {
- private ZipFiles() {}
-
- /**
- * Returns a new {@link ByteSource} for reading the contents of the given
- * entry in the given zip file.
- */
- static ByteSource asByteSource(ZipFile file, ZipEntry entry) {
- return new ZipEntryByteSource(file, entry);
- }
-
- /**
- * Returns a new {@link CharSource} for reading the contents of the given
- * entry in the given zip file as text using the given charset.
- */
- static CharSource asCharSource(
- ZipFile file, ZipEntry entry, Charset charset) {
- return asByteSource(file, entry).asCharSource(charset);
- }
-
- private static final class ZipEntryByteSource extends ByteSource {
-
- private final ZipFile file;
- private final ZipEntry entry;
-
- ZipEntryByteSource(ZipFile file, ZipEntry entry) {
- this.file = checkNotNull(file);
- this.entry = checkNotNull(entry);
- }
-
- @Override
- public InputStream openStream() throws IOException {
- return file.getInputStream(entry);
- }
-
- // TODO: implement size() to try calling entry.getSize()?
-
- @Override
- public String toString() {
- return "ZipFiles.asByteSource(" + file + ", " + entry + ")";
- }
- }
-
- /**
- * Returns a {@link FluentIterable} of all the entries in the given zip file.
- */
- // unmodifiable Iterator<? extends ZipEntry> can be safely cast
- // to Iterator<ZipEntry>
- @SuppressWarnings("unchecked")
- static FluentIterable<ZipEntry> entries(final ZipFile file) {
- checkNotNull(file);
- return new FluentIterable<ZipEntry>() {
- @Override
- public Iterator<ZipEntry> iterator() {
- return (Iterator<ZipEntry>) Iterators.forEnumeration(file.entries());
- }
- };
- }
-
- /**
- * Unzips the zip file specified by the path and creates the directory structure <i>inside</i>
- * the target directory. Refuses to unzip files that refer to a parent directory, for security
- * reasons.
- *
- * @param zipFile the source zip-file to unzip
- * @param targetDirectory the directory to unzip to. If the zip-file contains
- * any subdirectories, they will be created within our target directory.
- * @throws IOException the unzipping failed, e.g. because the output was not writable, the {@code
- * zipFile} was not readable, or contains an illegal entry (contains "..", pointing outside
- * the target directory)
- * @throws IllegalArgumentException the target directory is not a valid directory (e.g. does not
- * exist, or is a file instead of a directory)
- */
- static void unzipFile(
- File zipFile,
- File targetDirectory) throws IOException {
- checkNotNull(zipFile);
- checkNotNull(targetDirectory);
- checkArgument(
- targetDirectory.isDirectory(),
- "%s is not a valid directory",
- targetDirectory.getAbsolutePath());
- final ZipFile zipFileObj = new ZipFile(zipFile);
- try {
- for (ZipEntry entry : entries(zipFileObj)) {
- checkName(entry.getName());
- File targetFile = new File(targetDirectory, entry.getName());
- if (entry.isDirectory()) {
- if (!targetFile.isDirectory() && !targetFile.mkdirs()) {
- throw new IOException(
- "Failed to create directory: " + targetFile.getAbsolutePath());
- }
- } else {
- File parentFile = targetFile.getParentFile();
- if (!parentFile.isDirectory()) {
- if (!parentFile.mkdirs()) {
- throw new IOException(
- "Failed to create directory: "
- + parentFile.getAbsolutePath());
- }
- }
- // Write the file to the destination.
- asByteSource(zipFileObj, entry).copyTo(Files.asByteSink(targetFile));
- }
- }
- } finally {
- zipFileObj.close();
- }
- }
-
- /**
- * Checks that the given entry name is legal for unzipping: if it contains
- * ".." as a name element, it could cause the entry to be unzipped outside
- * the directory we're unzipping to.
- *
- * @throws IOException if the name is illegal
- */
- private static void checkName(String name) throws IOException {
- // First just check whether the entry name string contains "..".
- // This should weed out the the vast majority of entries, which will not
- // contain "..".
- if (name.contains("..")) {
- // If the string does contain "..", break it down into its actual name
- // elements to ensure it actually contains ".." as a name, not just a
- // name like "foo..bar" or even "foo..", which should be fine.
- File file = new File(name);
- while (file != null) {
- if (file.getName().equals("..")) {
- throw new IOException("Cannot unzip file containing an entry with "
- + "\"..\" in the name: " + name);
- }
- file = file.getParentFile();
- }
- }
- }
-
- /**
- * Zips an entire directory specified by the path.
- *
- * @param sourceDirectory the directory to read from. This directory and all
- * subdirectories will be added to the zip-file. The path within the zip
- * file is relative to the directory given as parameter, not absolute.
- * @param zipFile the zip-file to write to.
- * @throws IOException the zipping failed, e.g. because the input was not
- * readable.
- */
- static void zipDirectory(
- File sourceDirectory,
- File zipFile) throws IOException {
- checkNotNull(sourceDirectory);
- checkNotNull(zipFile);
- checkArgument(
- sourceDirectory.isDirectory(),
- "%s is not a valid directory",
- sourceDirectory.getAbsolutePath());
- checkArgument(
- !zipFile.exists(),
- "%s does already exist, files are not being overwritten",
- zipFile.getAbsolutePath());
- Closer closer = Closer.create();
- try {
- OutputStream outputStream = closer.register(new BufferedOutputStream(
- new FileOutputStream(zipFile)));
- zipDirectory(sourceDirectory, outputStream);
- } catch (Throwable t) {
- throw closer.rethrow(t);
- } finally {
- closer.close();
- }
- }
-
- /**
- * Zips an entire directory specified by the path.
- *
- * @param sourceDirectory the directory to read from. This directory and all
- * subdirectories will be added to the zip-file. The path within the zip
- * file is relative to the directory given as parameter, not absolute.
- * @param outputStream the stream to write the zip-file to. This method does not close
- * outputStream.
- * @throws IOException the zipping failed, e.g. because the input was not
- * readable.
- */
- static void zipDirectory(
- File sourceDirectory,
- OutputStream outputStream) throws IOException {
- checkNotNull(sourceDirectory);
- checkNotNull(outputStream);
- checkArgument(
- sourceDirectory.isDirectory(),
- "%s is not a valid directory",
- sourceDirectory.getAbsolutePath());
- ZipOutputStream zos = new ZipOutputStream(outputStream);
- for (File file : sourceDirectory.listFiles()) {
- zipDirectoryInternal(file, "", zos);
- }
- zos.finish();
- }
-
- /**
- * Private helper function for zipping files. This one goes recursively
- * through the input directory and all of its subdirectories and adds the
- * single zip entries.
- *
- * @param inputFile the file or directory to be added to the zip file
- * @param directoryName the string-representation of the parent directory
- * name. Might be an empty name, or a name containing multiple directory
- * names separated by "/". The directory name must be a valid name
- * according to the file system limitations. The directory name should be
- * empty or should end in "/".
- * @param zos the zipstream to write to
- * @throws IOException the zipping failed, e.g. because the output was not
- * writeable.
- */
- private static void zipDirectoryInternal(
- File inputFile,
- String directoryName,
- ZipOutputStream zos) throws IOException {
- String entryName = directoryName + inputFile.getName();
- if (inputFile.isDirectory()) {
- entryName += "/";
-
- // We are hitting a sub-directory. Recursively add children to zip in deterministic,
- // sorted order.
- File[] childFiles = inputFile.listFiles();
- if (childFiles.length > 0) {
- Arrays.sort(childFiles);
- // loop through the directory content, and zip the files
- for (File file : childFiles) {
- zipDirectoryInternal(file, entryName, zos);
- }
-
- // Since this directory has children, exit now without creating a zipentry specific to
- // this directory. The entry for a non-entry directory is incompatible with certain
- // implementations of unzip.
- return;
- }
- }
-
- // Put the zip-entry for this file or empty directory into the zipoutputstream.
- ZipEntry entry = new ZipEntry(entryName);
- entry.setTime(inputFile.lastModified());
- zos.putNextEntry(entry);
-
- // Copy file contents into zipoutput stream.
- if (inputFile.isFile()) {
- Files.asByteSource(inputFile).copyTo(zos);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/Counter.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/Counter.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/Counter.java
deleted file mode 100644
index 2c1985c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/Counter.java
+++ /dev/null
@@ -1,1103 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.util.common;
-
-import static com.google.cloud.dataflow.sdk.util.common.Counter.AggregationKind.AND;
-import static com.google.cloud.dataflow.sdk.util.common.Counter.AggregationKind.MEAN;
-import static com.google.cloud.dataflow.sdk.util.common.Counter.AggregationKind.OR;
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-import com.google.common.util.concurrent.AtomicDouble;
-
-import java.util.Objects;
-import java.util.concurrent.atomic.AtomicBoolean;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.atomic.AtomicLong;
-import java.util.concurrent.atomic.AtomicReference;
-
-import javax.annotation.Nullable;
-
-/**
- * A Counter enables the aggregation of a stream of values over time. The
- * cumulative aggregate value is updated as new values are added, or it can be
- * reset to a new value. Multiple kinds of aggregation are supported depending
- * on the type of the counter.
- *
- * <p>Counters compare using value equality of their name, kind, and
- * cumulative value. Equal counters should have equal toString()s.
- *
- * @param <T> the type of values aggregated by this counter
- */
-public abstract class Counter<T> {
- /**
- * Possible kinds of counter aggregation.
- */
- public static enum AggregationKind {
-
- /**
- * Computes the sum of all added values.
- * Applicable to {@link Integer}, {@link Long}, and {@link Double} values.
- */
- SUM,
-
- /**
- * Computes the maximum value of all added values.
- * Applicable to {@link Integer}, {@link Long}, and {@link Double} values.
- */
- MAX,
-
- /**
- * Computes the minimum value of all added values.
- * Applicable to {@link Integer}, {@link Long}, and {@link Double} values.
- */
- MIN,
-
- /**
- * Computes the arithmetic mean of all added values. Applicable to
- * {@link Integer}, {@link Long}, and {@link Double} values.
- */
- MEAN,
-
- /**
- * Computes boolean AND over all added values.
- * Applicable only to {@link Boolean} values.
- */
- AND,
-
- /**
- * Computes boolean OR over all added values. Applicable only to
- * {@link Boolean} values.
- */
- OR
- // TODO: consider adding VECTOR_SUM, HISTOGRAM, KV_SET, PRODUCT, TOP.
- }
-
- /**
- * Constructs a new {@link Counter} that aggregates {@link Integer}, values
- * according to the desired aggregation kind. The supported aggregation kinds
- * are {@link AggregationKind#SUM}, {@link AggregationKind#MIN},
- * {@link AggregationKind#MAX}, and {@link AggregationKind#MEAN}.
- * This is a convenience wrapper over a
- * {@link Counter} implementation that aggregates {@link Long} values. This is
- * useful when the application handles (boxed) {@link Integer} values that
- * are not readily convertible to the (boxed) {@link Long} values otherwise
- * expected by the {@link Counter} implementation aggregating {@link Long}
- * values.
- *
- * @param name the name of the new counter
- * @param kind the new counter's aggregation kind
- * @return the newly constructed Counter
- * @throws IllegalArgumentException if the aggregation kind is not supported
- */
- public static Counter<Integer> ints(String name, AggregationKind kind) {
- return new IntegerCounter(name, kind);
- }
-
- /**
- * Constructs a new {@link Counter} that aggregates {@link Long} values
- * according to the desired aggregation kind. The supported aggregation kinds
- * are {@link AggregationKind#SUM}, {@link AggregationKind#MIN},
- * {@link AggregationKind#MAX}, and {@link AggregationKind#MEAN}.
- *
- * @param name the name of the new counter
- * @param kind the new counter's aggregation kind
- * @return the newly constructed Counter
- * @throws IllegalArgumentException if the aggregation kind is not supported
- */
- public static Counter<Long> longs(String name, AggregationKind kind) {
- return new LongCounter(name, kind);
- }
-
- /**
- * Constructs a new {@link Counter} that aggregates {@link Double} values
- * according to the desired aggregation kind. The supported aggregation kinds
- * are {@link AggregationKind#SUM}, {@link AggregationKind#MIN},
- * {@link AggregationKind#MAX}, and {@link AggregationKind#MEAN}.
- *
- * @param name the name of the new counter
- * @param kind the new counter's aggregation kind
- * @return the newly constructed Counter
- * @throws IllegalArgumentException if the aggregation kind is not supported
- */
- public static Counter<Double> doubles(String name, AggregationKind kind) {
- return new DoubleCounter(name, kind);
- }
-
- /**
- * Constructs a new {@link Counter} that aggregates {@link Boolean} values
- * according to the desired aggregation kind. The only supported aggregation
- * kinds are {@link AggregationKind#AND} and {@link AggregationKind#OR}.
- *
- * @param name the name of the new counter
- * @param kind the new counter's aggregation kind
- * @return the newly constructed Counter
- * @throws IllegalArgumentException if the aggregation kind is not supported
- */
- public static Counter<Boolean> booleans(String name, AggregationKind kind) {
- return new BooleanCounter(name, kind);
- }
-
- /**
- * Constructs a new {@link Counter} that aggregates {@link String} values
- * according to the desired aggregation kind. The only supported aggregation
- * kind is {@link AggregationKind#MIN} and {@link AggregationKind#MAX}.
- *
- * @param name the name of the new counter
- * @param kind the new counter's aggregation kind
- * @return the newly constructed Counter
- * @throws IllegalArgumentException if the aggregation kind is not supported
- */
- @SuppressWarnings("unused")
- private static Counter<String> strings(String name, AggregationKind kind) {
- return new StringCounter(name, kind);
- }
-
-
- //////////////////////////////////////////////////////////////////////////////
-
- /**
- * Adds a new value to the aggregation stream. Returns this (to allow method
- * chaining).
- */
- public abstract Counter<T> addValue(T value);
-
- /**
- * Resets the aggregation stream to this new value. This aggregator must not
- * be a MEAN aggregator. Returns this (to allow method chaining).
- */
- public abstract Counter<T> resetToValue(T value);
-
- /**
- * Resets the aggregation stream to this new value. Returns this (to allow
- * method chaining). The value of elementCount must be non-negative, and this
- * aggregator must be a MEAN aggregator.
- */
- public abstract Counter<T> resetMeanToValue(long elementCount, T value);
-
- /**
- * Resets the counter's delta value to have no values accumulated and returns
- * the value of the delta prior to the reset.
- *
- * @return the aggregate delta at the time this method is called
- */
- public abstract T getAndResetDelta();
-
- /**
- * Resets the counter's delta value to have no values accumulated and returns
- * the value of the delta prior to the reset, for a MEAN counter.
- *
- * @return the mean delta t the time this method is called
- */
- public abstract CounterMean<T> getAndResetMeanDelta();
-
- /**
- * Returns the counter's name.
- */
- public String getName() {
- return name;
- }
-
- /**
- * Returns the counter's aggregation kind.
- */
- public AggregationKind getKind() {
- return kind;
- }
-
- /**
- * Returns the counter's type.
- */
- public Class<?> getType() {
- return new TypeDescriptor<T>(getClass()) {}.getRawType();
- }
-
- /**
- * Returns the aggregated value, or the sum for MEAN aggregation, either
- * total or, if delta, since the last update extraction or resetDelta.
- */
- public abstract T getAggregate();
-
- /**
- * The mean value of a {@code Counter}, represented as an aggregate value and
- * a count.
- *
- * @param <T> the type of the aggregate
- */
- public static interface CounterMean<T> {
- /**
- * Gets the aggregate value of this {@code CounterMean}.
- */
- T getAggregate();
-
- /**
- * Gets the count of this {@code CounterMean}.
- */
- long getCount();
- }
-
- /**
- * Returns the mean in the form of a CounterMean, or null if this is not a
- * MEAN counter.
- */
- @Nullable
- public abstract CounterMean<T> getMean();
-
- /**
- * Returns a string representation of the Counter. Useful for debugging logs.
- * Example return value: "ElementCount:SUM(15)".
- */
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append(getName());
- sb.append(":");
- sb.append(getKind());
- sb.append("(");
- switch (kind) {
- case SUM:
- case MAX:
- case MIN:
- case AND:
- case OR:
- sb.append(getAggregate());
- break;
- case MEAN:
- sb.append(getMean());
- break;
- default:
- throw illegalArgumentException();
- }
- sb.append(")");
-
- return sb.toString();
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- } else if (o instanceof Counter) {
- Counter<?> that = (Counter<?>) o;
- if (this.name.equals(that.name) && this.kind == that.kind
- && this.getClass().equals(that.getClass())) {
- if (kind == MEAN) {
- CounterMean<T> thisMean = this.getMean();
- CounterMean<?> thatMean = that.getMean();
- return thisMean == thatMean
- || (Objects.equals(thisMean.getAggregate(), thatMean.getAggregate())
- && thisMean.getCount() == thatMean.getCount());
- } else {
- return Objects.equals(this.getAggregate(), that.getAggregate());
- }
- }
- }
- return false;
- }
-
- @Override
- public int hashCode() {
- if (kind == MEAN) {
- CounterMean<T> mean = getMean();
- return Objects.hash(getClass(), name, kind, mean.getAggregate(), mean.getCount());
- } else {
- return Objects.hash(getClass(), name, kind, getAggregate());
- }
- }
-
- /**
- * Returns whether this Counter is compatible with that Counter. If
- * so, they can be merged into a single Counter.
- */
- public boolean isCompatibleWith(Counter<?> that) {
- return this.name.equals(that.name)
- && this.kind == that.kind
- && this.getClass().equals(that.getClass());
- }
-
- /**
- * Merges this counter with the provided counter, returning this counter with the combined value
- * of both counters. This may reset the delta of this counter.
- *
- * @throws IllegalArgumentException if the provided Counter is not compatible with this Counter
- */
- public abstract Counter<T> merge(Counter<T> that);
-
- //////////////////////////////////////////////////////////////////////////////
-
- /** The name of this counter. */
- protected final String name;
-
- /** The kind of aggregation function to apply to this counter. */
- protected final AggregationKind kind;
-
- protected Counter(String name, AggregationKind kind) {
- this.name = name;
- this.kind = kind;
- }
-
- //////////////////////////////////////////////////////////////////////////////
-
- /**
- * Implements a {@link Counter} for {@link Long} values.
- */
- private static class LongCounter extends Counter<Long> {
- private final AtomicLong aggregate;
- private final AtomicLong deltaAggregate;
- private final AtomicReference<LongCounterMean> mean;
- private final AtomicReference<LongCounterMean> deltaMean;
-
- /** Initializes a new {@link Counter} for {@link Long} values. */
- private LongCounter(String name, AggregationKind kind) {
- super(name, kind);
- switch (kind) {
- case MEAN:
- mean = new AtomicReference<>();
- deltaMean = new AtomicReference<>();
- getAndResetMeanDelta();
- mean.set(deltaMean.get());
- aggregate = deltaAggregate = null;
- break;
- case SUM:
- case MAX:
- case MIN:
- aggregate = new AtomicLong();
- deltaAggregate = new AtomicLong();
- getAndResetDelta();
- aggregate.set(deltaAggregate.get());
- mean = deltaMean = null;
- break;
- default:
- throw illegalArgumentException();
- }
- }
-
- @Override
- public LongCounter addValue(Long value) {
- switch (kind) {
- case SUM:
- aggregate.addAndGet(value);
- deltaAggregate.addAndGet(value);
- break;
- case MEAN:
- addToMeanAndSet(value, mean);
- addToMeanAndSet(value, deltaMean);
- break;
- case MAX:
- maxAndSet(value, aggregate);
- maxAndSet(value, deltaAggregate);
- break;
- case MIN:
- minAndSet(value, aggregate);
- minAndSet(value, deltaAggregate);
- break;
- default:
- throw illegalArgumentException();
- }
- return this;
- }
-
- private void minAndSet(Long value, AtomicLong target) {
- long current;
- long update;
- do {
- current = target.get();
- update = Math.min(value, current);
- } while (update < current && !target.compareAndSet(current, update));
- }
-
- private void maxAndSet(Long value, AtomicLong target) {
- long current;
- long update;
- do {
- current = target.get();
- update = Math.max(value, current);
- } while (update > current && !target.compareAndSet(current, update));
- }
-
- private void addToMeanAndSet(Long value, AtomicReference<LongCounterMean> target) {
- LongCounterMean current;
- LongCounterMean update;
- do {
- current = target.get();
- update = new LongCounterMean(current.getAggregate() + value, current.getCount() + 1L);
- } while (!target.compareAndSet(current, update));
- }
-
- @Override
- public Long getAggregate() {
- if (kind != MEAN) {
- return aggregate.get();
- } else {
- return getMean().getAggregate();
- }
- }
-
- @Override
- public Long getAndResetDelta() {
- switch (kind) {
- case SUM:
- return deltaAggregate.getAndSet(0L);
- case MAX:
- return deltaAggregate.getAndSet(Long.MIN_VALUE);
- case MIN:
- return deltaAggregate.getAndSet(Long.MAX_VALUE);
- default:
- throw illegalArgumentException();
- }
- }
-
- @Override
- public Counter<Long> resetToValue(Long value) {
- if (kind == MEAN) {
- throw illegalArgumentException();
- }
- aggregate.set(value);
- deltaAggregate.set(value);
- return this;
- }
-
- @Override
- public Counter<Long> resetMeanToValue(long elementCount, Long value) {
- if (kind != MEAN) {
- throw illegalArgumentException();
- }
- if (elementCount < 0) {
- throw new IllegalArgumentException("elementCount must be non-negative");
- }
- LongCounterMean counterMean = new LongCounterMean(value, elementCount);
- mean.set(counterMean);
- deltaMean.set(counterMean);
- return this;
- }
-
- @Override
- public CounterMean<Long> getAndResetMeanDelta() {
- if (kind != MEAN) {
- throw illegalArgumentException();
- }
- return deltaMean.getAndSet(new LongCounterMean(0L, 0L));
- }
-
- @Override
- @Nullable
- public CounterMean<Long> getMean() {
- if (kind != MEAN) {
- throw illegalArgumentException();
- }
- return mean.get();
- }
-
- @Override
- public Counter<Long> merge(Counter<Long> that) {
- checkArgument(this.isCompatibleWith(that), "Counters %s and %s are incompatible", this, that);
- switch (kind) {
- case SUM:
- case MIN:
- case MAX:
- return addValue(that.getAggregate());
- case MEAN:
- CounterMean<Long> thisCounterMean = this.getMean();
- CounterMean<Long> thatCounterMean = that.getMean();
- return resetMeanToValue(
- thisCounterMean.getCount() + thatCounterMean.getCount(),
- thisCounterMean.getAggregate() + thatCounterMean.getAggregate());
- default:
- throw illegalArgumentException();
- }
- }
-
- private static class LongCounterMean implements CounterMean<Long> {
- private final long aggregate;
- private final long count;
-
- public LongCounterMean(long aggregate, long count) {
- this.aggregate = aggregate;
- this.count = count;
- }
-
- @Override
- public Long getAggregate() {
- return aggregate;
- }
-
- @Override
- public long getCount() {
- return count;
- }
-
- @Override
- public String toString() {
- return aggregate + "/" + count;
- }
- }
- }
-
- /**
- * Implements a {@link Counter} for {@link Double} values.
- */
- private static class DoubleCounter extends Counter<Double> {
- AtomicDouble aggregate;
- AtomicDouble deltaAggregate;
- AtomicReference<DoubleCounterMean> mean;
- AtomicReference<DoubleCounterMean> deltaMean;
-
- /** Initializes a new {@link Counter} for {@link Double} values. */
- private DoubleCounter(String name, AggregationKind kind) {
- super(name, kind);
- switch (kind) {
- case MEAN:
- aggregate = deltaAggregate = null;
- mean = new AtomicReference<>();
- deltaMean = new AtomicReference<>();
- getAndResetMeanDelta();
- mean.set(deltaMean.get());
- break;
- case SUM:
- case MAX:
- case MIN:
- mean = deltaMean = null;
- aggregate = new AtomicDouble();
- deltaAggregate = new AtomicDouble();
- getAndResetDelta();
- aggregate.set(deltaAggregate.get());
- break;
- default:
- throw illegalArgumentException();
- }
- }
-
- @Override
- public DoubleCounter addValue(Double value) {
- switch (kind) {
- case SUM:
- aggregate.addAndGet(value);
- deltaAggregate.addAndGet(value);
- break;
- case MEAN:
- addToMeanAndSet(value, mean);
- addToMeanAndSet(value, deltaMean);
- break;
- case MAX:
- maxAndSet(value, aggregate);
- maxAndSet(value, deltaAggregate);
- break;
- case MIN:
- minAndSet(value, aggregate);
- minAndSet(value, deltaAggregate);
- break;
- default:
- throw illegalArgumentException();
- }
- return this;
- }
-
- private void addToMeanAndSet(Double value, AtomicReference<DoubleCounterMean> target) {
- DoubleCounterMean current;
- DoubleCounterMean update;
- do {
- current = target.get();
- update = new DoubleCounterMean(current.getAggregate() + value, current.getCount() + 1);
- } while (!target.compareAndSet(current, update));
- }
-
- private void maxAndSet(Double value, AtomicDouble target) {
- double current;
- double update;
- do {
- current = target.get();
- update = Math.max(current, value);
- } while (update > current && !target.compareAndSet(current, update));
- }
-
- private void minAndSet(Double value, AtomicDouble target) {
- double current;
- double update;
- do {
- current = target.get();
- update = Math.min(current, value);
- } while (update < current && !target.compareAndSet(current, update));
- }
-
- @Override
- public Double getAndResetDelta() {
- switch (kind) {
- case SUM:
- return deltaAggregate.getAndSet(0.0);
- case MAX:
- return deltaAggregate.getAndSet(Double.NEGATIVE_INFINITY);
- case MIN:
- return deltaAggregate.getAndSet(Double.POSITIVE_INFINITY);
- default:
- throw illegalArgumentException();
- }
- }
-
- @Override
- public Counter<Double> resetToValue(Double value) {
- if (kind == MEAN) {
- throw illegalArgumentException();
- }
- aggregate.set(value);
- deltaAggregate.set(value);
- return this;
- }
-
- @Override
- public Counter<Double> resetMeanToValue(long elementCount, Double value) {
- if (kind != MEAN) {
- throw illegalArgumentException();
- }
- if (elementCount < 0) {
- throw new IllegalArgumentException("elementCount must be non-negative");
- }
- DoubleCounterMean counterMean = new DoubleCounterMean(value, elementCount);
- mean.set(counterMean);
- deltaMean.set(counterMean);
- return this;
- }
-
- @Override
- public CounterMean<Double> getAndResetMeanDelta() {
- if (kind != MEAN) {
- throw illegalArgumentException();
- }
- return deltaMean.getAndSet(new DoubleCounterMean(0.0, 0L));
- }
-
- @Override
- public Double getAggregate() {
- if (kind != MEAN) {
- return aggregate.get();
- } else {
- return getMean().getAggregate();
- }
- }
-
- @Override
- @Nullable
- public CounterMean<Double> getMean() {
- if (kind != MEAN) {
- throw illegalArgumentException();
- }
- return mean.get();
- }
-
- @Override
- public Counter<Double> merge(Counter<Double> that) {
- checkArgument(this.isCompatibleWith(that), "Counters %s and %s are incompatible", this, that);
- switch (kind) {
- case SUM:
- case MIN:
- case MAX:
- return addValue(that.getAggregate());
- case MEAN:
- CounterMean<Double> thisCounterMean = this.getMean();
- CounterMean<Double> thatCounterMean = that.getMean();
- return resetMeanToValue(
- thisCounterMean.getCount() + thatCounterMean.getCount(),
- thisCounterMean.getAggregate() + thatCounterMean.getAggregate());
- default:
- throw illegalArgumentException();
- }
- }
-
- private static class DoubleCounterMean implements CounterMean<Double> {
- private final double aggregate;
- private final long count;
-
- public DoubleCounterMean(double aggregate, long count) {
- this.aggregate = aggregate;
- this.count = count;
- }
-
- @Override
- public Double getAggregate() {
- return aggregate;
- }
-
- @Override
- public long getCount() {
- return count;
- }
-
- @Override
- public String toString() {
- return aggregate + "/" + count;
- }
- }
- }
-
- /**
- * Implements a {@link Counter} for {@link Boolean} values.
- */
- private static class BooleanCounter extends Counter<Boolean> {
- private final AtomicBoolean aggregate;
- private final AtomicBoolean deltaAggregate;
-
- /** Initializes a new {@link Counter} for {@link Boolean} values. */
- private BooleanCounter(String name, AggregationKind kind) {
- super(name, kind);
- aggregate = new AtomicBoolean();
- deltaAggregate = new AtomicBoolean();
- getAndResetDelta();
- aggregate.set(deltaAggregate.get());
- }
-
- @Override
- public BooleanCounter addValue(Boolean value) {
- if (kind.equals(AND) && !value) {
- aggregate.set(value);
- deltaAggregate.set(value);
- } else if (kind.equals(OR) && value) {
- aggregate.set(value);
- deltaAggregate.set(value);
- }
- return this;
- }
-
- @Override
- public Boolean getAndResetDelta() {
- switch (kind) {
- case AND:
- return deltaAggregate.getAndSet(true);
- case OR:
- return deltaAggregate.getAndSet(false);
- default:
- throw illegalArgumentException();
- }
- }
-
- @Override
- public Counter<Boolean> resetToValue(Boolean value) {
- aggregate.set(value);
- deltaAggregate.set(value);
- return this;
- }
-
- @Override
- public Counter<Boolean> resetMeanToValue(long elementCount, Boolean value) {
- throw illegalArgumentException();
- }
-
- @Override
- public CounterMean<Boolean> getAndResetMeanDelta() {
- throw illegalArgumentException();
- }
-
- @Override
- public Boolean getAggregate() {
- return aggregate.get();
- }
-
- @Override
- @Nullable
- public CounterMean<Boolean> getMean() {
- throw illegalArgumentException();
- }
-
- @Override
- public Counter<Boolean> merge(Counter<Boolean> that) {
- checkArgument(this.isCompatibleWith(that), "Counters %s and %s are incompatible", this, that);
- return addValue(that.getAggregate());
- }
- }
-
- /**
- * Implements a {@link Counter} for {@link String} values.
- */
- private static class StringCounter extends Counter<String> {
- /** Initializes a new {@link Counter} for {@link String} values. */
- private StringCounter(String name, AggregationKind kind) {
- super(name, kind);
- // TODO: Support MIN, MAX of Strings.
- throw illegalArgumentException();
- }
-
- @Override
- public StringCounter addValue(String value) {
- switch (kind) {
- default:
- throw illegalArgumentException();
- }
- }
-
- @Override
- public Counter<String> resetToValue(String value) {
- switch (kind) {
- default:
- throw illegalArgumentException();
- }
- }
-
- @Override
- public Counter<String> resetMeanToValue(long elementCount, String value) {
- switch (kind) {
- default:
- throw illegalArgumentException();
- }
- }
-
- @Override
- public String getAndResetDelta() {
- switch (kind) {
- default:
- throw illegalArgumentException();
- }
- }
-
- @Override
- public CounterMean<String> getAndResetMeanDelta() {
- switch (kind) {
- default:
- throw illegalArgumentException();
- }
- }
-
- @Override
- public String getAggregate() {
- switch (kind) {
- default:
- throw illegalArgumentException();
- }
- }
-
- @Override
- @Nullable
- public CounterMean<String> getMean() {
- switch (kind) {
- default:
- throw illegalArgumentException();
- }
- }
-
- @Override
- public Counter<String> merge(Counter<String> that) {
- checkArgument(this.isCompatibleWith(that), "Counters %s and %s are incompatible", this, that);
- switch (kind) {
- default:
- throw illegalArgumentException();
- }
- }
- }
-
- /**
- * Implements a {@link Counter} for {@link Integer} values.
- */
- private static class IntegerCounter extends Counter<Integer> {
- private final AtomicInteger aggregate;
- private final AtomicInteger deltaAggregate;
- private final AtomicReference<IntegerCounterMean> mean;
- private final AtomicReference<IntegerCounterMean> deltaMean;
-
- /** Initializes a new {@link Counter} for {@link Integer} values. */
- private IntegerCounter(String name, AggregationKind kind) {
- super(name, kind);
- switch (kind) {
- case MEAN:
- aggregate = deltaAggregate = null;
- mean = new AtomicReference<>();
- deltaMean = new AtomicReference<>();
- getAndResetMeanDelta();
- mean.set(deltaMean.get());
- break;
- case SUM:
- case MAX:
- case MIN:
- mean = deltaMean = null;
- aggregate = new AtomicInteger();
- deltaAggregate = new AtomicInteger();
- getAndResetDelta();
- aggregate.set(deltaAggregate.get());
- break;
- default:
- throw illegalArgumentException();
- }
- }
-
- @Override
- public IntegerCounter addValue(Integer value) {
- switch (kind) {
- case SUM:
- aggregate.getAndAdd(value);
- deltaAggregate.getAndAdd(value);
- break;
- case MEAN:
- addToMeanAndSet(value, mean);
- addToMeanAndSet(value, deltaMean);
- break;
- case MAX:
- maxAndSet(value, aggregate);
- maxAndSet(value, deltaAggregate);
- break;
- case MIN:
- minAndSet(value, aggregate);
- minAndSet(value, deltaAggregate);
- break;
- default:
- throw illegalArgumentException();
- }
- return this;
- }
-
- private void addToMeanAndSet(int value, AtomicReference<IntegerCounterMean> target) {
- IntegerCounterMean current;
- IntegerCounterMean update;
- do {
- current = target.get();
- update = new IntegerCounterMean(current.getAggregate() + value, current.getCount() + 1);
- } while (!target.compareAndSet(current, update));
- }
-
- private void maxAndSet(int value, AtomicInteger target) {
- int current;
- int update;
- do {
- current = target.get();
- update = Math.max(value, current);
- } while (update > current && !target.compareAndSet(current, update));
- }
-
- private void minAndSet(int value, AtomicInteger target) {
- int current;
- int update;
- do {
- current = target.get();
- update = Math.min(value, current);
- } while (update < current && !target.compareAndSet(current, update));
- }
-
- @Override
- public Integer getAndResetDelta() {
- switch (kind) {
- case SUM:
- return deltaAggregate.getAndSet(0);
- case MAX:
- return deltaAggregate.getAndSet(Integer.MIN_VALUE);
- case MIN:
- return deltaAggregate.getAndSet(Integer.MAX_VALUE);
- default:
- throw illegalArgumentException();
- }
- }
-
- @Override
- public Counter<Integer> resetToValue(Integer value) {
- if (kind == MEAN) {
- throw illegalArgumentException();
- }
- aggregate.set(value);
- deltaAggregate.set(value);
- return this;
- }
-
- @Override
- public Counter<Integer> resetMeanToValue(long elementCount, Integer value) {
- if (kind != MEAN) {
- throw illegalArgumentException();
- }
- if (elementCount < 0) {
- throw new IllegalArgumentException("elementCount must be non-negative");
- }
- IntegerCounterMean counterMean = new IntegerCounterMean(value, elementCount);
- mean.set(counterMean);
- deltaMean.set(counterMean);
- return this;
- }
-
- @Override
- public CounterMean<Integer> getAndResetMeanDelta() {
- if (kind != MEAN) {
- throw illegalArgumentException();
- }
- return deltaMean.getAndSet(new IntegerCounterMean(0, 0L));
- }
-
- @Override
- public Integer getAggregate() {
- if (kind != MEAN) {
- return aggregate.get();
- } else {
- return getMean().getAggregate();
- }
- }
-
- @Override
- @Nullable
- public CounterMean<Integer> getMean() {
- if (kind != MEAN) {
- throw illegalArgumentException();
- }
- return mean.get();
- }
-
- @Override
- public Counter<Integer> merge(Counter<Integer> that) {
- checkArgument(this.isCompatibleWith(that), "Counters %s and %s are incompatible", this, that);
- switch (kind) {
- case SUM:
- case MIN:
- case MAX:
- return addValue(that.getAggregate());
- case MEAN:
- CounterMean<Integer> thisCounterMean = this.getMean();
- CounterMean<Integer> thatCounterMean = that.getMean();
- return resetMeanToValue(
- thisCounterMean.getCount() + thatCounterMean.getCount(),
- thisCounterMean.getAggregate() + thatCounterMean.getAggregate());
- default:
- throw illegalArgumentException();
- }
- }
-
- private static class IntegerCounterMean implements CounterMean<Integer> {
- private final int aggregate;
- private final long count;
-
- public IntegerCounterMean(int aggregate, long count) {
- this.aggregate = aggregate;
- this.count = count;
- }
-
- @Override
- public Integer getAggregate() {
- return aggregate;
- }
-
- @Override
- public long getCount() {
- return count;
- }
-
- @Override
- public String toString() {
- return aggregate + "/" + count;
- }
- }
- }
-
- //////////////////////////////////////////////////////////////////////////////
-
- /**
- * Constructs an {@link IllegalArgumentException} explaining that this
- * {@link Counter}'s aggregation kind is not supported by its value type.
- */
- protected IllegalArgumentException illegalArgumentException() {
- return new IllegalArgumentException("Cannot compute " + kind
- + " aggregation over " + getType().getSimpleName() + " values.");
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/CounterProvider.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/CounterProvider.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/CounterProvider.java
deleted file mode 100644
index ba53f80..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/CounterProvider.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util.common;
-
-/**
- * A counter provider can provide {@link Counter} instances.
- *
- * @param <T> the input type of the counter.
- */
-public interface CounterProvider<T> {
- Counter<T> getCounter(String name);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/CounterSet.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/CounterSet.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/CounterSet.java
deleted file mode 100644
index 9e9638f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/CounterSet.java
+++ /dev/null
@@ -1,177 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.util.common;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-import java.util.AbstractSet;
-import java.util.HashMap;
-import java.util.Iterator;
-
-/**
- * A CounterSet maintains a set of {@link Counter}s.
- *
- * <p>Thread-safe.
- */
-public class CounterSet extends AbstractSet<Counter<?>> {
-
- /** Registered counters. */
- private final HashMap<String, Counter<?>> counters = new HashMap<>();
-
- private final AddCounterMutator addCounterMutator = new AddCounterMutator();
-
- /**
- * Constructs a CounterSet containing the given Counters.
- */
- public CounterSet(Counter<?>... counters) {
- for (Counter<?> counter : counters) {
- addNewCounter(counter);
- }
- }
-
- /**
- * Returns an object that supports adding additional counters into
- * this CounterSet.
- */
- public AddCounterMutator getAddCounterMutator() {
- return addCounterMutator;
- }
-
- /**
- * Adds a new counter, throwing an exception if a counter of the
- * same name already exists.
- */
- public void addNewCounter(Counter<?> counter) {
- if (!addCounter(counter)) {
- throw new IllegalArgumentException(
- "Counter " + counter + " duplicates an existing counter in " + this);
- }
- }
-
- /**
- * Adds the given Counter to this CounterSet.
- *
- * <p>If a counter with the same name already exists, it will be
- * reused, as long as it is compatible.
- *
- * @return the Counter that was reused, or added
- * @throws IllegalArgumentException if a counter with the same
- * name but an incompatible kind had already been added
- */
- public synchronized <T> Counter<T> addOrReuseCounter(Counter<T> counter) {
- Counter<?> oldCounter = counters.get(counter.getName());
- if (oldCounter == null) {
- // A new counter.
- counters.put(counter.getName(), counter);
- return counter;
- }
- if (counter.isCompatibleWith(oldCounter)) {
- // Return the counter to reuse.
- @SuppressWarnings("unchecked")
- Counter<T> compatibleCounter = (Counter<T>) oldCounter;
- return compatibleCounter;
- }
- throw new IllegalArgumentException(
- "Counter " + counter + " duplicates incompatible counter "
- + oldCounter + " in " + this);
- }
-
- /**
- * Adds a counter. Returns {@code true} if the counter was added to the set
- * and false if the given counter was {@code null} or it already existed in
- * the set.
- *
- * @param counter to register
- */
- public boolean addCounter(Counter<?> counter) {
- return add(counter);
- }
-
- /**
- * Returns the Counter with the given name in this CounterSet;
- * returns null if no such Counter exists.
- */
- public synchronized Counter<?> getExistingCounter(String name) {
- return counters.get(name);
- }
-
- @Override
- public synchronized Iterator<Counter<?>> iterator() {
- return counters.values().iterator();
- }
-
- @Override
- public synchronized int size() {
- return counters.size();
- }
-
- @Override
- public synchronized boolean add(Counter<?> e) {
- if (null == e) {
- return false;
- }
- if (counters.containsKey(e.getName())) {
- return false;
- }
- counters.put(e.getName(), e);
- return true;
- }
-
- public synchronized void merge(CounterSet that) {
- for (Counter<?> theirCounter : that) {
- Counter<?> myCounter = counters.get(theirCounter.getName());
- if (myCounter != null) {
- mergeCounters(myCounter, theirCounter);
- } else {
- addCounter(theirCounter);
- }
- }
- }
-
- private <T> void mergeCounters(Counter<T> mine, Counter<?> theirCounter) {
- checkArgument(
- mine.isCompatibleWith(theirCounter),
- "Can't merge CounterSets containing incompatible counters with the same name: "
- + "%s (existing) and %s (merged)",
- mine,
- theirCounter);
- @SuppressWarnings("unchecked")
- Counter<T> theirs = (Counter<T>) theirCounter;
- mine.merge(theirs);
- }
-
- /**
- * A nested class that supports adding additional counters into the
- * enclosing CounterSet. This is useful as a mutator, hiding other
- * public methods of the CounterSet.
- */
- public class AddCounterMutator {
- /**
- * Adds the given Counter into the enclosing CounterSet.
- *
- * <p>If a counter with the same name already exists, it will be
- * reused, as long as it has the same type.
- *
- * @return the Counter that was reused, or added
- * @throws IllegalArgumentException if a counter with the same
- * name but an incompatible kind had already been added
- */
- public <T> Counter<T> addCounter(Counter<T> counter) {
- return addOrReuseCounter(counter);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ElementByteSizeObservable.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ElementByteSizeObservable.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ElementByteSizeObservable.java
deleted file mode 100644
index fee6737..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ElementByteSizeObservable.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util.common;
-
-/**
- * An interface for things that allow observing the size in bytes of
- * encoded values of type {@code T}.
- *
- * @param <T> the type of the values being observed
- */
-public interface ElementByteSizeObservable<T> {
- /**
- * Returns whether {@link #registerByteSizeObserver} is cheap enough
- * to call for every element, that is, if this
- * {@code ElementByteSizeObservable} can calculate the byte size of
- * the element to be coded in roughly constant time (or lazily).
- */
- public boolean isRegisterByteSizeObserverCheap(T value);
-
- /**
- * Notifies the {@code ElementByteSizeObserver} about the byte size
- * of the encoded value using this {@code ElementByteSizeObservable}.
- */
- public void registerByteSizeObserver(T value,
- ElementByteSizeObserver observer)
- throws Exception;
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ElementByteSizeObservableIterable.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ElementByteSizeObservableIterable.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ElementByteSizeObservableIterable.java
deleted file mode 100644
index 591d2be..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ElementByteSizeObservableIterable.java
+++ /dev/null
@@ -1,63 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.util.common;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Observer;
-
-/**
- * An abstract class used for iterables that notify observers about size in
- * bytes of their elements, as they are being iterated over.
- *
- * @param <V> the type of elements returned by this iterable
- * @param <InputT> type type of iterator returned by this iterable
- */
-public abstract class ElementByteSizeObservableIterable<
- V, InputT extends ElementByteSizeObservableIterator<V>>
- implements Iterable<V> {
- private List<Observer> observers = new ArrayList<>();
-
- /**
- * Derived classes override this method to return an iterator for this
- * iterable.
- */
- protected abstract InputT createIterator();
-
- /**
- * Sets the observer, which will observe the iterator returned in
- * the next call to iterator() method. Future calls to iterator()
- * won't be observed, unless an observer is set again.
- */
- public void addObserver(Observer observer) {
- observers.add(observer);
- }
-
- /**
- * Returns a new iterator for this iterable. If an observer was set in
- * a previous call to setObserver(), it will observe the iterator returned.
- */
- @Override
- public InputT iterator() {
- InputT iterator = createIterator();
- for (Observer observer : observers) {
- iterator.addObserver(observer);
- }
- observers.clear();
- return iterator;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ElementByteSizeObservableIterator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ElementByteSizeObservableIterator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ElementByteSizeObservableIterator.java
deleted file mode 100644
index c094900..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ElementByteSizeObservableIterator.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.util.common;
-
-import java.util.Iterator;
-import java.util.Observable;
-
-/**
- * An abstract class used for iterators that notify observers about size in
- * bytes of their elements, as they are being iterated over. The subclasses
- * need to implement the standard Iterator interface and call method
- * notifyValueReturned() for each element read and/or iterated over.
- *
- * @param <V> value type
- */
-public abstract class ElementByteSizeObservableIterator<V>
- extends Observable implements Iterator<V> {
- protected final void notifyValueReturned(long byteSize) {
- setChanged();
- notifyObservers(byteSize);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ElementByteSizeObserver.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ElementByteSizeObserver.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ElementByteSizeObserver.java
deleted file mode 100644
index 6c764d9..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ElementByteSizeObserver.java
+++ /dev/null
@@ -1,92 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.util.common;
-
-import java.util.Observable;
-import java.util.Observer;
-
-/**
- * An observer that gets notified when additional bytes are read
- * and/or used. It adds all bytes into a local counter. When the
- * observer gets advanced via the next() call, it adds the total byte
- * count to the specified counter, and prepares for the next element.
- */
-public class ElementByteSizeObserver implements Observer {
- private final Counter<Long> counter;
- private boolean isLazy = false;
- private long totalSize = 0;
- private double scalingFactor = 1.0;
-
- public ElementByteSizeObserver(Counter<Long> counter) {
- this.counter = counter;
- }
-
- /**
- * Sets byte counting for the current element as lazy. That is, the
- * observer will get notified of the element's byte count only as
- * element's pieces are being processed or iterated over.
- */
- public void setLazy() {
- isLazy = true;
- }
-
- /**
- * Returns whether byte counting for the current element is lazy, that is,
- * whether the observer gets notified of the element's byte count only as
- * element's pieces are being processed or iterated over.
- */
- public boolean getIsLazy() {
- return isLazy;
- }
-
- /**
- * Updates the observer with a context specified, but without an instance of
- * the Observable.
- */
- public void update(Object obj) {
- update(null, obj);
- }
-
- /**
- * Sets a multiplier to use on observed sizes.
- */
- public void setScalingFactor(double scalingFactor) {
- this.scalingFactor = scalingFactor;
- }
-
- @Override
- public void update(Observable obs, Object obj) {
- if (obj instanceof Long) {
- totalSize += scalingFactor * (Long) obj;
- } else if (obj instanceof Integer) {
- totalSize += scalingFactor * (Integer) obj;
- } else {
- throw new AssertionError("unexpected parameter object");
- }
- }
-
- /**
- * Advances the observer to the next element. Adds the current total byte
- * size to the counter, and prepares the observer for the next element.
- */
- public void advance() {
- counter.addValue(totalSize);
-
- totalSize = 0;
- isLazy = false;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/PeekingReiterator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/PeekingReiterator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/PeekingReiterator.java
deleted file mode 100644
index 0948747..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/PeekingReiterator.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.util.common;
-
-import static com.google.common.base.Preconditions.checkNotNull;
-import static com.google.common.base.Preconditions.checkState;
-
-import java.util.NoSuchElementException;
-
-/**
- * A {@link Reiterator} that supports one-element lookahead during iteration.
- *
- * @param <T> the type of elements returned by this iterator
- */
-public final class PeekingReiterator<T> implements Reiterator<T> {
- private T nextElement;
- private boolean nextElementComputed;
- private final Reiterator<T> iterator;
-
- public PeekingReiterator(Reiterator<T> iterator) {
- this.iterator = checkNotNull(iterator);
- }
-
- PeekingReiterator(PeekingReiterator<T> it) {
- this.iterator = checkNotNull(checkNotNull(it).iterator.copy());
- this.nextElement = it.nextElement;
- this.nextElementComputed = it.nextElementComputed;
- }
-
- @Override
- public boolean hasNext() {
- computeNext();
- return nextElementComputed;
- }
-
- @Override
- public T next() {
- T result = peek();
- nextElementComputed = false;
- return result;
- }
-
- /**
- * {@inheritDoc}
- *
- * <p>If {@link #peek} is called, {@code remove} is disallowed until
- * {@link #next} has been subsequently called.
- */
- @Override
- public void remove() {
- checkState(!nextElementComputed,
- "After peek(), remove() is disallowed until next() is called");
- iterator.remove();
- }
-
- @Override
- public PeekingReiterator<T> copy() {
- return new PeekingReiterator<>(this);
- }
-
- /**
- * Returns the element that would be returned by {@link #next}, without
- * actually consuming the element.
- * @throws NoSuchElementException if there is no next element
- */
- public T peek() {
- computeNext();
- if (!nextElementComputed) {
- throw new NoSuchElementException();
- }
- return nextElement;
- }
-
- private void computeNext() {
- if (nextElementComputed) {
- return;
- }
- if (!iterator.hasNext()) {
- return;
- }
- nextElement = iterator.next();
- nextElementComputed = true;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ReflectHelpers.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ReflectHelpers.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ReflectHelpers.java
deleted file mode 100644
index f87242f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/ReflectHelpers.java
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.common;
-
-import static java.util.Arrays.asList;
-
-import com.google.common.base.Function;
-import com.google.common.base.Joiner;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.FluentIterable;
-import com.google.common.collect.ImmutableSet;
-import com.google.common.collect.Queues;
-
-import java.lang.reflect.GenericArrayType;
-import java.lang.reflect.Method;
-import java.lang.reflect.ParameterizedType;
-import java.lang.reflect.Type;
-import java.lang.reflect.TypeVariable;
-import java.lang.reflect.WildcardType;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.LinkedHashSet;
-import java.util.Queue;
-
-import javax.annotation.Nullable;
-
-/**
- * Utilities for working with with {@link Class Classes} and {@link Method Methods}.
- */
-public class ReflectHelpers {
-
- private static final Joiner COMMA_SEPARATOR = Joiner.on(", ");
-
- /** A {@link Function} that turns a method into a simple method signature. */
- public static final Function<Method, String> METHOD_FORMATTER = new Function<Method, String>() {
- @Override
- public String apply(Method input) {
- String parameterTypes = FluentIterable.from(asList(input.getParameterTypes()))
- .transform(CLASS_SIMPLE_NAME)
- .join(COMMA_SEPARATOR);
- return String.format("%s(%s)",
- input.getName(),
- parameterTypes);
- }
- };
-
- /** A {@link Function} that turns a method into the declaring class + method signature. */
- public static final Function<Method, String> CLASS_AND_METHOD_FORMATTER =
- new Function<Method, String>() {
- @Override
- public String apply(Method input) {
- return String.format("%s#%s",
- CLASS_NAME.apply(input.getDeclaringClass()),
- METHOD_FORMATTER.apply(input));
- }
- };
-
- /** A {@link Function} with returns the classes name. */
- public static final Function<Class<?>, String> CLASS_NAME =
- new Function<Class<?>, String>() {
- @Override
- public String apply(Class<?> input) {
- return input.getName();
- }
- };
-
- /** A {@link Function} with returns the classes name. */
- public static final Function<Class<?>, String> CLASS_SIMPLE_NAME =
- new Function<Class<?>, String>() {
- @Override
- public String apply(Class<?> input) {
- return input.getSimpleName();
- }
- };
-
- /** A {@link Function} that formats types. */
- public static final Function<Type, String> TYPE_SIMPLE_DESCRIPTION =
- new Function<Type, String>() {
- @Override
- @Nullable
- public String apply(@Nullable Type input) {
- StringBuilder builder = new StringBuilder();
- format(builder, input);
- return builder.toString();
- }
-
- private void format(StringBuilder builder, Type t) {
- if (t instanceof Class) {
- formatClass(builder, (Class<?>) t);
- } else if (t instanceof TypeVariable) {
- formatTypeVariable(builder, (TypeVariable<?>) t);
- } else if (t instanceof WildcardType) {
- formatWildcardType(builder, (WildcardType) t);
- } else if (t instanceof ParameterizedType) {
- formatParameterizedType(builder, (ParameterizedType) t);
- } else if (t instanceof GenericArrayType) {
- formatGenericArrayType(builder, (GenericArrayType) t);
- } else {
- builder.append(t.toString());
- }
- }
-
- private void formatClass(StringBuilder builder, Class<?> clazz) {
- builder.append(clazz.getSimpleName());
- }
-
- private void formatTypeVariable(StringBuilder builder, TypeVariable<?> t) {
- builder.append(t.getName());
- }
-
- private void formatWildcardType(StringBuilder builder, WildcardType t) {
- builder.append("?");
- for (Type lowerBound : t.getLowerBounds()) {
- builder.append(" super ");
- format(builder, lowerBound);
- }
- for (Type upperBound : t.getUpperBounds()) {
- if (!Object.class.equals(upperBound)) {
- builder.append(" extends ");
- format(builder, upperBound);
- }
- }
- }
-
- private void formatParameterizedType(StringBuilder builder, ParameterizedType t) {
- format(builder, t.getRawType());
- builder.append('<');
- COMMA_SEPARATOR.appendTo(builder,
- FluentIterable.from(asList(t.getActualTypeArguments()))
- .transform(TYPE_SIMPLE_DESCRIPTION));
- builder.append('>');
- }
-
- private void formatGenericArrayType(StringBuilder builder, GenericArrayType t) {
- format(builder, t.getGenericComponentType());
- builder.append("[]");
- }
- };
-
- /**
- * Returns all interfaces of the given clazz.
- * @param clazz
- * @return
- */
- public static FluentIterable<Class<?>> getClosureOfInterfaces(Class<?> clazz) {
- Preconditions.checkNotNull(clazz);
- Queue<Class<?>> interfacesToProcess = Queues.newArrayDeque();
- Collections.addAll(interfacesToProcess, clazz.getInterfaces());
-
- LinkedHashSet<Class<?>> interfaces = new LinkedHashSet<>();
- while (!interfacesToProcess.isEmpty()) {
- Class<?> current = interfacesToProcess.remove();
- if (interfaces.add(current)) {
- Collections.addAll(interfacesToProcess, current.getInterfaces());
- }
- }
- return FluentIterable.from(interfaces);
- }
-
- /**
- * Returns all the methods visible from the provided interfaces.
- *
- * @param interfaces The interfaces to use when searching for all their methods.
- * @return An iterable of {@link Method}s which interfaces expose.
- */
- public static Iterable<Method> getClosureOfMethodsOnInterfaces(
- Iterable<? extends Class<?>> interfaces) {
- return FluentIterable.from(interfaces).transformAndConcat(
- new Function<Class<?>, Iterable<Method>>() {
- @Override
- public Iterable<Method> apply(Class<?> input) {
- return getClosureOfMethodsOnInterface(input);
- }
- });
- }
-
- /**
- * Returns all the methods visible from {@code iface}.
- *
- * @param iface The interface to use when searching for all its methods.
- * @return An iterable of {@link Method}s which {@code iface} exposes.
- */
- public static Iterable<Method> getClosureOfMethodsOnInterface(Class<?> iface) {
- Preconditions.checkNotNull(iface);
- Preconditions.checkArgument(iface.isInterface());
- ImmutableSet.Builder<Method> builder = ImmutableSet.builder();
- Queue<Class<?>> interfacesToProcess = Queues.newArrayDeque();
- interfacesToProcess.add(iface);
- while (!interfacesToProcess.isEmpty()) {
- Class<?> current = interfacesToProcess.remove();
- builder.add(current.getMethods());
- interfacesToProcess.addAll(Arrays.asList(current.getInterfaces()));
- }
- return builder.build();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/Reiterable.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/Reiterable.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/Reiterable.java
deleted file mode 100644
index 01c5775..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/Reiterable.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.util.common;
-
-/**
- * An {@link Iterable} that returns {@link Reiterator} iterators.
- *
- * @param <T> the type of elements returned by the iterator
- */
-public interface Reiterable<T> extends Iterable<T> {
- @Override
- public Reiterator<T> iterator();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/Reiterator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/Reiterator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/Reiterator.java
deleted file mode 100644
index dd8036d..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/Reiterator.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.util.common;
-
-import java.util.Iterator;
-
-/**
- * An {@link Iterator} with the ability to copy its iteration state.
- *
- * @param <T> the type of elements returned by this iterator
- */
-public interface Reiterator<T> extends Iterator<T> {
- /**
- * Returns a copy of the current {@link Reiterator}. The copy's iteration
- * state is logically independent of the current iterator; each may be
- * advanced without affecting the other.
- *
- * <p>The returned {@code Reiterator} is not guaranteed to return
- * referentially identical iteration results as the original
- * {@link Reiterator}, although {@link Object#equals} will typically return
- * true for the corresponding elements of each if the original source is
- * logically immutable.
- */
- public Reiterator<T> copy();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/package-info.java
deleted file mode 100644
index 7fb16c5..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/package-info.java
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/** Defines utilities shared by multiple PipelineRunner implementations. **/
-package com.google.cloud.dataflow.sdk.util.common;
[32/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/PubsubIOTranslator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/PubsubIOTranslator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/PubsubIOTranslator.java
deleted file mode 100644
index 8b066ab..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/PubsubIOTranslator.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners.dataflow;
-
-import com.google.cloud.dataflow.sdk.io.PubsubIO;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator.TransformTranslator;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator.TranslationContext;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-
-/**
- * Pubsub transform support code for the Dataflow backend.
- */
-public class PubsubIOTranslator {
-
- /**
- * Implements PubsubIO Read translation for the Dataflow backend.
- */
- public static class ReadTranslator<T> implements TransformTranslator<PubsubIO.Read.Bound<T>> {
- @Override
- @SuppressWarnings({"rawtypes", "unchecked"})
- public void translate(
- PubsubIO.Read.Bound transform,
- TranslationContext context) {
- translateReadHelper(transform, context);
- }
-
- private <T> void translateReadHelper(
- PubsubIO.Read.Bound<T> transform,
- TranslationContext context) {
- if (!context.getPipelineOptions().isStreaming()) {
- throw new IllegalArgumentException(
- "PubsubIO.Read can only be used with the Dataflow streaming runner.");
- }
-
- context.addStep(transform, "ParallelRead");
- context.addInput(PropertyNames.FORMAT, "pubsub");
- if (transform.getTopic() != null) {
- context.addInput(PropertyNames.PUBSUB_TOPIC, transform.getTopic().asV1Beta1Path());
- }
- if (transform.getSubscription() != null) {
- context.addInput(
- PropertyNames.PUBSUB_SUBSCRIPTION, transform.getSubscription().asV1Beta1Path());
- }
- if (transform.getTimestampLabel() != null) {
- context.addInput(PropertyNames.PUBSUB_TIMESTAMP_LABEL, transform.getTimestampLabel());
- }
- if (transform.getIdLabel() != null) {
- context.addInput(PropertyNames.PUBSUB_ID_LABEL, transform.getIdLabel());
- }
- context.addValueOnlyOutput(PropertyNames.OUTPUT, context.getOutput(transform));
- }
- }
-
- /**
- * Implements PubsubIO Write translation for the Dataflow backend.
- */
- public static class WriteTranslator<T>
- implements TransformTranslator<DataflowPipelineRunner.StreamingPubsubIOWrite<T>> {
-
- @Override
- @SuppressWarnings({"rawtypes", "unchecked"})
- public void translate(
- DataflowPipelineRunner.StreamingPubsubIOWrite transform,
- TranslationContext context) {
- translateWriteHelper(transform, context);
- }
-
- private <T> void translateWriteHelper(
- DataflowPipelineRunner.StreamingPubsubIOWrite<T> customTransform,
- TranslationContext context) {
- if (!context.getPipelineOptions().isStreaming()) {
- throw new IllegalArgumentException(
- "PubsubIO.Write is non-primitive for the Dataflow batch runner.");
- }
-
- PubsubIO.Write.Bound<T> transform = customTransform.getOverriddenTransform();
-
- context.addStep(customTransform, "ParallelWrite");
- context.addInput(PropertyNames.FORMAT, "pubsub");
- context.addInput(PropertyNames.PUBSUB_TOPIC, transform.getTopic().asV1Beta1Path());
- if (transform.getTimestampLabel() != null) {
- context.addInput(PropertyNames.PUBSUB_TIMESTAMP_LABEL, transform.getTimestampLabel());
- }
- if (transform.getIdLabel() != null) {
- context.addInput(PropertyNames.PUBSUB_ID_LABEL, transform.getIdLabel());
- }
- context.addEncodingInput(WindowedValue.getValueOnlyCoder(transform.getCoder()));
- context.addInput(PropertyNames.PARALLEL_INPUT, context.getInput(customTransform));
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/ReadTranslator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/ReadTranslator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/ReadTranslator.java
deleted file mode 100644
index f110e84..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/ReadTranslator.java
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners.dataflow;
-
-import static com.google.cloud.dataflow.sdk.util.Structs.addBoolean;
-import static com.google.cloud.dataflow.sdk.util.Structs.addDictionary;
-import static com.google.cloud.dataflow.sdk.util.Structs.addLong;
-
-import com.google.api.services.dataflow.model.SourceMetadata;
-import com.google.cloud.dataflow.sdk.io.FileBasedSource;
-import com.google.cloud.dataflow.sdk.io.Read;
-import com.google.cloud.dataflow.sdk.io.Source;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator.TransformTranslator;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator.TranslationContext;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.values.PValue;
-
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * Translator for the {@code Read} {@code PTransform} for the Dataflow back-end.
- */
-public class ReadTranslator implements TransformTranslator<Read.Bounded<?>> {
- @Override
- public void translate(Read.Bounded<?> transform, TranslationContext context) {
- translateReadHelper(transform.getSource(), transform, context);
- }
-
- public static <T> void translateReadHelper(Source<T> source,
- PTransform<?, ? extends PValue> transform,
- DataflowPipelineTranslator.TranslationContext context) {
- try {
- // TODO: Move this validation out of translation once IOChannelUtils is portable
- // and can be reconstructed on the worker.
- if (source instanceof FileBasedSource) {
- String filePatternOrSpec = ((FileBasedSource<?>) source).getFileOrPatternSpec();
- context.getPipelineOptions()
- .getPathValidator()
- .validateInputFilePatternSupported(filePatternOrSpec);
- }
-
- context.addStep(transform, "ParallelRead");
- context.addInput(PropertyNames.FORMAT, PropertyNames.CUSTOM_SOURCE_FORMAT);
- context.addInput(
- PropertyNames.SOURCE_STEP_INPUT,
- cloudSourceToDictionary(
- CustomSources.serializeToCloudSource(source, context.getPipelineOptions())));
- context.addValueOnlyOutput(PropertyNames.OUTPUT, context.getOutput(transform));
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- }
-
- // Represents a cloud Source as a dictionary for encoding inside the {@code SOURCE_STEP_INPUT}
- // property of CloudWorkflowStep.input.
- private static Map<String, Object> cloudSourceToDictionary(
- com.google.api.services.dataflow.model.Source source) {
- // Do not translate encoding - the source's encoding is translated elsewhere
- // to the step's output info.
- Map<String, Object> res = new HashMap<>();
- addDictionary(res, PropertyNames.SOURCE_SPEC, source.getSpec());
- if (source.getMetadata() != null) {
- addDictionary(res, PropertyNames.SOURCE_METADATA,
- cloudSourceMetadataToDictionary(source.getMetadata()));
- }
- if (source.getDoesNotNeedSplitting() != null) {
- addBoolean(
- res, PropertyNames.SOURCE_DOES_NOT_NEED_SPLITTING, source.getDoesNotNeedSplitting());
- }
- return res;
- }
-
- private static Map<String, Object> cloudSourceMetadataToDictionary(SourceMetadata metadata) {
- Map<String, Object> res = new HashMap<>();
- if (metadata.getProducesSortedKeys() != null) {
- addBoolean(res, PropertyNames.SOURCE_PRODUCES_SORTED_KEYS, metadata.getProducesSortedKeys());
- }
- if (metadata.getEstimatedSizeBytes() != null) {
- addLong(res, PropertyNames.SOURCE_ESTIMATED_SIZE_BYTES, metadata.getEstimatedSizeBytes());
- }
- if (metadata.getInfinite() != null) {
- addBoolean(res, PropertyNames.SOURCE_IS_INFINITE, metadata.getInfinite());
- }
- return res;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/package-info.java
deleted file mode 100644
index b6b2ce6..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/package-info.java
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/**
- * Implementation of the {@link com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner}.
- */
-package com.google.cloud.dataflow.sdk.runners.dataflow;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/BoundedReadEvaluatorFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/BoundedReadEvaluatorFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/BoundedReadEvaluatorFactory.java
deleted file mode 100644
index eaea3ed..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/BoundedReadEvaluatorFactory.java
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.io.BoundedSource;
-import com.google.cloud.dataflow.sdk.io.BoundedSource.BoundedReader;
-import com.google.cloud.dataflow.sdk.io.Read.Bounded;
-import com.google.cloud.dataflow.sdk.io.Source.Reader;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.UncommittedBundle;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import java.io.IOException;
-import java.util.Queue;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ConcurrentLinkedQueue;
-import java.util.concurrent.ConcurrentMap;
-
-import javax.annotation.Nullable;
-
-/**
- * A {@link TransformEvaluatorFactory} that produces {@link TransformEvaluator TransformEvaluators}
- * for the {@link Bounded Read.Bounded} primitive {@link PTransform}.
- */
-final class BoundedReadEvaluatorFactory implements TransformEvaluatorFactory {
- /*
- * An evaluator for a Source is stateful, to ensure data is not read multiple times.
- * Evaluators are cached here to ensure that the reader is not restarted if the evaluator is
- * retriggered.
- */
- private final ConcurrentMap<EvaluatorKey, Queue<? extends BoundedReadEvaluator<?>>>
- sourceEvaluators = new ConcurrentHashMap<>();
-
- @SuppressWarnings({"unchecked", "rawtypes"})
- @Override
- public <InputT> TransformEvaluator<InputT> forApplication(
- AppliedPTransform<?, ?, ?> application,
- @Nullable CommittedBundle<?> inputBundle,
- InProcessEvaluationContext evaluationContext)
- throws IOException {
- return getTransformEvaluator((AppliedPTransform) application, evaluationContext);
- }
-
- private <OutputT> TransformEvaluator<?> getTransformEvaluator(
- final AppliedPTransform<?, PCollection<OutputT>, Bounded<OutputT>> transform,
- final InProcessEvaluationContext evaluationContext)
- throws IOException {
- BoundedReadEvaluator<?> evaluator =
- getTransformEvaluatorQueue(transform, evaluationContext).poll();
- if (evaluator == null) {
- return EmptyTransformEvaluator.create(transform);
- }
- return evaluator;
- }
-
- /**
- * Get the queue of {@link TransformEvaluator TransformEvaluators} that produce elements for the
- * provided application of {@link Bounded Read.Bounded}, initializing it if required.
- *
- * <p>This method is thread-safe, and will only produce new evaluators if no other invocation has
- * already done so.
- */
- @SuppressWarnings("unchecked")
- private <OutputT> Queue<BoundedReadEvaluator<OutputT>> getTransformEvaluatorQueue(
- final AppliedPTransform<?, PCollection<OutputT>, Bounded<OutputT>> transform,
- final InProcessEvaluationContext evaluationContext) {
- // Key by the application and the context the evaluation is occurring in (which call to
- // Pipeline#run).
- EvaluatorKey key = new EvaluatorKey(transform, evaluationContext);
- Queue<BoundedReadEvaluator<OutputT>> evaluatorQueue =
- (Queue<BoundedReadEvaluator<OutputT>>) sourceEvaluators.get(key);
- if (evaluatorQueue == null) {
- evaluatorQueue = new ConcurrentLinkedQueue<>();
- if (sourceEvaluators.putIfAbsent(key, evaluatorQueue) == null) {
- // If no queue existed in the evaluators, add an evaluator to initialize the evaluator
- // factory for this transform
- BoundedReadEvaluator<OutputT> evaluator =
- new BoundedReadEvaluator<OutputT>(transform, evaluationContext);
- evaluatorQueue.offer(evaluator);
- } else {
- // otherwise return the existing Queue that arrived before us
- evaluatorQueue = (Queue<BoundedReadEvaluator<OutputT>>) sourceEvaluators.get(key);
- }
- }
- return evaluatorQueue;
- }
-
- /**
- * A {@link BoundedReadEvaluator} produces elements from an underlying {@link BoundedSource},
- * discarding all input elements. Within the call to {@link #finishBundle()}, the evaluator
- * creates the {@link BoundedReader} and consumes all available input.
- *
- * <p>A {@link BoundedReadEvaluator} should only be created once per {@link BoundedSource}, and
- * each evaluator should only be called once per evaluation of the pipeline. Otherwise, the source
- * may produce duplicate elements.
- */
- private static class BoundedReadEvaluator<OutputT> implements TransformEvaluator<Object> {
- private final AppliedPTransform<?, PCollection<OutputT>, Bounded<OutputT>> transform;
- private final InProcessEvaluationContext evaluationContext;
- private boolean contentsRemaining;
-
- public BoundedReadEvaluator(
- AppliedPTransform<?, PCollection<OutputT>, Bounded<OutputT>> transform,
- InProcessEvaluationContext evaluationContext) {
- this.transform = transform;
- this.evaluationContext = evaluationContext;
- }
-
- @Override
- public void processElement(WindowedValue<Object> element) {}
-
- @Override
- public InProcessTransformResult finishBundle() throws IOException {
- try (final Reader<OutputT> reader =
- transform
- .getTransform()
- .getSource()
- .createReader(evaluationContext.getPipelineOptions());) {
- contentsRemaining = reader.start();
- UncommittedBundle<OutputT> output =
- evaluationContext.createRootBundle(transform.getOutput());
- while (contentsRemaining) {
- output.add(
- WindowedValue.timestampedValueInGlobalWindow(
- reader.getCurrent(), reader.getCurrentTimestamp()));
- contentsRemaining = reader.advance();
- }
- reader.close();
- return StepTransformResult.withHold(transform, BoundedWindow.TIMESTAMP_MAX_VALUE)
- .addOutput(output)
- .build();
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/CachedThreadPoolExecutorServiceFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/CachedThreadPoolExecutorServiceFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/CachedThreadPoolExecutorServiceFactory.java
deleted file mode 100644
index 3350d2b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/CachedThreadPoolExecutorServiceFactory.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-
-/**
- * A {@link ExecutorServiceFactory} that produces cached thread pools via
- * {@link Executors#newCachedThreadPool()}.
- */
-class CachedThreadPoolExecutorServiceFactory
- implements DefaultValueFactory<ExecutorServiceFactory>, ExecutorServiceFactory {
- private static final CachedThreadPoolExecutorServiceFactory INSTANCE =
- new CachedThreadPoolExecutorServiceFactory();
-
- @Override
- public ExecutorServiceFactory create(PipelineOptions options) {
- return INSTANCE;
- }
-
- @Override
- public ExecutorService create() {
- return Executors.newCachedThreadPool();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/Clock.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/Clock.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/Clock.java
deleted file mode 100644
index 11e6ec1..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/Clock.java
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import org.joda.time.Instant;
-
-/**
- * Access to the current time.
- */
-public interface Clock {
- /**
- * Returns the current time as an {@link Instant}.
- */
- Instant now();
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/CompletionCallback.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/CompletionCallback.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/CompletionCallback.java
deleted file mode 100644
index 2792631..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/CompletionCallback.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
-
-/**
- * A callback for completing a bundle of input.
- */
-interface CompletionCallback {
- /**
- * Handle a successful result.
- */
- void handleResult(CommittedBundle<?> inputBundle, InProcessTransformResult result);
-
- /**
- * Handle a result that terminated abnormally due to the provided {@link Throwable}.
- */
- void handleThrowable(CommittedBundle<?> inputBundle, Throwable t);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ConsumerTrackingPipelineVisitor.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ConsumerTrackingPipelineVisitor.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ConsumerTrackingPipelineVisitor.java
deleted file mode 100644
index c602b23..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ConsumerTrackingPipelineVisitor.java
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import static com.google.common.base.Preconditions.checkState;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.Pipeline.PipelineVisitor;
-import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
-import com.google.cloud.dataflow.sdk.runners.TransformTreeNode;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.PValue;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-
-/**
- * Tracks the {@link AppliedPTransform AppliedPTransforms} that consume each {@link PValue} in the
- * {@link Pipeline}. This is used to schedule consuming {@link PTransform PTransforms} to consume
- * input after the upstream transform has produced and committed output.
- */
-public class ConsumerTrackingPipelineVisitor implements PipelineVisitor {
- private Map<PValue, Collection<AppliedPTransform<?, ?, ?>>> valueToConsumers = new HashMap<>();
- private Collection<AppliedPTransform<?, ?, ?>> rootTransforms = new ArrayList<>();
- private Collection<PCollectionView<?>> views = new ArrayList<>();
- private Map<AppliedPTransform<?, ?, ?>, String> stepNames = new HashMap<>();
- private Set<PValue> toFinalize = new HashSet<>();
- private int numTransforms = 0;
- private boolean finalized = false;
-
- @Override
- public void enterCompositeTransform(TransformTreeNode node) {
- checkState(
- !finalized,
- "Attempting to traverse a pipeline (node %s) with a %s "
- + "which has already visited a Pipeline and is finalized",
- node.getFullName(),
- ConsumerTrackingPipelineVisitor.class.getSimpleName());
- }
-
- @Override
- public void leaveCompositeTransform(TransformTreeNode node) {
- checkState(
- !finalized,
- "Attempting to traverse a pipeline (node %s) with a %s which is already finalized",
- node.getFullName(),
- ConsumerTrackingPipelineVisitor.class.getSimpleName());
- if (node.isRootNode()) {
- finalized = true;
- }
- }
-
- @Override
- public void visitTransform(TransformTreeNode node) {
- toFinalize.removeAll(node.getInput().expand());
- AppliedPTransform<?, ?, ?> appliedTransform = getAppliedTransform(node);
- if (node.getInput().expand().isEmpty()) {
- rootTransforms.add(appliedTransform);
- } else {
- for (PValue value : node.getInput().expand()) {
- valueToConsumers.get(value).add(appliedTransform);
- stepNames.put(appliedTransform, genStepName());
- }
- }
- }
-
- private AppliedPTransform<?, ?, ?> getAppliedTransform(TransformTreeNode node) {
- @SuppressWarnings({"rawtypes", "unchecked"})
- AppliedPTransform<?, ?, ?> application = AppliedPTransform.of(
- node.getFullName(), node.getInput(), node.getOutput(), (PTransform) node.getTransform());
- return application;
- }
-
- @Override
- public void visitValue(PValue value, TransformTreeNode producer) {
- toFinalize.add(value);
- for (PValue expandedValue : value.expand()) {
- valueToConsumers.put(expandedValue, new ArrayList<AppliedPTransform<?, ?, ?>>());
- if (expandedValue instanceof PCollectionView) {
- views.add((PCollectionView<?>) expandedValue);
- }
- expandedValue.recordAsOutput(getAppliedTransform(producer));
- }
- value.recordAsOutput(getAppliedTransform(producer));
- }
-
- private String genStepName() {
- return String.format("s%s", numTransforms++);
- }
-
-
- /**
- * Returns a mapping of each fully-expanded {@link PValue} to each
- * {@link AppliedPTransform} that consumes it. For each AppliedPTransform in the collection
- * returned from {@code getValueToCustomers().get(PValue)},
- * {@code AppliedPTransform#getInput().expand()} will contain the argument {@link PValue}.
- */
- public Map<PValue, Collection<AppliedPTransform<?, ?, ?>>> getValueToConsumers() {
- checkState(
- finalized,
- "Can't call getValueToConsumers before the Pipeline has been completely traversed");
-
- return valueToConsumers;
- }
-
- /**
- * Returns the mapping for each {@link AppliedPTransform} in the {@link Pipeline} to a unique step
- * name.
- */
- public Map<AppliedPTransform<?, ?, ?>, String> getStepNames() {
- checkState(
- finalized, "Can't call getStepNames before the Pipeline has been completely traversed");
-
- return stepNames;
- }
-
- /**
- * Returns the root transforms of the {@link Pipeline}. A root {@link AppliedPTransform} consumes
- * a {@link PInput} where the {@link PInput#expand()} returns an empty collection.
- */
- public Collection<AppliedPTransform<?, ?, ?>> getRootTransforms() {
- checkState(
- finalized,
- "Can't call getRootTransforms before the Pipeline has been completely traversed");
-
- return rootTransforms;
- }
-
- /**
- * Returns all of the {@link PCollectionView PCollectionViews} contained in the visited
- * {@link Pipeline}.
- */
- public Collection<PCollectionView<?>> getViews() {
- checkState(finalized, "Can't call getViews before the Pipeline has been completely traversed");
-
- return views;
- }
-
- /**
- * Returns all of the {@link PValue PValues} that have been produced but not consumed. These
- * {@link PValue PValues} should be finalized by the {@link PipelineRunner} before the
- * {@link Pipeline} is executed.
- */
- public Set<PValue> getUnfinalizedPValues() {
- checkState(
- finalized,
- "Can't call getUnfinalizedPValues before the Pipeline has been completely traversed");
-
- return toFinalize;
- }
-}
-
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/EmptyTransformEvaluator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/EmptyTransformEvaluator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/EmptyTransformEvaluator.java
deleted file mode 100644
index fc09237..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/EmptyTransformEvaluator.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-
-/**
- * A {@link TransformEvaluator} that ignores all input and produces no output. The result of
- * invoking {@link #finishBundle()} on this evaluator is to return an
- * {@link InProcessTransformResult} with no elements and a timestamp hold equal to
- * {@link BoundedWindow#TIMESTAMP_MIN_VALUE}. Because the result contains no elements, this hold
- * will not affect the watermark.
- */
-final class EmptyTransformEvaluator<T> implements TransformEvaluator<T> {
- public static <T> TransformEvaluator<T> create(AppliedPTransform<?, ?, ?> transform) {
- return new EmptyTransformEvaluator<T>(transform);
- }
-
- private final AppliedPTransform<?, ?, ?> transform;
-
- private EmptyTransformEvaluator(AppliedPTransform<?, ?, ?> transform) {
- this.transform = transform;
- }
-
- @Override
- public void processElement(WindowedValue<T> element) throws Exception {}
-
- @Override
- public InProcessTransformResult finishBundle() throws Exception {
- return StepTransformResult.withHold(transform, BoundedWindow.TIMESTAMP_MIN_VALUE)
- .build();
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/EvaluatorKey.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/EvaluatorKey.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/EvaluatorKey.java
deleted file mode 100644
index 307bc5c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/EvaluatorKey.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-
-import java.util.Objects;
-
-/**
- * A (Transform, Pipeline Execution) key for stateful evaluators.
- *
- * <p>Source evaluators are stateful to ensure data is not read multiple times. Evaluators are
- * cached to ensure that the reader is not restarted if the evaluator is retriggered. An
- * {@link EvaluatorKey} is used to ensure that multiple Pipelines can be executed without sharing
- * the same evaluators.
- */
-final class EvaluatorKey {
-  private final AppliedPTransform<?, ?, ?> transform;
-  private final InProcessEvaluationContext context;
-
-  public EvaluatorKey(AppliedPTransform<?, ?, ?> transform, InProcessEvaluationContext context) {
-    this.transform = transform;
-    this.context = context;
-  }
-
-  /** Two keys are equal when both their transform and their context are equal. */
-  @Override
-  public boolean equals(Object other) {
-    // instanceof evaluates to false for null, so null needs no separate check.
-    if (!(other instanceof EvaluatorKey)) {
-      return false;
-    }
-    EvaluatorKey that = (EvaluatorKey) other;
-    if (!Objects.equals(transform, that.transform)) {
-      return false;
-    }
-    return Objects.equals(context, that.context);
-  }
-
-  @Override
-  public int hashCode() {
-    return Objects.hash(transform, context);
-  }
-}
-
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ExecutorServiceFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ExecutorServiceFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ExecutorServiceFactory.java
deleted file mode 100644
index 480bcde..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ExecutorServiceFactory.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import java.util.concurrent.ExecutorService;
-
-/**
- * A factory that creates {@link ExecutorService ExecutorServices}.
- *
- * <p>{@link ExecutorService ExecutorServices} created by this factory should be independent of one
- * another (e.g., if any executor is shut down the remaining executors should continue to process
- * work).
- */
-public interface ExecutorServiceFactory {
- /**
- * Create a new {@link ExecutorService}.
- *
- * @return the newly created {@link ExecutorService}
- */
- ExecutorService create();
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ExecutorServiceParallelExecutor.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ExecutorServiceParallelExecutor.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ExecutorServiceParallelExecutor.java
deleted file mode 100644
index 68a1b8c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ExecutorServiceParallelExecutor.java
+++ /dev/null
@@ -1,432 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InMemoryWatermarkManager.FiredTimers;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.util.KeyedWorkItem;
-import com.google.cloud.dataflow.sdk.util.KeyedWorkItems;
-import com.google.cloud.dataflow.sdk.util.TimeDomain;
-import com.google.cloud.dataflow.sdk.util.TimerInternals.TimerData;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PValue;
-import com.google.common.base.MoreObjects;
-import com.google.common.base.Optional;
-import com.google.common.collect.ImmutableList;
-
-import org.joda.time.Instant;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Map;
-import java.util.Queue;
-import java.util.Set;
-import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.BlockingQueue;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ConcurrentLinkedQueue;
-import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.ExecutorService;
-
-import javax.annotation.Nullable;
-
-/**
- * An {@link InProcessExecutor} that uses an underlying {@link ExecutorService} and
- * {@link InProcessEvaluationContext} to execute a {@link Pipeline}.
- */
-final class ExecutorServiceParallelExecutor implements InProcessExecutor {
- private static final Logger LOG = LoggerFactory.getLogger(ExecutorServiceParallelExecutor.class);
-
- private final ExecutorService executorService;
-
- private final Map<PValue, Collection<AppliedPTransform<?, ?, ?>>> valueToConsumers;
- private final Set<PValue> keyedPValues;
- private final TransformEvaluatorRegistry registry;
- private final InProcessEvaluationContext evaluationContext;
-
- private final ConcurrentMap<StepAndKey, TransformExecutorService> currentEvaluations;
- private final ConcurrentMap<TransformExecutor<?>, Boolean> scheduledExecutors;
-
- private final Queue<ExecutorUpdate> allUpdates;
- private final BlockingQueue<VisibleExecutorUpdate> visibleUpdates;
-
- private final TransformExecutorService parallelExecutorService;
- private final CompletionCallback defaultCompletionCallback;
-
- private Collection<AppliedPTransform<?, ?, ?>> rootNodes;
-
- /**
-  * Static factory for {@link ExecutorServiceParallelExecutor}; all arguments are handed to the
-  * private constructor unchanged.
-  */
- public static ExecutorServiceParallelExecutor create(
-     ExecutorService executorService,
-     Map<PValue, Collection<AppliedPTransform<?, ?, ?>>> valueToConsumers,
-     Set<PValue> keyedPValues,
-     TransformEvaluatorRegistry registry,
-     InProcessEvaluationContext context) {
-   ExecutorServiceParallelExecutor executor =
-       new ExecutorServiceParallelExecutor(
-           executorService, valueToConsumers, keyedPValues, registry, context);
-   return executor;
- }
-
- /**
-  * Stores the collaborators and creates the internal bookkeeping structures: the evaluation and
-  * scheduled-executor maps, the update queues, the shared parallel executor service and the
-  * default completion callback.
-  */
- private ExecutorServiceParallelExecutor(
-     ExecutorService executorService,
-     Map<PValue, Collection<AppliedPTransform<?, ?, ?>>> valueToConsumers,
-     Set<PValue> keyedPValues,
-     TransformEvaluatorRegistry registry,
-     InProcessEvaluationContext context) {
-   this.executorService = executorService;
-   this.valueToConsumers = valueToConsumers;
-   this.keyedPValues = keyedPValues;
-   this.registry = registry;
-   this.evaluationContext = context;
-
-   this.allUpdates = new ConcurrentLinkedQueue<>();
-   // Bounded queue: at most 20 user-visible updates are buffered at a time.
-   this.visibleUpdates = new ArrayBlockingQueue<>(20);
-
-   this.currentEvaluations = new ConcurrentHashMap<>();
-   this.scheduledExecutors = new ConcurrentHashMap<>();
-
-   // Created after scheduledExecutors, which is passed in here.
-   this.parallelExecutorService =
-       TransformExecutorServices.parallel(executorService, this.scheduledExecutors);
-   this.defaultCompletionCallback = new DefaultCompletionCallback();
- }
-
- /**
-  * Records an immutable copy of {@code roots} and submits a new {@link MonitorRunnable} to the
-  * executor service.
-  */
- @Override
- public void start(Collection<AppliedPTransform<?, ?, ?>> roots) {
-   rootNodes = ImmutableList.copyOf(roots);
-   executorService.submit(new MonitorRunnable());
- }
-
- /**
- * Schedules {@code consumer} to be evaluated on {@code bundle}, reporting the outcome to
- * {@code onComplete}. Delegates directly to {@code evaluateBundle}.
- *
- * @param consumer the transform to evaluate
- * @param bundle the input bundle, or {@code null} when there is no input bundle
- * @param onComplete the callback passed on to the created executor
- */
- @SuppressWarnings("unchecked")
- public void scheduleConsumption(
- AppliedPTransform<?, ?, ?> consumer,
- @Nullable CommittedBundle<?> bundle,
- CompletionCallback onComplete) {
- evaluateBundle(consumer, bundle, onComplete);
- }
-
- /**
-  * Creates a {@link TransformExecutor} for {@code transform} over {@code bundle} and schedules it.
-  *
-  * <p>A non-null bundle whose PCollection is keyed is scheduled on the serial executor service
-  * for its (step, key) pair; every other case uses the shared parallel executor service.
-  *
-  * @param transform the transform to evaluate
-  * @param bundle the input bundle, or {@code null} when there is no input bundle
-  * @param onComplete the callback handed to the created {@link TransformExecutor}
-  */
- private <T> void evaluateBundle(
-     final AppliedPTransform<?, ?, ?> transform,
-     @Nullable final CommittedBundle<T> bundle,
-     final CompletionCallback onComplete) {
-   TransformExecutorService transformExecutor;
-   if (bundle != null && isKeyed(bundle.getPCollection())) {
-     // bundle is non-null in this branch, so the key is read without a second null check.
-     final StepAndKey stepAndKey = StepAndKey.of(transform, bundle.getKey());
-     transformExecutor = getSerialExecutorService(stepAndKey);
-   } else {
-     transformExecutor = parallelExecutorService;
-   }
-   TransformExecutor<T> callable =
-       TransformExecutor.create(
-           registry, evaluationContext, bundle, transform, onComplete, transformExecutor);
-   transformExecutor.schedule(callable);
- }
-
- /** Returns true if {@code pvalue} is a member of the {@code keyedPValues} set. */
- private boolean isKeyed(PValue pvalue) {
- return keyedPValues.contains(pvalue);
- }
-
- /**
-  * Schedules every transform registered as a consumer of the bundle's PCollection, using the
-  * default completion callback.
-  */
- private void scheduleConsumers(CommittedBundle<?> bundle) {
-   Collection<AppliedPTransform<?, ?, ?>> consumers =
-       valueToConsumers.get(bundle.getPCollection());
-   for (AppliedPTransform<?, ?, ?> consumer : consumers) {
-     scheduleConsumption(consumer, bundle, defaultCompletionCallback);
-   }
- }
-
- /**
-  * Returns the serial executor service for {@code stepAndKey}, creating and registering one if
-  * none exists yet.
-  *
-  * <p>The return value of {@code putIfAbsent} decides which instance is handed back, so
-  * concurrent callers for the same key all receive the single registered service without extra
-  * map lookups.
-  */
- private TransformExecutorService getSerialExecutorService(StepAndKey stepAndKey) {
-   TransformExecutorService existing = currentEvaluations.get(stepAndKey);
-   if (existing != null) {
-     return existing;
-   }
-   TransformExecutorService created =
-       TransformExecutorServices.serial(executorService, scheduledExecutors);
-   // putIfAbsent returns the previously mapped value, or null if ours was installed.
-   TransformExecutorService raced = currentEvaluations.putIfAbsent(stepAndKey, created);
-   return raced == null ? created : raced;
- }
-
- /**
-  * Blocks, consuming visible updates, until one reports completion; then shuts down the executor
-  * service.
-  *
-  * @throws Throwable the throwable carried by the first update that has one
-  */
- @Override
- public void awaitCompletion() throws Throwable {
-   while (true) {
-     VisibleExecutorUpdate update = visibleUpdates.take();
-     if (update.throwable.isPresent()) {
-       throw update.throwable.get();
-     }
-     if (update.isDone()) {
-       break;
-     }
-   }
-   executorService.shutdown();
- }
-
- /**
-  * The default {@link CompletionCallback}, used to complete transform evaluations that are
-  * triggered due to the arrival of elements from an upstream transform, or for a source
-  * transform. Results are reported to the evaluation context with an empty list of timers.
-  */
- private class DefaultCompletionCallback implements CompletionCallback {
-   @Override
-   public void handleResult(CommittedBundle<?> inputBundle, InProcessTransformResult result) {
-     Iterable<TimerData> noTimers = Collections.<TimerData>emptyList();
-     Iterable<? extends CommittedBundle<?>> produced =
-         evaluationContext.handleResult(inputBundle, noTimers, result);
-     // Each output bundle becomes an internal update for the monitor to pick up.
-     for (CommittedBundle<?> outputBundle : produced) {
-       allUpdates.offer(ExecutorUpdate.fromBundle(outputBundle));
-     }
-   }
-
-   @Override
-   public void handleThrowable(CommittedBundle<?> inputBundle, Throwable t) {
-     ExecutorUpdate failure = ExecutorUpdate.fromThrowable(t);
-     allUpdates.offer(failure);
-   }
- }
-
- /**
-  * A {@link CompletionCallback} for bundles that were produced to deliver a collection of
-  * {@link TimerData timers}. On successful completion, the timers used to create the input are
-  * reported to the {@link InProcessEvaluationContext evaluation context} as part of the result.
-  */
- private class TimerCompletionCallback implements CompletionCallback {
-   private final Iterable<TimerData> timers;
-
-   private TimerCompletionCallback(Iterable<TimerData> timers) {
-     this.timers = timers;
-   }
-
-   @Override
-   public void handleResult(CommittedBundle<?> inputBundle, InProcessTransformResult result) {
-     Iterable<? extends CommittedBundle<?>> produced =
-         evaluationContext.handleResult(inputBundle, timers, result);
-     // Each output bundle becomes an internal update for the monitor to pick up.
-     for (CommittedBundle<?> outputBundle : produced) {
-       allUpdates.offer(ExecutorUpdate.fromBundle(outputBundle));
-     }
-   }
-
-   @Override
-   public void handleThrowable(CommittedBundle<?> inputBundle, Throwable t) {
-     ExecutorUpdate failure = ExecutorUpdate.fromThrowable(t);
-     allUpdates.offer(failure);
-   }
- }
-
- /**
-  * An internal status update on the state of the executor, holding an optional produced bundle
-  * and an optional throwable.
-  *
-  * <p>Used to signal when the executor should be shut down (due to an exception).
-  */
- private static class ExecutorUpdate {
-   private final Optional<? extends CommittedBundle<?>> bundle;
-   private final Optional<? extends Throwable> throwable;
-
-   private ExecutorUpdate(CommittedBundle<?> producedBundle, Throwable failure) {
-     this.bundle = Optional.fromNullable(producedBundle);
-     this.throwable = Optional.fromNullable(failure);
-   }
-
-   /** Creates an update that carries {@code bundle} and no throwable. */
-   public static ExecutorUpdate fromBundle(CommittedBundle<?> bundle) {
-     return new ExecutorUpdate(bundle, null);
-   }
-
-   /** Creates an update that carries {@code t} and no bundle. */
-   public static ExecutorUpdate fromThrowable(Throwable t) {
-     return new ExecutorUpdate(null, t);
-   }
-
-   public Optional<? extends CommittedBundle<?>> getBundle() {
-     return this.bundle;
-   }
-
-   public Optional<? extends Throwable> getException() {
-     return this.throwable;
-   }
-
-   @Override
-   public String toString() {
-     MoreObjects.ToStringHelper helper = MoreObjects.toStringHelper(ExecutorUpdate.class);
-     helper.add("bundle", this.bundle);
-     helper.add("exception", this.throwable);
-     return helper.toString();
-   }
- }
-
- /**
-  * An update of interest to the user. Used in {@link #awaitCompletion} to decide whether to
-  * return normally or throw an exception.
-  */
- private static class VisibleExecutorUpdate {
-   private final Optional<? extends Throwable> throwable;
-   private final boolean done;
-
-   private VisibleExecutorUpdate(boolean done, @Nullable Throwable exception) {
-     this.done = done;
-     this.throwable = Optional.fromNullable(exception);
-   }
-
-   /** Creates a not-done update that carries no throwable's absence: it wraps {@code e}. */
-   public static VisibleExecutorUpdate fromThrowable(Throwable e) {
-     return new VisibleExecutorUpdate(false, e);
-   }
-
-   /** Creates a done update with no throwable. */
-   public static VisibleExecutorUpdate finished() {
-     return new VisibleExecutorUpdate(true, null);
-   }
-
-   public boolean isDone() {
-     return this.done;
-   }
- }
-
- private class MonitorRunnable implements Runnable {
- private final String runnableName =
- String.format(
- "%s$%s-monitor",
- evaluationContext.getPipelineOptions().getAppName(),
- ExecutorServiceParallelExecutor.class.getSimpleName());
-
- /**
- * Performs one monitoring pass: handles at most one pending {@link ExecutorUpdate}, fires
- * timers, adds root work if needed, and finally resubmits itself unless the pipeline is done.
- * The thread is renamed for the duration of the pass and its old name is restored at the end.
- */
- @Override
- public void run() {
- String oldName = Thread.currentThread().getName();
- Thread.currentThread().setName(runnableName);
- try {
- // Only a single update is consumed per pass; poll() returns null when none is queued.
- ExecutorUpdate update = allUpdates.poll();
- if (update != null) {
- LOG.debug("Executor Update: {}", update);
- if (update.getBundle().isPresent()) {
- scheduleConsumers(update.getBundle().get());
- } else if (update.getException().isPresent()) {
- // NOTE(review): this offer is non-blocking and its result is ignored, so the update is
- // dropped if visibleUpdates is full, unlike the evict-and-retry loops in the catch blocks.
- visibleUpdates.offer(VisibleExecutorUpdate.fromThrowable(update.getException().get()));
- }
- }
- boolean timersFired = fireTimers();
- addWorkIfNecessary(timersFired);
- } catch (InterruptedException e) {
- // Restore the interrupt flag before reporting the failure.
- Thread.currentThread().interrupt();
- LOG.error("Monitor died due to being interrupted");
- // Evict the oldest visible update until there is room for this one.
- while (!visibleUpdates.offer(VisibleExecutorUpdate.fromThrowable(e))) {
- visibleUpdates.poll();
- }
- } catch (Throwable t) {
- LOG.error("Monitor thread died due to throwable", t);
- // Evict the oldest visible update until there is room for this one.
- while (!visibleUpdates.offer(VisibleExecutorUpdate.fromThrowable(t))) {
- visibleUpdates.poll();
- }
- } finally {
- // Runs after both normal and exceptional passes; shouldShutdown() also publishes the
- // finished update and shuts the executor service down when the pipeline is done.
- if (!shouldShutdown()) {
- // The monitor thread should always be scheduled; but we only need to be scheduled once
- executorService.submit(this);
- }
- Thread.currentThread().setName(oldName);
- }
- }
-
- /**
-  * Delivers every timer the evaluation context reports as fired. For each transform, key and
-  * time domain with a non-empty set of timers, a keyed bundle holding a single timers work item
-  * is committed and scheduled with a {@link TimerCompletionCallback}.
-  *
-  * @return true if at least one timer was fired
-  * @throws Exception any failure during delivery, after it has been logged
-  */
- private boolean fireTimers() throws Exception {
-   try {
-     boolean anyDelivered = false;
-     for (Map.Entry<AppliedPTransform<?, ?, ?>, Map<Object, FiredTimers>> perTransform :
-         evaluationContext.extractFiredTimers().entrySet()) {
-       AppliedPTransform<?, ?, ?> transform = perTransform.getKey();
-       for (Map.Entry<Object, FiredTimers> perKey : perTransform.getValue().entrySet()) {
-         Object key = perKey.getKey();
-         FiredTimers fired = perKey.getValue();
-         for (TimeDomain domain : TimeDomain.values()) {
-           Collection<TimerData> delivery = fired.getTimers(domain);
-           if (!delivery.isEmpty()) {
-             KeyedWorkItem<Object, Object> work = KeyedWorkItems.timersWorkItem(key, delivery);
-             @SuppressWarnings({"unchecked", "rawtypes"})
-             CommittedBundle<?> bundle =
-                 InProcessBundle.<KeyedWorkItem<Object, Object>>keyed(
-                         (PCollection) transform.getInput(), key)
-                     .add(WindowedValue.valueInEmptyWindows(work))
-                     .commit(Instant.now());
-             scheduleConsumption(transform, bundle, new TimerCompletionCallback(delivery));
-             anyDelivered = true;
-           }
-         }
-       }
-     }
-     return anyDelivered;
-   } catch (Exception e) {
-     LOG.error("Internal Error while delivering timers", e);
-     throw e;
-   }
- }
-
- /**
-  * Checks whether the pipeline is done and, if so, publishes a finished update (evicting older
-  * visible updates if the queue is full) and shuts down the executor service.
-  *
-  * @return true if the pipeline is done and shutdown was initiated, false otherwise
-  */
- private boolean shouldShutdown() {
-   if (evaluationContext.isDone()) {
-     // The message takes no arguments, so it must not contain a "{}" placeholder.
-     LOG.debug("Pipeline is finished. Shutting down.");
-     while (!visibleUpdates.offer(VisibleExecutorUpdate.finished())) {
-       visibleUpdates.poll();
-     }
-     executorService.shutdown();
-     return true;
-   }
-   return false;
- }
-
- /**
-  * If all active {@link TransformExecutor TransformExecutors} are in a blocked state, add more
-  * work from root nodes that may have additional work. This ensures that if a pipeline has
-  * elements available from the root nodes it will add those elements when necessary.
-  *
-  * @param firedTimers whether timers fired during this pass; if so, nothing is added
-  */
- private void addWorkIfNecessary(boolean firedTimers) {
-   // If any timers have fired, they will add more work; we don't need to add more.
-   if (firedTimers) {
-     return;
-   }
-   boolean everyExecutorBlocked = true;
-   for (TransformExecutor<?> executor : scheduledExecutors.keySet()) {
-     if (!isExecutorBlocked(executor)) {
-       // At least one executor can proceed without additional work.
-       everyExecutorBlocked = false;
-       break;
-     }
-   }
-   if (!everyExecutorBlocked) {
-     return;
-   }
-   // All current TransformExecutors are blocked; add more work from the roots.
-   for (AppliedPTransform<?, ?, ?> root : rootNodes) {
-     if (evaluationContext.isDone(root)) {
-       continue;
-     }
-     scheduleConsumption(root, null, defaultCompletionCallback);
-   }
- }
-
- /**
- * Return true if the provided executor's thread is waiting ({@code WAITING} or
- * {@code TIMED_WAITING}), i.e. it may not make more progress unless more work is added.
- * Returns false if the executor has no thread or the thread is in any other non-terminated
- * state.
- *
- * <p>May return false even if all executor threads are currently blocked or cleaning up, as
- * these can cause more work to be scheduled. If this does not occur, after these calls
- * terminate, future calls will return true if all executors are waiting.
- *
- * @throws IllegalStateException if the executor's thread is {@code TERMINATED}
- */
- private boolean isExecutorBlocked(TransformExecutor<?> executor) {
- Thread thread = executor.getThread();
- if (thread == null) {
- return false;
- }
- switch (thread.getState()) {
- case TERMINATED:
- throw new IllegalStateException(String.format(
- "Unexpectedly encountered a Terminated TransformExecutor %s", executor));
- case WAITING:
- case TIMED_WAITING:
- // The thread is waiting for some external input. Adding more work may cause the thread
- // to stop waiting (e.g. the thread is waiting on an unbounded side input)
- return true;
- case BLOCKED:
- // The executor is blocked on acquisition of a java monitor. This usually means it is
- // making a call to the EvaluationContext, but not a model-blocking call - and will
- // eventually complete, at which point we may reevaluate.
- // Deliberate fall-through to the default case.
- default:
- // NEW and RUNNABLE threads can make progress
- return false;
- }
- }
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/FlattenEvaluatorFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/FlattenEvaluatorFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/FlattenEvaluatorFactory.java
deleted file mode 100644
index ce315be..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/FlattenEvaluatorFactory.java
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.UncommittedBundle;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.Flatten;
-import com.google.cloud.dataflow.sdk.transforms.Flatten.FlattenPCollectionList;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionList;
-
-/**
- * The {@link TransformEvaluatorFactory} that the {@link InProcessPipelineRunner} uses for the
- * {@link Flatten} {@link PTransform}.
- */
-class FlattenEvaluatorFactory implements TransformEvaluatorFactory {
-  @Override
-  public <InputT> TransformEvaluator<InputT> forApplication(
-      AppliedPTransform<?, ?, ?> application,
-      CommittedBundle<?> inputBundle,
-      InProcessEvaluationContext evaluationContext) {
-    @SuppressWarnings({"cast", "unchecked", "rawtypes"})
-    TransformEvaluator<InputT> evaluator = (TransformEvaluator<InputT>) createInMemoryEvaluator(
-        (AppliedPTransform) application, inputBundle, evaluationContext);
-    return evaluator;
-  }
-
-  /**
-   * Builds the evaluator for one input bundle. With a bundle, an output bundle is created from
-   * it and registered as an output of the result; without one, the result has no outputs.
-   */
-  private <InputT> TransformEvaluator<InputT> createInMemoryEvaluator(
-      final AppliedPTransform<
-              PCollectionList<InputT>, PCollection<InputT>, FlattenPCollectionList<InputT>>
-          application,
-      final CommittedBundle<InputT> inputBundle,
-      final InProcessEvaluationContext evaluationContext) {
-    final UncommittedBundle<InputT> outputBundle;
-    final InProcessTransformResult result;
-    if (inputBundle == null) {
-      // it is impossible to call processElement on a flatten with no input bundle. A Flatten
-      // with no input bundle occurs as an output of
-      // Flatten.pcollections(PCollectionList.empty())
-      outputBundle = null;
-      result = StepTransformResult.withoutHold(application).build();
-    } else {
-      outputBundle = evaluationContext.createBundle(inputBundle, application.getOutput());
-      result = StepTransformResult.withoutHold(application).addOutput(outputBundle).build();
-    }
-    return new FlattenEvaluator<>(outputBundle, result);
-  }
-
-  /** Copies every processed element into the output bundle and returns a prebuilt result. */
-  private static class FlattenEvaluator<InputT> implements TransformEvaluator<InputT> {
-    private final UncommittedBundle<InputT> outputBundle;
-    private final InProcessTransformResult result;
-
-    public FlattenEvaluator(
-        UncommittedBundle<InputT> outputBundle, InProcessTransformResult result) {
-      this.result = result;
-      this.outputBundle = outputBundle;
-    }
-
-    @Override
-    public void processElement(WindowedValue<InputT> element) {
-      this.outputBundle.add(element);
-    }
-
-    @Override
-    public InProcessTransformResult finishBundle() {
-      return this.result;
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ForwardingPTransform.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ForwardingPTransform.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ForwardingPTransform.java
deleted file mode 100644
index b736e35..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ForwardingPTransform.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.POutput;
-import com.google.cloud.dataflow.sdk.values.TypedPValue;
-
-/**
- * A base class for implementing {@link PTransform} overrides, which behave identically to the
- * delegate transform but with overridden methods. Implementors are required to implement
- * {@link #delegate()}, which returns the object to forward calls to, and {@link #apply(PInput)}.
- */
-public abstract class ForwardingPTransform<InputT extends PInput, OutputT extends POutput>
-    extends PTransform<InputT, OutputT> {
-  /** Returns the transform that calls are forwarded to. */
-  protected abstract PTransform<InputT, OutputT> delegate();
-
-  /** Forwards to the delegate's {@code apply}. */
-  @Override
-  public OutputT apply(InputT input) {
-    PTransform<InputT, OutputT> target = delegate();
-    return target.apply(input);
-  }
-
-  /** Forwards to the delegate's {@code validate}. */
-  @Override
-  public void validate(InputT input) {
-    PTransform<InputT, OutputT> target = delegate();
-    target.validate(input);
-  }
-
-  /** Forwards to the delegate's {@code getName}. */
-  @Override
-  public String getName() {
-    PTransform<InputT, OutputT> target = delegate();
-    return target.getName();
-  }
-
-  /** Forwards to the delegate's {@code getDefaultOutputCoder}. */
-  @Override
-  public <T> Coder<T> getDefaultOutputCoder(
-      InputT input, @SuppressWarnings("unused") TypedPValue<T> output)
-      throws CannotProvideCoderException {
-    PTransform<InputT, OutputT> target = delegate();
-    return target.getDefaultOutputCoder(input, output);
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/GroupByKeyEvaluatorFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/GroupByKeyEvaluatorFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/GroupByKeyEvaluatorFactory.java
deleted file mode 100644
index 3ec4af1..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/GroupByKeyEvaluatorFactory.java
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import static com.google.cloud.dataflow.sdk.util.CoderUtils.encodeToByteArray;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.IterableCoder;
-import com.google.cloud.dataflow.sdk.coders.KvCoder;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.UncommittedBundle;
-import com.google.cloud.dataflow.sdk.runners.inprocess.StepTransformResult.Builder;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey.ReifyTimestampsAndWindows;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.GroupAlsoByWindowViaWindowSetDoFn;
-import com.google.cloud.dataflow.sdk.util.KeyedWorkItem;
-import com.google.cloud.dataflow.sdk.util.KeyedWorkItemCoder;
-import com.google.cloud.dataflow.sdk.util.KeyedWorkItems;
-import com.google.cloud.dataflow.sdk.util.SystemReduceFn;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.common.annotations.VisibleForTesting;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-/**
- * The {@link InProcessPipelineRunner} {@link TransformEvaluatorFactory} for the {@link GroupByKey}
- * {@link PTransform}.
- */
-class GroupByKeyEvaluatorFactory implements TransformEvaluatorFactory {
- @Override
- public <InputT> TransformEvaluator<InputT> forApplication(
- AppliedPTransform<?, ?, ?> application,
- CommittedBundle<?> inputBundle,
- InProcessEvaluationContext evaluationContext) {
- @SuppressWarnings({"cast", "unchecked", "rawtypes"})
- TransformEvaluator<InputT> evaluator = createEvaluator(
- (AppliedPTransform) application, (CommittedBundle) inputBundle, evaluationContext);
- return evaluator;
- }
-
- private <K, V> TransformEvaluator<KV<K, WindowedValue<V>>> createEvaluator(
- final AppliedPTransform<
- PCollection<KV<K, WindowedValue<V>>>, PCollection<KeyedWorkItem<K, V>>,
- InProcessGroupByKeyOnly<K, V>>
- application,
- final CommittedBundle<KV<K, V>> inputBundle,
- final InProcessEvaluationContext evaluationContext) {
- return new GroupByKeyEvaluator<K, V>(evaluationContext, inputBundle, application);
- }
-
- private static class GroupByKeyEvaluator<K, V>
- implements TransformEvaluator<KV<K, WindowedValue<V>>> {
- private final InProcessEvaluationContext evaluationContext;
-
- private final CommittedBundle<KV<K, V>> inputBundle;
- private final AppliedPTransform<
- PCollection<KV<K, WindowedValue<V>>>, PCollection<KeyedWorkItem<K, V>>,
- InProcessGroupByKeyOnly<K, V>>
- application;
- private final Coder<K> keyCoder;
- private Map<GroupingKey<K>, List<WindowedValue<V>>> groupingMap;
-
- public GroupByKeyEvaluator(
- InProcessEvaluationContext evaluationContext,
- CommittedBundle<KV<K, V>> inputBundle,
- AppliedPTransform<
- PCollection<KV<K, WindowedValue<V>>>, PCollection<KeyedWorkItem<K, V>>,
- InProcessGroupByKeyOnly<K, V>>
- application) {
- this.evaluationContext = evaluationContext;
- this.inputBundle = inputBundle;
- this.application = application;
-
- PCollection<KV<K, WindowedValue<V>>> input = application.getInput();
- keyCoder = getKeyCoder(input.getCoder());
- groupingMap = new HashMap<>();
- }
-
- private Coder<K> getKeyCoder(Coder<KV<K, WindowedValue<V>>> coder) {
- if (!(coder instanceof KvCoder)) {
- throw new IllegalStateException();
- }
- @SuppressWarnings("unchecked")
- Coder<K> keyCoder = ((KvCoder<K, WindowedValue<V>>) coder).getKeyCoder();
- return keyCoder;
- }
-
- @Override
- public void processElement(WindowedValue<KV<K, WindowedValue<V>>> element) {
- KV<K, WindowedValue<V>> kv = element.getValue();
- K key = kv.getKey();
- byte[] encodedKey;
- try {
- encodedKey = encodeToByteArray(keyCoder, key);
- } catch (CoderException exn) {
- // TODO: Put in better element printing:
- // truncate if too long.
- throw new IllegalArgumentException(
- String.format("unable to encode key %s of input to %s using %s", key, this, keyCoder),
- exn);
- }
- GroupingKey<K> groupingKey = new GroupingKey<>(key, encodedKey);
- List<WindowedValue<V>> values = groupingMap.get(groupingKey);
- if (values == null) {
- values = new ArrayList<WindowedValue<V>>();
- groupingMap.put(groupingKey, values);
- }
- values.add(kv.getValue());
- }
-
- @Override
- public InProcessTransformResult finishBundle() {
- Builder resultBuilder = StepTransformResult.withoutHold(application);
- for (Map.Entry<GroupingKey<K>, List<WindowedValue<V>>> groupedEntry :
- groupingMap.entrySet()) {
- K key = groupedEntry.getKey().key;
- KeyedWorkItem<K, V> groupedKv =
- KeyedWorkItems.elementsWorkItem(key, groupedEntry.getValue());
- UncommittedBundle<KeyedWorkItem<K, V>> bundle =
- evaluationContext.createKeyedBundle(inputBundle, key, application.getOutput());
- bundle.add(WindowedValue.valueInEmptyWindows(groupedKv));
- resultBuilder.addOutput(bundle);
- }
- return resultBuilder.build();
- }
-
- private static class GroupingKey<K> {
- private K key;
- private byte[] encodedKey;
-
- public GroupingKey(K key, byte[] encodedKey) {
- this.key = key;
- this.encodedKey = encodedKey;
- }
-
- @Override
- public boolean equals(Object o) {
- if (o instanceof GroupingKey) {
- GroupingKey<?> that = (GroupingKey<?>) o;
- return Arrays.equals(this.encodedKey, that.encodedKey);
- } else {
- return false;
- }
- }
-
- @Override
- public int hashCode() {
- return Arrays.hashCode(encodedKey);
- }
- }
- }
-
- /**
- * An in-memory implementation of the {@link GroupByKey} primitive as a composite
- * {@link PTransform}.
- */
- public static final class InProcessGroupByKey<K, V>
- extends ForwardingPTransform<PCollection<KV<K, V>>, PCollection<KV<K, Iterable<V>>>> {
- private final GroupByKey<K, V> original;
-
- private InProcessGroupByKey(GroupByKey<K, V> from) {
- this.original = from;
- }
-
- @Override
- public PTransform<PCollection<KV<K, V>>, PCollection<KV<K, Iterable<V>>>> delegate() {
- return original;
- }
-
- @Override
- public PCollection<KV<K, Iterable<V>>> apply(PCollection<KV<K, V>> input) {
- KvCoder<K, V> inputCoder = (KvCoder<K, V>) input.getCoder();
-
- // This operation groups by the combination of key and window,
- // merging windows as needed, using the windows assigned to the
- // key/value input elements and the window merge operation of the
- // window function associated with the input PCollection.
- WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
-
- // Use the default GroupAlsoByWindow implementation
- DoFn<KeyedWorkItem<K, V>, KV<K, Iterable<V>>> groupAlsoByWindow =
- groupAlsoByWindow(windowingStrategy, inputCoder.getValueCoder());
-
- // By default, implement GroupByKey via a series of lower-level operations.
- return input
- // Make each input element's timestamp and assigned windows
- // explicit, in the value part.
- .apply(new ReifyTimestampsAndWindows<K, V>())
-
- .apply(new InProcessGroupByKeyOnly<K, V>())
- .setCoder(KeyedWorkItemCoder.of(inputCoder.getKeyCoder(),
- inputCoder.getValueCoder(), input.getWindowingStrategy().getWindowFn().windowCoder()))
-
- // Group each key's values by window, merging windows as needed.
- .apply("GroupAlsoByWindow", ParDo.of(groupAlsoByWindow))
-
- // And update the windowing strategy as appropriate.
- .setWindowingStrategyInternal(original.updateWindowingStrategy(windowingStrategy))
- .setCoder(
- KvCoder.of(inputCoder.getKeyCoder(), IterableCoder.of(inputCoder.getValueCoder())));
- }
-
- private <W extends BoundedWindow>
- DoFn<KeyedWorkItem<K, V>, KV<K, Iterable<V>>> groupAlsoByWindow(
- final WindowingStrategy<?, W> windowingStrategy, final Coder<V> inputCoder) {
- return GroupAlsoByWindowViaWindowSetDoFn.create(
- windowingStrategy, SystemReduceFn.<K, V, W>buffering(inputCoder));
- }
- }
-
- /**
- * An implementation primitive to use in the evaluation of a {@link GroupByKey}
- * {@link PTransform}.
- */
- public static final class InProcessGroupByKeyOnly<K, V>
- extends PTransform<PCollection<KV<K, WindowedValue<V>>>, PCollection<KeyedWorkItem<K, V>>> {
- @Override
- public PCollection<KeyedWorkItem<K, V>> apply(PCollection<KV<K, WindowedValue<V>>> input) {
- return PCollection.<KeyedWorkItem<K, V>>createPrimitiveOutputInternal(
- input.getPipeline(), input.getWindowingStrategy(), input.isBounded());
- }
-
- @VisibleForTesting
- InProcessGroupByKeyOnly() {}
- }
-}
[51/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
Directory reorganization
Move Java SDK from "sdk/" into "sdks/java/core".
Project: http://git-wip-us.apache.org/repos/asf/incubator-beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-beam/commit/7bef2b7e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-beam/tree/7bef2b7e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-beam/diff/7bef2b7e
Branch: refs/heads/master
Commit: 7bef2b7ec4c2d60c28f0f5a8ccc6297edb934b11
Parents: 9f8dd18
Author: Davor Bonaci <da...@google.com>
Authored: Wed Mar 23 16:52:19 2016 -0700
Committer: Davor Bonaci <da...@google.com>
Committed: Wed Mar 23 18:11:34 2016 -0700
----------------------------------------------------------------------
pom.xml | 2 +-
sdk/pom.xml | 771 -----
.../com/google/cloud/dataflow/sdk/Pipeline.java | 502 ---
.../cloud/dataflow/sdk/PipelineResult.java | 95 -
.../dataflow/sdk/annotations/Experimental.java | 80 -
.../dataflow/sdk/annotations/package-info.java | 20 -
.../cloud/dataflow/sdk/coders/AtomicCoder.java | 51 -
.../cloud/dataflow/sdk/coders/AvroCoder.java | 714 -----
.../sdk/coders/BigEndianIntegerCoder.java | 99 -
.../dataflow/sdk/coders/BigEndianLongCoder.java | 99 -
.../dataflow/sdk/coders/ByteArrayCoder.java | 138 -
.../cloud/dataflow/sdk/coders/ByteCoder.java | 111 -
.../dataflow/sdk/coders/ByteStringCoder.java | 106 -
.../sdk/coders/CannotProvideCoderException.java | 95 -
.../google/cloud/dataflow/sdk/coders/Coder.java | 298 --
.../dataflow/sdk/coders/CoderException.java | 36 -
.../dataflow/sdk/coders/CoderFactories.java | 274 --
.../cloud/dataflow/sdk/coders/CoderFactory.java | 43 -
.../dataflow/sdk/coders/CoderProvider.java | 33 -
.../dataflow/sdk/coders/CoderProviders.java | 164 -
.../dataflow/sdk/coders/CoderRegistry.java | 843 -----
.../dataflow/sdk/coders/CollectionCoder.java | 73 -
.../cloud/dataflow/sdk/coders/CustomCoder.java | 137 -
.../cloud/dataflow/sdk/coders/DefaultCoder.java | 66 -
.../dataflow/sdk/coders/DelegateCoder.java | 164 -
.../sdk/coders/DeterministicStandardCoder.java | 38 -
.../cloud/dataflow/sdk/coders/DoubleCoder.java | 113 -
.../dataflow/sdk/coders/DurationCoder.java | 97 -
.../cloud/dataflow/sdk/coders/EntityCoder.java | 86 -
.../cloud/dataflow/sdk/coders/InstantCoder.java | 113 -
.../dataflow/sdk/coders/IterableCoder.java | 78 -
.../dataflow/sdk/coders/IterableLikeCoder.java | 278 --
.../cloud/dataflow/sdk/coders/JAXBCoder.java | 135 -
.../cloud/dataflow/sdk/coders/KvCoder.java | 162 -
.../cloud/dataflow/sdk/coders/KvCoderBase.java | 61 -
.../cloud/dataflow/sdk/coders/ListCoder.java | 77 -
.../cloud/dataflow/sdk/coders/MapCoder.java | 160 -
.../cloud/dataflow/sdk/coders/MapCoderBase.java | 54 -
.../dataflow/sdk/coders/NullableCoder.java | 175 -
.../cloud/dataflow/sdk/coders/Proto2Coder.java | 361 ---
.../dataflow/sdk/coders/SerializableCoder.java | 183 --
.../cloud/dataflow/sdk/coders/SetCoder.java | 94 -
.../dataflow/sdk/coders/StandardCoder.java | 229 --
.../sdk/coders/StringDelegateCoder.java | 86 -
.../dataflow/sdk/coders/StringUtf8Coder.java | 139 -
.../sdk/coders/StructuralByteArray.java | 56 -
.../dataflow/sdk/coders/TableRowJsonCoder.java | 82 -
.../sdk/coders/TextualIntegerCoder.java | 69 -
.../cloud/dataflow/sdk/coders/VarIntCoder.java | 97 -
.../cloud/dataflow/sdk/coders/VarLongCoder.java | 96 -
.../cloud/dataflow/sdk/coders/VoidCoder.java | 76 -
.../cloud/dataflow/sdk/coders/package-info.java | 44 -
.../sdk/coders/protobuf/ProtoCoder.java | 404 ---
.../sdk/coders/protobuf/ProtobufUtil.java | 171 -
.../sdk/coders/protobuf/package-info.java | 23 -
.../google/cloud/dataflow/sdk/io/AvroIO.java | 810 -----
.../cloud/dataflow/sdk/io/AvroSource.java | 647 ----
.../cloud/dataflow/sdk/io/BigQueryIO.java | 1499 ---------
.../cloud/dataflow/sdk/io/BlockBasedSource.java | 237 --
.../sdk/io/BoundedReadFromUnboundedSource.java | 271 --
.../cloud/dataflow/sdk/io/BoundedSource.java | 277 --
.../cloud/dataflow/sdk/io/CompressedSource.java | 413 ---
.../cloud/dataflow/sdk/io/CountingInput.java | 191 --
.../cloud/dataflow/sdk/io/CountingSource.java | 397 ---
.../cloud/dataflow/sdk/io/DatastoreIO.java | 957 ------
.../cloud/dataflow/sdk/io/FileBasedSink.java | 864 -----
.../cloud/dataflow/sdk/io/FileBasedSource.java | 648 ----
.../dataflow/sdk/io/OffsetBasedSource.java | 326 --
.../google/cloud/dataflow/sdk/io/PubsubIO.java | 1044 ------
.../com/google/cloud/dataflow/sdk/io/Read.java | 253 --
.../dataflow/sdk/io/ShardNameTemplate.java | 75 -
.../com/google/cloud/dataflow/sdk/io/Sink.java | 252 --
.../google/cloud/dataflow/sdk/io/Source.java | 193 --
.../google/cloud/dataflow/sdk/io/TextIO.java | 992 ------
.../cloud/dataflow/sdk/io/UnboundedSource.java | 253 --
.../com/google/cloud/dataflow/sdk/io/Write.java | 213 --
.../google/cloud/dataflow/sdk/io/XmlSink.java | 310 --
.../google/cloud/dataflow/sdk/io/XmlSource.java | 541 ----
.../dataflow/sdk/io/bigtable/BigtableIO.java | 987 ------
.../sdk/io/bigtable/BigtableService.java | 108 -
.../sdk/io/bigtable/BigtableServiceImpl.java | 241 --
.../dataflow/sdk/io/bigtable/package-info.java | 22 -
.../cloud/dataflow/sdk/io/package-info.java | 37 -
.../cloud/dataflow/sdk/io/range/ByteKey.java | 173 -
.../dataflow/sdk/io/range/ByteKeyRange.java | 376 ---
.../sdk/io/range/ByteKeyRangeTracker.java | 117 -
.../sdk/io/range/OffsetRangeTracker.java | 182 --
.../dataflow/sdk/io/range/RangeTracker.java | 220 --
.../dataflow/sdk/io/range/package-info.java | 23 -
.../sdk/options/ApplicationNameOptions.java | 33 -
.../dataflow/sdk/options/BigQueryOptions.java | 31 -
.../BlockingDataflowPipelineOptions.java | 49 -
.../sdk/options/CloudDebuggerOptions.java | 43 -
.../options/DataflowPipelineDebugOptions.java | 259 --
.../sdk/options/DataflowPipelineOptions.java | 134 -
.../DataflowPipelineWorkerPoolOptions.java | 254 --
.../sdk/options/DataflowProfilingOptions.java | 46 -
.../options/DataflowWorkerHarnessOptions.java | 50 -
.../options/DataflowWorkerLoggingOptions.java | 153 -
.../cloud/dataflow/sdk/options/Default.java | 153 -
.../sdk/options/DefaultValueFactory.java | 38 -
.../cloud/dataflow/sdk/options/Description.java | 35 -
.../sdk/options/DirectPipelineOptions.java | 75 -
.../cloud/dataflow/sdk/options/GcpOptions.java | 291 --
.../cloud/dataflow/sdk/options/GcsOptions.java | 113 -
.../sdk/options/GoogleApiDebugOptions.java | 87 -
.../cloud/dataflow/sdk/options/Hidden.java | 33 -
.../dataflow/sdk/options/PipelineOptions.java | 249 --
.../sdk/options/PipelineOptionsFactory.java | 1537 ---------
.../sdk/options/PipelineOptionsRegistrar.java | 36 -
.../sdk/options/PipelineOptionsValidator.java | 102 -
.../sdk/options/ProxyInvocationHandler.java | 441 ---
.../dataflow/sdk/options/StreamingOptions.java | 30 -
.../cloud/dataflow/sdk/options/Validation.java | 46 -
.../dataflow/sdk/options/package-info.java | 25 -
.../google/cloud/dataflow/sdk/package-info.java | 33 -
.../runners/AggregatorPipelineExtractor.java | 97 -
.../runners/AggregatorRetrievalException.java | 32 -
.../dataflow/sdk/runners/AggregatorValues.java | 52 -
.../runners/BlockingDataflowPipelineRunner.java | 181 --
.../DataflowJobAlreadyExistsException.java | 34 -
.../DataflowJobAlreadyUpdatedException.java | 33 -
.../runners/DataflowJobCancelledException.java | 38 -
.../sdk/runners/DataflowJobException.java | 40 -
.../runners/DataflowJobExecutionException.java | 34 -
.../runners/DataflowJobUpdatedException.java | 51 -
.../dataflow/sdk/runners/DataflowPipeline.java | 59 -
.../sdk/runners/DataflowPipelineJob.java | 389 ---
.../sdk/runners/DataflowPipelineRegistrar.java | 58 -
.../sdk/runners/DataflowPipelineRunner.java | 3003 ------------------
.../runners/DataflowPipelineRunnerHooks.java | 37 -
.../sdk/runners/DataflowPipelineTranslator.java | 1104 -------
.../sdk/runners/DataflowServiceException.java | 32 -
.../dataflow/sdk/runners/DirectPipeline.java | 55 -
.../sdk/runners/DirectPipelineRegistrar.java | 53 -
.../sdk/runners/DirectPipelineRunner.java | 1156 -------
.../dataflow/sdk/runners/PipelineRunner.java | 76 -
.../sdk/runners/PipelineRunnerRegistrar.java | 40 -
.../sdk/runners/RecordingPipelineVisitor.java | 54 -
.../sdk/runners/TransformHierarchy.java | 104 -
.../dataflow/sdk/runners/TransformTreeNode.java | 252 --
.../sdk/runners/dataflow/AssignWindows.java | 88 -
.../runners/dataflow/BigQueryIOTranslator.java | 125 -
.../sdk/runners/dataflow/CustomSources.java | 118 -
.../dataflow/DataflowAggregatorTransforms.java | 79 -
.../dataflow/DataflowMetricUpdateExtractor.java | 110 -
.../runners/dataflow/PubsubIOTranslator.java | 107 -
.../sdk/runners/dataflow/ReadTranslator.java | 103 -
.../sdk/runners/dataflow/package-info.java | 20 -
.../inprocess/BoundedReadEvaluatorFactory.java | 152 -
.../CachedThreadPoolExecutorServiceFactory.java | 42 -
.../dataflow/sdk/runners/inprocess/Clock.java | 29 -
.../runners/inprocess/CompletionCallback.java | 33 -
.../ConsumerTrackingPipelineVisitor.java | 173 -
.../inprocess/EmptyTransformEvaluator.java | 49 -
.../sdk/runners/inprocess/EvaluatorKey.java | 55 -
.../inprocess/ExecutorServiceFactory.java | 32 -
.../ExecutorServiceParallelExecutor.java | 432 ---
.../inprocess/FlattenEvaluatorFactory.java | 83 -
.../runners/inprocess/ForwardingPTransform.java | 54 -
.../inprocess/GroupByKeyEvaluatorFactory.java | 252 --
.../inprocess/InMemoryWatermarkManager.java | 1310 --------
.../sdk/runners/inprocess/InProcessBundle.java | 121 -
.../inprocess/InProcessBundleOutputManager.java | 50 -
.../sdk/runners/inprocess/InProcessCreate.java | 209 --
.../inprocess/InProcessEvaluationContext.java | 405 ---
.../inprocess/InProcessExecutionContext.java | 106 -
.../runners/inprocess/InProcessExecutor.java | 46 -
.../inprocess/InProcessPipelineOptions.java | 90 -
.../inprocess/InProcessPipelineRunner.java | 343 --
.../inprocess/InProcessSideInputContainer.java | 230 --
.../inprocess/InProcessTimerInternals.java | 84 -
.../inprocess/InProcessTransformResult.java | 75 -
.../inprocess/KeyedPValueTrackingVisitor.java | 95 -
.../sdk/runners/inprocess/NanosOffsetClock.java | 58 -
.../inprocess/ParDoInProcessEvaluator.java | 109 -
.../inprocess/ParDoMultiEvaluatorFactory.java | 90 -
.../inprocess/ParDoSingleEvaluatorFactory.java | 87 -
.../sdk/runners/inprocess/StepAndKey.java | 68 -
.../runners/inprocess/StepTransformResult.java | 157 -
.../runners/inprocess/TransformEvaluator.java | 45 -
.../inprocess/TransformEvaluatorFactory.java | 42 -
.../inprocess/TransformEvaluatorRegistry.java | 72 -
.../runners/inprocess/TransformExecutor.java | 114 -
.../inprocess/TransformExecutorService.java | 34 -
.../inprocess/TransformExecutorServices.java | 153 -
.../UnboundedReadEvaluatorFactory.java | 168 -
.../runners/inprocess/ViewEvaluatorFactory.java | 121 -
.../inprocess/WatermarkCallbackExecutor.java | 143 -
.../dataflow/sdk/runners/package-info.java | 33 -
.../dataflow/sdk/runners/worker/IsmFormat.java | 946 ------
.../sdk/runners/worker/package-info.java | 24 -
.../dataflow/sdk/testing/CoderProperties.java | 349 --
.../dataflow/sdk/testing/DataflowAssert.java | 825 -----
.../dataflow/sdk/testing/RunnableOnService.java | 30 -
.../sdk/testing/SerializableMatcher.java | 36 -
.../sdk/testing/SerializableMatchers.java | 1180 -------
.../dataflow/sdk/testing/SourceTestUtils.java | 642 ----
.../testing/TestDataflowPipelineOptions.java | 26 -
.../sdk/testing/TestDataflowPipelineRunner.java | 220 --
.../dataflow/sdk/testing/TestPipeline.java | 193 --
.../dataflow/sdk/testing/WindowFnTestUtils.java | 325 --
.../dataflow/sdk/testing/package-info.java | 21 -
.../dataflow/sdk/transforms/Aggregator.java | 78 -
.../sdk/transforms/AggregatorRetriever.java | 36 -
.../sdk/transforms/AppliedPTransform.java | 100 -
.../sdk/transforms/ApproximateQuantiles.java | 766 -----
.../sdk/transforms/ApproximateUnique.java | 419 ---
.../cloud/dataflow/sdk/transforms/Combine.java | 2240 -------------
.../dataflow/sdk/transforms/CombineFnBase.java | 283 --
.../dataflow/sdk/transforms/CombineFns.java | 1100 -------
.../sdk/transforms/CombineWithContext.java | 277 --
.../cloud/dataflow/sdk/transforms/Count.java | 135 -
.../cloud/dataflow/sdk/transforms/Create.java | 426 ---
.../cloud/dataflow/sdk/transforms/DoFn.java | 563 ----
.../dataflow/sdk/transforms/DoFnReflector.java | 668 ----
.../dataflow/sdk/transforms/DoFnTester.java | 495 ---
.../sdk/transforms/DoFnWithContext.java | 416 ---
.../cloud/dataflow/sdk/transforms/Filter.java | 234 --
.../sdk/transforms/FlatMapElements.java | 145 -
.../cloud/dataflow/sdk/transforms/Flatten.java | 219 --
.../dataflow/sdk/transforms/GroupByKey.java | 575 ----
.../transforms/IntraBundleParallelization.java | 346 --
.../cloud/dataflow/sdk/transforms/Keys.java | 68 -
.../cloud/dataflow/sdk/transforms/KvSwap.java | 73 -
.../dataflow/sdk/transforms/MapElements.java | 112 -
.../cloud/dataflow/sdk/transforms/Max.java | 255 --
.../cloud/dataflow/sdk/transforms/Mean.java | 202 --
.../cloud/dataflow/sdk/transforms/Min.java | 255 --
.../dataflow/sdk/transforms/PTransform.java | 324 --
.../cloud/dataflow/sdk/transforms/ParDo.java | 1321 --------
.../dataflow/sdk/transforms/Partition.java | 173 -
.../sdk/transforms/RemoveDuplicates.java | 158 -
.../cloud/dataflow/sdk/transforms/Sample.java | 246 --
.../sdk/transforms/SerializableComparator.java | 28 -
.../sdk/transforms/SerializableFunction.java | 31 -
.../dataflow/sdk/transforms/SimpleFunction.java | 54 -
.../cloud/dataflow/sdk/transforms/Sum.java | 188 --
.../cloud/dataflow/sdk/transforms/Top.java | 559 ----
.../cloud/dataflow/sdk/transforms/Values.java | 68 -
.../cloud/dataflow/sdk/transforms/View.java | 470 ---
.../cloud/dataflow/sdk/transforms/WithKeys.java | 140 -
.../dataflow/sdk/transforms/WithTimestamps.java | 129 -
.../cloud/dataflow/sdk/transforms/Write.java | 27 -
.../sdk/transforms/display/DisplayData.java | 530 ----
.../sdk/transforms/display/HasDisplayData.java | 53 -
.../sdk/transforms/join/CoGbkResult.java | 463 ---
.../sdk/transforms/join/CoGbkResultSchema.java | 134 -
.../sdk/transforms/join/CoGroupByKey.java | 211 --
.../transforms/join/KeyedPCollectionTuple.java | 247 --
.../sdk/transforms/join/RawUnionValue.java | 51 -
.../sdk/transforms/join/UnionCoder.java | 147 -
.../sdk/transforms/join/package-info.java | 21 -
.../dataflow/sdk/transforms/package-info.java | 43 -
.../sdk/transforms/windowing/AfterAll.java | 117 -
.../windowing/AfterDelayFromFirstElement.java | 322 --
.../sdk/transforms/windowing/AfterEach.java | 135 -
.../sdk/transforms/windowing/AfterFirst.java | 119 -
.../sdk/transforms/windowing/AfterPane.java | 145 -
.../windowing/AfterProcessingTime.java | 97 -
.../AfterSynchronizedProcessingTime.java | 75 -
.../transforms/windowing/AfterWatermark.java | 397 ---
.../sdk/transforms/windowing/BoundedWindow.java | 46 -
.../transforms/windowing/CalendarWindows.java | 348 --
.../transforms/windowing/DefaultTrigger.java | 95 -
.../sdk/transforms/windowing/FixedWindows.java | 116 -
.../sdk/transforms/windowing/GlobalWindow.java | 68 -
.../sdk/transforms/windowing/GlobalWindows.java | 63 -
.../transforms/windowing/IntervalWindow.java | 201 --
.../transforms/windowing/InvalidWindows.java | 87 -
.../MergeOverlappingIntervalWindows.java | 86 -
.../windowing/NonMergingWindowFn.java | 35 -
.../transforms/windowing/OrFinallyTrigger.java | 100 -
.../sdk/transforms/windowing/OutputTimeFn.java | 319 --
.../sdk/transforms/windowing/OutputTimeFns.java | 168 -
.../sdk/transforms/windowing/PaneInfo.java | 384 ---
.../windowing/PartitioningWindowFn.java | 61 -
.../sdk/transforms/windowing/Repeatedly.java | 100 -
.../sdk/transforms/windowing/Sessions.java | 112 -
.../transforms/windowing/SlidingWindows.java | 214 --
.../sdk/transforms/windowing/Trigger.java | 544 ----
.../transforms/windowing/TriggerBuilder.java | 29 -
.../sdk/transforms/windowing/Window.java | 662 ----
.../sdk/transforms/windowing/WindowFn.java | 221 --
.../sdk/transforms/windowing/package-info.java | 49 -
.../dataflow/sdk/util/ActiveWindowSet.java | 171 -
.../cloud/dataflow/sdk/util/ApiSurface.java | 642 ----
.../dataflow/sdk/util/AppEngineEnvironment.java | 61 -
.../dataflow/sdk/util/AppliedCombineFn.java | 130 -
.../dataflow/sdk/util/AssignWindowsDoFn.java | 67 -
...AttemptAndTimeBoundedExponentialBackOff.java | 168 -
.../util/AttemptBoundedExponentialBackOff.java | 83 -
.../cloud/dataflow/sdk/util/AvroUtils.java | 345 --
.../dataflow/sdk/util/BaseExecutionContext.java | 155 -
.../dataflow/sdk/util/BatchTimerInternals.java | 138 -
.../sdk/util/BigQueryTableInserter.java | 434 ---
.../sdk/util/BigQueryTableRowIterator.java | 469 ---
.../cloud/dataflow/sdk/util/BitSetCoder.java | 59 -
.../BufferedElementCountingOutputStream.java | 184 --
.../cloud/dataflow/sdk/util/CloudKnownType.java | 138 -
.../cloud/dataflow/sdk/util/CloudObject.java | 184 --
.../cloud/dataflow/sdk/util/CoderUtils.java | 327 --
.../sdk/util/CombineContextFactory.java | 107 -
.../cloud/dataflow/sdk/util/CombineFnUtil.java | 154 -
.../dataflow/sdk/util/CounterAggregator.java | 96 -
.../dataflow/sdk/util/CredentialFactory.java | 29 -
.../cloud/dataflow/sdk/util/Credentials.java | 192 --
.../sdk/util/DataflowPathValidator.java | 97 -
.../dataflow/sdk/util/DataflowReleaseInfo.java | 87 -
.../sdk/util/DirectModeExecutionContext.java | 130 -
.../sdk/util/DirectSideInputReader.java | 73 -
.../cloud/dataflow/sdk/util/DoFnInfo.java | 67 -
.../cloud/dataflow/sdk/util/DoFnRunner.java | 60 -
.../cloud/dataflow/sdk/util/DoFnRunnerBase.java | 558 ----
.../cloud/dataflow/sdk/util/DoFnRunners.java | 142 -
.../dataflow/sdk/util/ExecutableTrigger.java | 159 -
.../dataflow/sdk/util/ExecutionContext.java | 102 -
.../sdk/util/ExposedByteArrayInputStream.java | 51 -
.../sdk/util/ExposedByteArrayOutputStream.java | 115 -
.../dataflow/sdk/util/FileIOChannelFactory.java | 135 -
.../dataflow/sdk/util/FinishedTriggers.java | 42 -
.../sdk/util/FinishedTriggersBitSet.java | 68 -
.../dataflow/sdk/util/FinishedTriggersSet.java | 74 -
.../dataflow/sdk/util/GcpCredentialFactory.java | 45 -
.../dataflow/sdk/util/GcsIOChannelFactory.java | 86 -
.../cloud/dataflow/sdk/util/GcsStager.java | 53 -
.../google/cloud/dataflow/sdk/util/GcsUtil.java | 406 ---
.../util/GroupAlsoByWindowViaWindowSetDoFn.java | 104 -
.../sdk/util/GroupAlsoByWindowsDoFn.java | 58 -
.../GroupAlsoByWindowsViaOutputBufferDoFn.java | 98 -
.../dataflow/sdk/util/IOChannelFactory.java | 101 -
.../cloud/dataflow/sdk/util/IOChannelUtils.java | 204 --
.../sdk/util/IllegalMutationException.java | 52 -
.../dataflow/sdk/util/InstanceBuilder.java | 269 --
.../util/IntervalBoundedExponentialBackOff.java | 87 -
.../cloud/dataflow/sdk/util/KeyedWorkItem.java | 41 -
.../dataflow/sdk/util/KeyedWorkItemCoder.java | 120 -
.../cloud/dataflow/sdk/util/KeyedWorkItems.java | 120 -
.../sdk/util/LateDataDroppingDoFnRunner.java | 145 -
.../dataflow/sdk/util/MapAggregatorValues.java | 48 -
.../sdk/util/MergingActiveWindowSet.java | 544 ----
.../cloud/dataflow/sdk/util/MimeTypes.java | 23 -
.../cloud/dataflow/sdk/util/MonitoringUtil.java | 233 --
.../dataflow/sdk/util/MutationDetector.java | 31 -
.../dataflow/sdk/util/MutationDetectors.java | 182 --
.../cloud/dataflow/sdk/util/NonEmptyPanes.java | 148 -
.../sdk/util/NonMergingActiveWindowSet.java | 85 -
.../sdk/util/NoopCredentialFactory.java | 38 -
.../dataflow/sdk/util/NoopPathValidator.java | 48 -
.../dataflow/sdk/util/NullSideInputReader.java | 61 -
.../dataflow/sdk/util/OutputReference.java | 42 -
.../sdk/util/PCollectionViewWindow.java | 67 -
.../dataflow/sdk/util/PCollectionViews.java | 426 ---
.../google/cloud/dataflow/sdk/util/PTuple.java | 160 -
.../cloud/dataflow/sdk/util/PackageUtil.java | 327 --
.../dataflow/sdk/util/PaneInfoTracker.java | 151 -
.../cloud/dataflow/sdk/util/PathValidator.java | 47 -
.../sdk/util/PerKeyCombineFnRunner.java | 147 -
.../sdk/util/PerKeyCombineFnRunners.java | 257 --
.../cloud/dataflow/sdk/util/PropertyNames.java | 107 -
.../dataflow/sdk/util/RandomAccessData.java | 352 --
.../cloud/dataflow/sdk/util/ReduceFn.java | 128 -
.../sdk/util/ReduceFnContextFactory.java | 495 ---
.../cloud/dataflow/sdk/util/ReduceFnRunner.java | 843 -----
.../sdk/util/ReifyTimestampAndWindowsDoFn.java | 46 -
.../cloud/dataflow/sdk/util/Reshuffle.java | 145 -
.../dataflow/sdk/util/ReshuffleTrigger.java | 61 -
.../sdk/util/RetryHttpRequestInitializer.java | 250 --
.../dataflow/sdk/util/SerializableUtils.java | 159 -
.../cloud/dataflow/sdk/util/Serializer.java | 145 -
.../sdk/util/ShardingWritableByteChannel.java | 118 -
.../dataflow/sdk/util/SideInputReader.java | 48 -
.../dataflow/sdk/util/SimpleDoFnRunner.java | 55 -
.../google/cloud/dataflow/sdk/util/Stager.java | 29 -
.../cloud/dataflow/sdk/util/StreamUtils.java | 68 -
.../cloud/dataflow/sdk/util/StringUtils.java | 242 --
.../google/cloud/dataflow/sdk/util/Structs.java | 384 ---
.../dataflow/sdk/util/SystemDoFnInternal.java | 37 -
.../cloud/dataflow/sdk/util/SystemReduceFn.java | 133 -
.../cloud/dataflow/sdk/util/TestCredential.java | 51 -
.../cloud/dataflow/sdk/util/TimeDomain.java | 41 -
.../cloud/dataflow/sdk/util/TimeUtil.java | 164 -
.../cloud/dataflow/sdk/util/TimerInternals.java | 269 --
.../google/cloud/dataflow/sdk/util/Timers.java | 60 -
.../cloud/dataflow/sdk/util/Transport.java | 205 --
.../sdk/util/TriggerContextFactory.java | 522 ---
.../cloud/dataflow/sdk/util/TriggerRunner.java | 223 --
.../dataflow/sdk/util/UnownedInputStream.java | 76 -
.../dataflow/sdk/util/UnownedOutputStream.java | 56 -
.../sdk/util/UploadIdResponseInterceptor.java | 61 -
.../dataflow/sdk/util/UserCodeException.java | 94 -
.../dataflow/sdk/util/ValueWithRecordId.java | 154 -
.../google/cloud/dataflow/sdk/util/Values.java | 88 -
.../google/cloud/dataflow/sdk/util/VarInt.java | 115 -
.../cloud/dataflow/sdk/util/WatermarkHold.java | 450 ---
.../cloud/dataflow/sdk/util/Weighted.java | 27 -
.../cloud/dataflow/sdk/util/WeightedValue.java | 45 -
.../cloud/dataflow/sdk/util/WindowTracing.java | 36 -
.../cloud/dataflow/sdk/util/WindowedValue.java | 720 -----
.../dataflow/sdk/util/WindowingInternals.java | 82 -
.../dataflow/sdk/util/WindowingStrategy.java | 268 --
.../cloud/dataflow/sdk/util/ZipFiles.java | 294 --
.../cloud/dataflow/sdk/util/common/Counter.java | 1103 -------
.../sdk/util/common/CounterProvider.java | 26 -
.../dataflow/sdk/util/common/CounterSet.java | 177 --
.../util/common/ElementByteSizeObservable.java | 41 -
.../ElementByteSizeObservableIterable.java | 63 -
.../ElementByteSizeObservableIterator.java | 36 -
.../util/common/ElementByteSizeObserver.java | 92 -
.../sdk/util/common/PeekingReiterator.java | 98 -
.../sdk/util/common/ReflectHelpers.java | 209 --
.../dataflow/sdk/util/common/Reiterable.java | 27 -
.../dataflow/sdk/util/common/Reiterator.java | 39 -
.../dataflow/sdk/util/common/package-info.java | 18 -
.../sdk/util/common/worker/StateSampler.java | 365 ---
.../sdk/util/common/worker/package-info.java | 18 -
.../cloud/dataflow/sdk/util/gcsfs/GcsPath.java | 619 ----
.../dataflow/sdk/util/gcsfs/package-info.java | 18 -
.../cloud/dataflow/sdk/util/package-info.java | 18 -
.../util/state/AccumulatorCombiningState.java | 51 -
.../cloud/dataflow/sdk/util/state/BagState.java | 26 -
.../dataflow/sdk/util/state/CombiningState.java | 40 -
.../CopyOnAccessInMemoryStateInternals.java | 454 ---
.../sdk/util/state/InMemoryStateInternals.java | 414 ---
.../sdk/util/state/MergingStateAccessor.java | 40 -
.../dataflow/sdk/util/state/ReadableState.java | 53 -
.../cloud/dataflow/sdk/util/state/State.java | 30 -
.../dataflow/sdk/util/state/StateAccessor.java | 36 -
.../dataflow/sdk/util/state/StateContext.java | 41 -
.../dataflow/sdk/util/state/StateContexts.java | 107 -
.../dataflow/sdk/util/state/StateInternals.java | 55 -
.../dataflow/sdk/util/state/StateMerging.java | 254 --
.../dataflow/sdk/util/state/StateNamespace.java | 54 -
.../sdk/util/state/StateNamespaceForTest.java | 63 -
.../sdk/util/state/StateNamespaces.java | 277 --
.../dataflow/sdk/util/state/StateTable.java | 89 -
.../cloud/dataflow/sdk/util/state/StateTag.java | 96 -
.../dataflow/sdk/util/state/StateTags.java | 579 ----
.../dataflow/sdk/util/state/ValueState.java | 35 -
.../sdk/util/state/WatermarkHoldState.java | 42 -
.../google/cloud/dataflow/sdk/values/KV.java | 130 -
.../cloud/dataflow/sdk/values/PBegin.java | 87 -
.../cloud/dataflow/sdk/values/PCollection.java | 250 --
.../dataflow/sdk/values/PCollectionList.java | 238 --
.../dataflow/sdk/values/PCollectionTuple.java | 264 --
.../dataflow/sdk/values/PCollectionView.java | 64 -
.../google/cloud/dataflow/sdk/values/PDone.java | 47 -
.../cloud/dataflow/sdk/values/PInput.java | 56 -
.../cloud/dataflow/sdk/values/POutput.java | 76 -
.../dataflow/sdk/values/POutputValueBase.java | 102 -
.../cloud/dataflow/sdk/values/PValue.java | 38 -
.../cloud/dataflow/sdk/values/PValueBase.java | 155 -
.../dataflow/sdk/values/TimestampedValue.java | 155 -
.../cloud/dataflow/sdk/values/TupleTag.java | 196 --
.../cloud/dataflow/sdk/values/TupleTagList.java | 148 -
.../dataflow/sdk/values/TypeDescriptor.java | 351 --
.../cloud/dataflow/sdk/values/TypedPValue.java | 197 --
.../cloud/dataflow/sdk/values/package-info.java | 52 -
sdk/src/main/proto/README.md | 27 -
.../main/proto/proto2_coder_test_messages.proto | 51 -
.../google/cloud/dataflow/sdk/sdk.properties | 5 -
.../cloud/dataflow/sdk/DataflowMatchers.java | 65 -
.../google/cloud/dataflow/sdk/PipelineTest.java | 296 --
.../google/cloud/dataflow/sdk/TestUtils.java | 213 --
.../cloud/dataflow/sdk/WindowMatchers.java | 137 -
.../dataflow/sdk/coders/AvroCoderTest.java | 754 -----
.../sdk/coders/BigEndianIntegerCoderTest.java | 90 -
.../sdk/coders/BigEndianLongCoderTest.java | 94 -
.../dataflow/sdk/coders/ByteArrayCoderTest.java | 144 -
.../dataflow/sdk/coders/ByteCoderTest.java | 91 -
.../sdk/coders/ByteStringCoderTest.java | 121 -
.../dataflow/sdk/coders/CoderFactoriesTest.java | 100 -
.../dataflow/sdk/coders/CoderProvidersTest.java | 71 -
.../dataflow/sdk/coders/CoderRegistryTest.java | 521 ---
.../cloud/dataflow/sdk/coders/CoderTest.java | 78 -
.../sdk/coders/CollectionCoderTest.java | 93 -
.../dataflow/sdk/coders/CustomCoderTest.java | 135 -
.../dataflow/sdk/coders/DefaultCoderTest.java | 128 -
.../dataflow/sdk/coders/DelegateCoderTest.java | 141 -
.../dataflow/sdk/coders/DoubleCoderTest.java | 96 -
.../dataflow/sdk/coders/DurationCoderTest.java | 86 -
.../dataflow/sdk/coders/EntityCoderTest.java | 108 -
.../dataflow/sdk/coders/InstantCoderTest.java | 116 -
.../dataflow/sdk/coders/IterableCoderTest.java | 109 -
.../dataflow/sdk/coders/JAXBCoderTest.java | 99 -
.../cloud/dataflow/sdk/coders/KvCoderTest.java | 118 -
.../dataflow/sdk/coders/ListCoderTest.java | 134 -
.../cloud/dataflow/sdk/coders/MapCoderTest.java | 106 -
.../dataflow/sdk/coders/NullableCoderTest.java | 132 -
.../sdk/coders/PrintBase64Encodings.java | 81 -
.../dataflow/sdk/coders/Proto2CoderTest.java | 145 -
.../sdk/coders/SerializableCoderTest.java | 222 --
.../cloud/dataflow/sdk/coders/SetCoderTest.java | 86 -
.../dataflow/sdk/coders/StandardCoderTest.java | 176 -
.../sdk/coders/StringDelegateCoderTest.java | 72 -
.../sdk/coders/StringUtf8CoderTest.java | 80 -
.../sdk/coders/StructuralByteArrayTest.java | 39 -
.../sdk/coders/TableRowJsonCoderTest.java | 86 -
.../sdk/coders/TextualIntegerCoderTest.java | 90 -
.../dataflow/sdk/coders/VarIntCoderTest.java | 91 -
.../dataflow/sdk/coders/VarLongCoderTest.java | 94 -
.../sdk/coders/protobuf/ProtoCoderTest.java | 182 --
.../sdk/coders/protobuf/ProtobufUtilTest.java | 195 --
.../sdk/io/AvroIOGeneratedClassTest.java | 374 ---
.../cloud/dataflow/sdk/io/AvroIOTest.java | 226 --
.../cloud/dataflow/sdk/io/AvroSourceTest.java | 692 ----
.../cloud/dataflow/sdk/io/BigQueryIOTest.java | 445 ---
.../io/BoundedReadFromUnboundedSourceTest.java | 132 -
.../dataflow/sdk/io/CompressedSourceTest.java | 430 ---
.../dataflow/sdk/io/CountingInputTest.java | 125 -
.../dataflow/sdk/io/CountingSourceTest.java | 216 --
.../cloud/dataflow/sdk/io/DatastoreIOTest.java | 631 ----
.../dataflow/sdk/io/FileBasedSinkTest.java | 512 ---
.../dataflow/sdk/io/FileBasedSourceTest.java | 914 ------
.../dataflow/sdk/io/OffsetBasedSourceTest.java | 278 --
.../cloud/dataflow/sdk/io/PubsubIOTest.java | 233 --
.../google/cloud/dataflow/sdk/io/ReadTest.java | 144 -
.../cloud/dataflow/sdk/io/TextIOTest.java | 562 ----
.../google/cloud/dataflow/sdk/io/WriteTest.java | 341 --
.../cloud/dataflow/sdk/io/XmlSinkTest.java | 235 --
.../cloud/dataflow/sdk/io/XmlSourceTest.java | 822 -----
.../sdk/io/bigtable/BigtableIOTest.java | 688 ----
.../range/ByteKeyRangeEstimateFractionTest.java | 69 -
.../range/ByteKeyRangeInterpolateKeyTest.java | 73 -
.../dataflow/sdk/io/range/ByteKeyRangeTest.java | 396 ---
.../sdk/io/range/ByteKeyRangeTrackerTest.java | 118 -
.../dataflow/sdk/io/range/ByteKeyTest.java | 178 --
.../sdk/io/range/OffsetRangeTrackerTest.java | 186 --
.../com/google/cloud/dataflow/sdk/io/user.avsc | 10 -
.../DataflowPipelineDebugOptionsTest.java | 40 -
.../options/DataflowPipelineOptionsTest.java | 91 -
.../options/DataflowProfilingOptionsTest.java | 47 -
.../DataflowWorkerLoggingOptionsTest.java | 73 -
.../dataflow/sdk/options/GcpOptionsTest.java | 123 -
.../sdk/options/GoogleApiDebugOptionsTest.java | 147 -
.../sdk/options/PipelineOptionsFactoryTest.java | 1154 -------
.../sdk/options/PipelineOptionsTest.java | 126 -
.../options/PipelineOptionsValidatorTest.java | 310 --
.../sdk/options/ProxyInvocationHandlerTest.java | 691 ----
.../AggregatorPipelineExtractorTest.java | 228 --
.../BlockingDataflowPipelineRunnerTest.java | 301 --
.../sdk/runners/DataflowPipelineJobTest.java | 603 ----
.../runners/DataflowPipelineRegistrarTest.java | 72 -
.../sdk/runners/DataflowPipelineRunnerTest.java | 1370 --------
.../sdk/runners/DataflowPipelineTest.java | 44 -
.../runners/DataflowPipelineTranslatorTest.java | 889 ------
.../runners/DirectPipelineRegistrarTest.java | 69 -
.../sdk/runners/DirectPipelineRunnerTest.java | 210 --
.../sdk/runners/DirectPipelineTest.java | 34 -
.../sdk/runners/PipelineRunnerTest.java | 82 -
.../dataflow/sdk/runners/TransformTreeTest.java | 194 --
.../sdk/runners/dataflow/CustomSourcesTest.java | 273 --
.../runners/dataflow/TestCountingSource.java | 235 --
.../BoundedReadEvaluatorFactoryTest.java | 287 --
.../ConsumerTrackingPipelineVisitorTest.java | 233 --
.../inprocess/FlattenEvaluatorFactoryTest.java | 136 -
.../inprocess/ForwardingPTransformTest.java | 100 -
.../GroupByKeyEvaluatorFactoryTest.java | 178 --
.../inprocess/InMemoryWatermarkManagerTest.java | 1111 -------
.../runners/inprocess/InProcessBundleTest.java | 143 -
.../runners/inprocess/InProcessCreateTest.java | 199 --
.../InProcessEvaluationContextTest.java | 544 ----
.../inprocess/InProcessPipelineRunnerTest.java | 77 -
.../InProcessSideInputContainerTest.java | 370 ---
.../inprocess/InProcessTimerInternalsTest.java | 131 -
.../KeyedPValueTrackingVisitorTest.java | 189 --
.../sdk/runners/inprocess/MockClock.java | 60 -
.../ParDoMultiEvaluatorFactoryTest.java | 412 ---
.../ParDoSingleEvaluatorFactoryTest.java | 310 --
.../TransformExecutorServicesTest.java | 134 -
.../inprocess/TransformExecutorTest.java | 312 --
.../UnboundedReadEvaluatorFactoryTest.java | 327 --
.../inprocess/ViewEvaluatorFactoryTest.java | 96 -
.../WatermarkCallbackExecutorTest.java | 126 -
.../sdk/testing/CoderPropertiesTest.java | 214 --
.../sdk/testing/DataflowAssertTest.java | 326 --
.../sdk/testing/DataflowJUnitTestRunner.java | 129 -
.../dataflow/sdk/testing/ExpectedLogs.java | 306 --
.../dataflow/sdk/testing/ExpectedLogsTest.java | 153 -
.../sdk/testing/FastNanoClockAndSleeper.java | 47 -
.../testing/FastNanoClockAndSleeperTest.java | 47 -
.../sdk/testing/PCollectionViewTesting.java | 295 --
.../sdk/testing/ResetDateTimeProvider.java | 41 -
.../sdk/testing/ResetDateTimeProviderTest.java | 55 -
.../sdk/testing/RestoreSystemProperties.java | 51 -
.../testing/RestoreSystemPropertiesTest.java | 50 -
.../sdk/testing/SerializableMatchersTest.java | 165 -
.../sdk/testing/SystemNanoTimeSleeper.java | 68 -
.../sdk/testing/SystemNanoTimeSleeperTest.java | 53 -
.../testing/TestDataflowPipelineRunnerTest.java | 317 --
.../dataflow/sdk/testing/TestPipelineTest.java | 93 -
.../transforms/ApproximateQuantilesTest.java | 299 --
.../sdk/transforms/ApproximateUniqueTest.java | 291 --
.../dataflow/sdk/transforms/CombineFnsTest.java | 413 ---
.../dataflow/sdk/transforms/CombineTest.java | 1137 -------
.../dataflow/sdk/transforms/CountTest.java | 121 -
.../dataflow/sdk/transforms/CreateTest.java | 240 --
.../sdk/transforms/DoFnContextTest.java | 68 -
.../DoFnDelegatingAggregatorTest.java | 143 -
.../sdk/transforms/DoFnReflectorTest.java | 493 ---
.../cloud/dataflow/sdk/transforms/DoFnTest.java | 206 --
.../dataflow/sdk/transforms/DoFnTesterTest.java | 253 --
.../sdk/transforms/DoFnWithContextTest.java | 225 --
.../dataflow/sdk/transforms/FilterTest.java | 160 -
.../sdk/transforms/FlatMapElementsTest.java | 124 -
.../dataflow/sdk/transforms/FlattenTest.java | 369 ---
.../dataflow/sdk/transforms/GroupByKeyTest.java | 438 ---
.../IntraBundleParallelizationTest.java | 250 --
.../cloud/dataflow/sdk/transforms/KeysTest.java | 83 -
.../dataflow/sdk/transforms/KvSwapTest.java | 91 -
.../sdk/transforms/MapElementsTest.java | 134 -
.../cloud/dataflow/sdk/transforms/MaxTest.java | 66 -
.../cloud/dataflow/sdk/transforms/MeanTest.java | 72 -
.../cloud/dataflow/sdk/transforms/MinTest.java | 66 -
.../cloud/dataflow/sdk/transforms/NoOpDoFn.java | 143 -
.../dataflow/sdk/transforms/PTransformTest.java | 41 -
.../dataflow/sdk/transforms/ParDoTest.java | 1541 ---------
.../dataflow/sdk/transforms/PartitionTest.java | 140 -
.../sdk/transforms/RemoveDuplicatesTest.java | 131 -
.../dataflow/sdk/transforms/SampleTest.java | 260 --
.../sdk/transforms/SimpleStatsFnsTest.java | 129 -
.../cloud/dataflow/sdk/transforms/SumTest.java | 66 -
.../cloud/dataflow/sdk/transforms/TopTest.java | 259 --
.../dataflow/sdk/transforms/ValuesTest.java | 93 -
.../cloud/dataflow/sdk/transforms/ViewTest.java | 1548 ---------
.../dataflow/sdk/transforms/WithKeysTest.java | 127 -
.../sdk/transforms/WithTimestampsTest.java | 210 --
.../transforms/display/DisplayDataMatchers.java | 98 -
.../display/DisplayDataMatchersTest.java | 81 -
.../sdk/transforms/display/DisplayDataTest.java | 633 ----
.../transforms/join/CoGbkResultCoderTest.java | 85 -
.../sdk/transforms/join/CoGbkResultTest.java | 124 -
.../sdk/transforms/join/CoGroupByKeyTest.java | 507 ---
.../sdk/transforms/join/UnionCoderTest.java | 48 -
.../sdk/transforms/windowing/AfterAllTest.java | 151 -
.../sdk/transforms/windowing/AfterEachTest.java | 122 -
.../transforms/windowing/AfterFirstTest.java | 175 -
.../sdk/transforms/windowing/AfterPaneTest.java | 126 -
.../windowing/AfterProcessingTimeTest.java | 157 -
.../AfterSynchronizedProcessingTimeTest.java | 121 -
.../windowing/AfterWatermarkTest.java | 338 --
.../windowing/CalendarWindowsTest.java | 260 --
.../windowing/DefaultTriggerTest.java | 176 -
.../transforms/windowing/FixedWindowsTest.java | 124 -
.../windowing/IntervalWindowTest.java | 94 -
.../windowing/OrFinallyTriggerTest.java | 209 --
.../sdk/transforms/windowing/PaneInfoTest.java | 75 -
.../transforms/windowing/RepeatedlyTest.java | 128 -
.../sdk/transforms/windowing/SessionsTest.java | 156 -
.../windowing/SlidingWindowsTest.java | 193 --
.../sdk/transforms/windowing/TriggerTest.java | 117 -
.../sdk/transforms/windowing/WindowTest.java | 226 --
.../sdk/transforms/windowing/WindowingTest.java | 244 --
.../cloud/dataflow/sdk/util/ApiSurfaceTest.java | 187 --
...mptAndTimeBoundedExponentialBackOffTest.java | 212 --
.../AttemptBoundedExponentialBackOffTest.java | 85 -
.../cloud/dataflow/sdk/util/AvroUtilsTest.java | 225 --
.../sdk/util/BatchTimerInternalsTest.java | 116 -
.../sdk/util/BigQueryTableInserterTest.java | 239 --
.../sdk/util/BigQueryTableRowIteratorTest.java | 255 --
.../dataflow/sdk/util/BigQueryUtilTest.java | 479 ---
...BufferedElementCountingOutputStreamTest.java | 205 --
.../cloud/dataflow/sdk/util/CoderUtilsTest.java | 229 --
.../dataflow/sdk/util/CombineFnUtilTest.java | 62 -
.../sdk/util/CounterAggregatorTest.java | 253 --
.../sdk/util/DataflowPathValidatorTest.java | 92 -
.../sdk/util/ExecutableTriggerTest.java | 130 -
.../util/ExposedByteArrayInputStreamTest.java | 78 -
.../util/ExposedByteArrayOutputStreamTest.java | 245 --
.../sdk/util/FileIOChannelFactoryTest.java | 226 --
.../sdk/util/FinishedTriggersBitSetTest.java | 54 -
.../sdk/util/FinishedTriggersProperties.java | 109 -
.../sdk/util/FinishedTriggersSetTest.java | 60 -
.../sdk/util/GcsIOChannelFactoryTest.java | 43 -
.../cloud/dataflow/sdk/util/GcsUtilTest.java | 490 ---
.../sdk/util/GroupAlsoByWindowsProperties.java | 718 -----
...oupAlsoByWindowsViaOutputBufferDoFnTest.java | 111 -
.../dataflow/sdk/util/IOChannelUtilsTest.java | 94 -
.../dataflow/sdk/util/InstanceBuilderTest.java | 115 -
.../IntervalBoundedExponentialBackOffTest.java | 99 -
.../sdk/util/KeyedWorkItemCoderTest.java | 61 -
.../util/LateDataDroppingDoFnRunnerTest.java | 115 -
.../sdk/util/MergingActiveWindowSetTest.java | 175 -
.../dataflow/sdk/util/MonitoringUtilTest.java | 146 -
.../sdk/util/MutationDetectorsTest.java | 148 -
.../cloud/dataflow/sdk/util/PTupleTest.java | 40 -
.../dataflow/sdk/util/PackageUtilTest.java | 482 ---
.../dataflow/sdk/util/RandomAccessDataTest.java | 205 --
.../dataflow/sdk/util/ReduceFnRunnerTest.java | 1049 ------
.../cloud/dataflow/sdk/util/ReduceFnTester.java | 776 -----
.../cloud/dataflow/sdk/util/ReshuffleTest.java | 208 --
.../dataflow/sdk/util/ReshuffleTriggerTest.java | 58 -
.../util/RetryHttpRequestInitializerTest.java | 296 --
.../sdk/util/SerializableUtilsTest.java | 165 -
.../cloud/dataflow/sdk/util/SerializerTest.java | 162 -
.../dataflow/sdk/util/SimpleDoFnRunnerTest.java | 86 -
.../dataflow/sdk/util/StreamUtilsTest.java | 71 -
.../dataflow/sdk/util/StringUtilsTest.java | 145 -
.../cloud/dataflow/sdk/util/StructsTest.java | 206 --
.../cloud/dataflow/sdk/util/TimeUtilTest.java | 73 -
.../dataflow/sdk/util/TimerInternalsTest.java | 52 -
.../cloud/dataflow/sdk/util/TriggerTester.java | 585 ----
.../sdk/util/UnownedInputStreamTest.java | 76 -
.../sdk/util/UnownedOutputStreamTest.java | 57 -
.../util/UploadIdResponseInterceptorTest.java | 99 -
.../sdk/util/UserCodeExceptionTest.java | 176 -
.../cloud/dataflow/sdk/util/VarIntTest.java | 277 --
.../dataflow/sdk/util/WindowedValueTest.java | 57 -
.../cloud/dataflow/sdk/util/ZipFilesTest.java | 311 --
.../sdk/util/common/CounterSetTest.java | 225 --
.../dataflow/sdk/util/common/CounterTest.java | 589 ----
.../sdk/util/common/CounterTestUtils.java | 56 -
.../sdk/util/common/ReflectHelpersTest.java | 126 -
.../dataflow/sdk/util/gcsfs/GcsPathTest.java | 333 --
.../CopyOnAccessInMemoryStateInternalsTest.java | 553 ----
.../util/state/InMemoryStateInternalsTest.java | 348 --
.../sdk/util/state/StateNamespacesTest.java | 129 -
.../dataflow/sdk/util/state/StateTagTest.java | 173 -
.../cloud/dataflow/sdk/values/KVTest.java | 112 -
.../sdk/values/PCollectionListTest.java | 47 -
.../sdk/values/PCollectionTupleTest.java | 93 -
.../cloud/dataflow/sdk/values/PDoneTest.java | 102 -
.../cloud/dataflow/sdk/values/TupleTagTest.java | 87 -
.../dataflow/sdk/values/TypeDescriptorTest.java | 193 --
.../dataflow/sdk/values/TypedPValueTest.java | 164 -
.../PipelineOptionsFactoryJava8Test.java | 90 -
sdks/java/core/pom.xml | 771 +++++
.../com/google/cloud/dataflow/sdk/Pipeline.java | 502 +++
.../cloud/dataflow/sdk/PipelineResult.java | 95 +
.../dataflow/sdk/annotations/Experimental.java | 80 +
.../dataflow/sdk/annotations/package-info.java | 20 +
.../cloud/dataflow/sdk/coders/AtomicCoder.java | 51 +
.../cloud/dataflow/sdk/coders/AvroCoder.java | 714 +++++
.../sdk/coders/BigEndianIntegerCoder.java | 99 +
.../dataflow/sdk/coders/BigEndianLongCoder.java | 99 +
.../dataflow/sdk/coders/ByteArrayCoder.java | 138 +
.../cloud/dataflow/sdk/coders/ByteCoder.java | 111 +
.../dataflow/sdk/coders/ByteStringCoder.java | 106 +
.../sdk/coders/CannotProvideCoderException.java | 95 +
.../google/cloud/dataflow/sdk/coders/Coder.java | 298 ++
.../dataflow/sdk/coders/CoderException.java | 36 +
.../dataflow/sdk/coders/CoderFactories.java | 274 ++
.../cloud/dataflow/sdk/coders/CoderFactory.java | 43 +
.../dataflow/sdk/coders/CoderProvider.java | 33 +
.../dataflow/sdk/coders/CoderProviders.java | 164 +
.../dataflow/sdk/coders/CoderRegistry.java | 843 +++++
.../dataflow/sdk/coders/CollectionCoder.java | 73 +
.../cloud/dataflow/sdk/coders/CustomCoder.java | 137 +
.../cloud/dataflow/sdk/coders/DefaultCoder.java | 66 +
.../dataflow/sdk/coders/DelegateCoder.java | 164 +
.../sdk/coders/DeterministicStandardCoder.java | 38 +
.../cloud/dataflow/sdk/coders/DoubleCoder.java | 113 +
.../dataflow/sdk/coders/DurationCoder.java | 97 +
.../cloud/dataflow/sdk/coders/EntityCoder.java | 86 +
.../cloud/dataflow/sdk/coders/InstantCoder.java | 113 +
.../dataflow/sdk/coders/IterableCoder.java | 78 +
.../dataflow/sdk/coders/IterableLikeCoder.java | 278 ++
.../cloud/dataflow/sdk/coders/JAXBCoder.java | 135 +
.../cloud/dataflow/sdk/coders/KvCoder.java | 162 +
.../cloud/dataflow/sdk/coders/KvCoderBase.java | 61 +
.../cloud/dataflow/sdk/coders/ListCoder.java | 77 +
.../cloud/dataflow/sdk/coders/MapCoder.java | 160 +
.../cloud/dataflow/sdk/coders/MapCoderBase.java | 54 +
.../dataflow/sdk/coders/NullableCoder.java | 175 +
.../cloud/dataflow/sdk/coders/Proto2Coder.java | 361 +++
.../dataflow/sdk/coders/SerializableCoder.java | 183 ++
.../cloud/dataflow/sdk/coders/SetCoder.java | 94 +
.../dataflow/sdk/coders/StandardCoder.java | 229 ++
.../sdk/coders/StringDelegateCoder.java | 86 +
.../dataflow/sdk/coders/StringUtf8Coder.java | 139 +
.../sdk/coders/StructuralByteArray.java | 56 +
.../dataflow/sdk/coders/TableRowJsonCoder.java | 82 +
.../sdk/coders/TextualIntegerCoder.java | 69 +
.../cloud/dataflow/sdk/coders/VarIntCoder.java | 97 +
.../cloud/dataflow/sdk/coders/VarLongCoder.java | 96 +
.../cloud/dataflow/sdk/coders/VoidCoder.java | 76 +
.../cloud/dataflow/sdk/coders/package-info.java | 44 +
.../sdk/coders/protobuf/ProtoCoder.java | 404 +++
.../sdk/coders/protobuf/ProtobufUtil.java | 171 +
.../sdk/coders/protobuf/package-info.java | 23 +
.../google/cloud/dataflow/sdk/io/AvroIO.java | 810 +++++
.../cloud/dataflow/sdk/io/AvroSource.java | 647 ++++
.../cloud/dataflow/sdk/io/BigQueryIO.java | 1499 +++++++++
.../cloud/dataflow/sdk/io/BlockBasedSource.java | 237 ++
.../sdk/io/BoundedReadFromUnboundedSource.java | 271 ++
.../cloud/dataflow/sdk/io/BoundedSource.java | 277 ++
.../cloud/dataflow/sdk/io/CompressedSource.java | 413 +++
.../cloud/dataflow/sdk/io/CountingInput.java | 191 ++
.../cloud/dataflow/sdk/io/CountingSource.java | 397 +++
.../cloud/dataflow/sdk/io/DatastoreIO.java | 957 ++++++
.../cloud/dataflow/sdk/io/FileBasedSink.java | 864 +++++
.../cloud/dataflow/sdk/io/FileBasedSource.java | 648 ++++
.../dataflow/sdk/io/OffsetBasedSource.java | 326 ++
.../google/cloud/dataflow/sdk/io/PubsubIO.java | 1044 ++++++
.../com/google/cloud/dataflow/sdk/io/Read.java | 253 ++
.../dataflow/sdk/io/ShardNameTemplate.java | 75 +
.../com/google/cloud/dataflow/sdk/io/Sink.java | 252 ++
.../google/cloud/dataflow/sdk/io/Source.java | 193 ++
.../google/cloud/dataflow/sdk/io/TextIO.java | 992 ++++++
.../cloud/dataflow/sdk/io/UnboundedSource.java | 253 ++
.../com/google/cloud/dataflow/sdk/io/Write.java | 213 ++
.../google/cloud/dataflow/sdk/io/XmlSink.java | 310 ++
.../google/cloud/dataflow/sdk/io/XmlSource.java | 541 ++++
.../dataflow/sdk/io/bigtable/BigtableIO.java | 987 ++++++
.../sdk/io/bigtable/BigtableService.java | 108 +
.../sdk/io/bigtable/BigtableServiceImpl.java | 241 ++
.../dataflow/sdk/io/bigtable/package-info.java | 22 +
.../cloud/dataflow/sdk/io/package-info.java | 37 +
.../cloud/dataflow/sdk/io/range/ByteKey.java | 173 +
.../dataflow/sdk/io/range/ByteKeyRange.java | 376 +++
.../sdk/io/range/ByteKeyRangeTracker.java | 117 +
.../sdk/io/range/OffsetRangeTracker.java | 182 ++
.../dataflow/sdk/io/range/RangeTracker.java | 220 ++
.../dataflow/sdk/io/range/package-info.java | 23 +
.../sdk/options/ApplicationNameOptions.java | 33 +
.../dataflow/sdk/options/BigQueryOptions.java | 31 +
.../BlockingDataflowPipelineOptions.java | 49 +
.../sdk/options/CloudDebuggerOptions.java | 43 +
.../options/DataflowPipelineDebugOptions.java | 259 ++
.../sdk/options/DataflowPipelineOptions.java | 134 +
.../DataflowPipelineWorkerPoolOptions.java | 254 ++
.../sdk/options/DataflowProfilingOptions.java | 46 +
.../options/DataflowWorkerHarnessOptions.java | 50 +
.../options/DataflowWorkerLoggingOptions.java | 153 +
.../cloud/dataflow/sdk/options/Default.java | 153 +
.../sdk/options/DefaultValueFactory.java | 38 +
.../cloud/dataflow/sdk/options/Description.java | 35 +
.../sdk/options/DirectPipelineOptions.java | 75 +
.../cloud/dataflow/sdk/options/GcpOptions.java | 291 ++
.../cloud/dataflow/sdk/options/GcsOptions.java | 113 +
.../sdk/options/GoogleApiDebugOptions.java | 87 +
.../cloud/dataflow/sdk/options/Hidden.java | 33 +
.../dataflow/sdk/options/PipelineOptions.java | 249 ++
.../sdk/options/PipelineOptionsFactory.java | 1537 +++++++++
.../sdk/options/PipelineOptionsRegistrar.java | 36 +
.../sdk/options/PipelineOptionsValidator.java | 102 +
.../sdk/options/ProxyInvocationHandler.java | 441 +++
.../dataflow/sdk/options/StreamingOptions.java | 30 +
.../cloud/dataflow/sdk/options/Validation.java | 46 +
.../dataflow/sdk/options/package-info.java | 25 +
.../google/cloud/dataflow/sdk/package-info.java | 33 +
.../runners/AggregatorPipelineExtractor.java | 97 +
.../runners/AggregatorRetrievalException.java | 32 +
.../dataflow/sdk/runners/AggregatorValues.java | 52 +
.../runners/BlockingDataflowPipelineRunner.java | 181 ++
.../DataflowJobAlreadyExistsException.java | 34 +
.../DataflowJobAlreadyUpdatedException.java | 33 +
.../runners/DataflowJobCancelledException.java | 38 +
.../sdk/runners/DataflowJobException.java | 40 +
.../runners/DataflowJobExecutionException.java | 34 +
.../runners/DataflowJobUpdatedException.java | 51 +
.../dataflow/sdk/runners/DataflowPipeline.java | 59 +
.../sdk/runners/DataflowPipelineJob.java | 389 +++
.../sdk/runners/DataflowPipelineRegistrar.java | 58 +
.../sdk/runners/DataflowPipelineRunner.java | 3003 ++++++++++++++++++
.../runners/DataflowPipelineRunnerHooks.java | 37 +
.../sdk/runners/DataflowPipelineTranslator.java | 1104 +++++++
.../sdk/runners/DataflowServiceException.java | 32 +
.../dataflow/sdk/runners/DirectPipeline.java | 55 +
.../sdk/runners/DirectPipelineRegistrar.java | 53 +
.../sdk/runners/DirectPipelineRunner.java | 1156 +++++++
.../dataflow/sdk/runners/PipelineRunner.java | 76 +
.../sdk/runners/PipelineRunnerRegistrar.java | 40 +
.../sdk/runners/RecordingPipelineVisitor.java | 54 +
.../sdk/runners/TransformHierarchy.java | 104 +
.../dataflow/sdk/runners/TransformTreeNode.java | 252 ++
.../sdk/runners/dataflow/AssignWindows.java | 88 +
.../runners/dataflow/BigQueryIOTranslator.java | 125 +
.../sdk/runners/dataflow/CustomSources.java | 118 +
.../dataflow/DataflowAggregatorTransforms.java | 79 +
.../dataflow/DataflowMetricUpdateExtractor.java | 110 +
.../runners/dataflow/PubsubIOTranslator.java | 107 +
.../sdk/runners/dataflow/ReadTranslator.java | 103 +
.../sdk/runners/dataflow/package-info.java | 20 +
.../inprocess/BoundedReadEvaluatorFactory.java | 152 +
.../CachedThreadPoolExecutorServiceFactory.java | 42 +
.../dataflow/sdk/runners/inprocess/Clock.java | 29 +
.../runners/inprocess/CompletionCallback.java | 33 +
.../ConsumerTrackingPipelineVisitor.java | 173 +
.../inprocess/EmptyTransformEvaluator.java | 49 +
.../sdk/runners/inprocess/EvaluatorKey.java | 55 +
.../inprocess/ExecutorServiceFactory.java | 32 +
.../ExecutorServiceParallelExecutor.java | 432 +++
.../inprocess/FlattenEvaluatorFactory.java | 83 +
.../runners/inprocess/ForwardingPTransform.java | 54 +
.../inprocess/GroupByKeyEvaluatorFactory.java | 252 ++
.../inprocess/InMemoryWatermarkManager.java | 1310 ++++++++
.../sdk/runners/inprocess/InProcessBundle.java | 121 +
.../inprocess/InProcessBundleOutputManager.java | 50 +
.../sdk/runners/inprocess/InProcessCreate.java | 209 ++
.../inprocess/InProcessEvaluationContext.java | 405 +++
.../inprocess/InProcessExecutionContext.java | 106 +
.../runners/inprocess/InProcessExecutor.java | 46 +
.../inprocess/InProcessPipelineOptions.java | 90 +
.../inprocess/InProcessPipelineRunner.java | 343 ++
.../inprocess/InProcessSideInputContainer.java | 230 ++
.../inprocess/InProcessTimerInternals.java | 84 +
.../inprocess/InProcessTransformResult.java | 75 +
.../inprocess/KeyedPValueTrackingVisitor.java | 95 +
.../sdk/runners/inprocess/NanosOffsetClock.java | 58 +
.../inprocess/ParDoInProcessEvaluator.java | 109 +
.../inprocess/ParDoMultiEvaluatorFactory.java | 90 +
.../inprocess/ParDoSingleEvaluatorFactory.java | 87 +
.../sdk/runners/inprocess/StepAndKey.java | 68 +
.../runners/inprocess/StepTransformResult.java | 157 +
.../runners/inprocess/TransformEvaluator.java | 45 +
.../inprocess/TransformEvaluatorFactory.java | 42 +
.../inprocess/TransformEvaluatorRegistry.java | 72 +
.../runners/inprocess/TransformExecutor.java | 114 +
.../inprocess/TransformExecutorService.java | 34 +
.../inprocess/TransformExecutorServices.java | 153 +
.../UnboundedReadEvaluatorFactory.java | 168 +
.../runners/inprocess/ViewEvaluatorFactory.java | 121 +
.../inprocess/WatermarkCallbackExecutor.java | 143 +
.../dataflow/sdk/runners/package-info.java | 33 +
.../dataflow/sdk/runners/worker/IsmFormat.java | 946 ++++++
.../sdk/runners/worker/package-info.java | 24 +
.../dataflow/sdk/testing/CoderProperties.java | 349 ++
.../dataflow/sdk/testing/DataflowAssert.java | 825 +++++
.../dataflow/sdk/testing/RunnableOnService.java | 30 +
.../sdk/testing/SerializableMatcher.java | 36 +
.../sdk/testing/SerializableMatchers.java | 1180 +++++++
.../dataflow/sdk/testing/SourceTestUtils.java | 642 ++++
.../testing/TestDataflowPipelineOptions.java | 26 +
.../sdk/testing/TestDataflowPipelineRunner.java | 220 ++
.../dataflow/sdk/testing/TestPipeline.java | 193 ++
.../dataflow/sdk/testing/WindowFnTestUtils.java | 325 ++
.../dataflow/sdk/testing/package-info.java | 21 +
.../dataflow/sdk/transforms/Aggregator.java | 78 +
.../sdk/transforms/AggregatorRetriever.java | 36 +
.../sdk/transforms/AppliedPTransform.java | 100 +
.../sdk/transforms/ApproximateQuantiles.java | 766 +++++
.../sdk/transforms/ApproximateUnique.java | 419 +++
.../cloud/dataflow/sdk/transforms/Combine.java | 2240 +++++++++++++
.../dataflow/sdk/transforms/CombineFnBase.java | 283 ++
.../dataflow/sdk/transforms/CombineFns.java | 1100 +++++++
.../sdk/transforms/CombineWithContext.java | 277 ++
.../cloud/dataflow/sdk/transforms/Count.java | 135 +
.../cloud/dataflow/sdk/transforms/Create.java | 426 +++
.../cloud/dataflow/sdk/transforms/DoFn.java | 563 ++++
.../dataflow/sdk/transforms/DoFnReflector.java | 668 ++++
.../dataflow/sdk/transforms/DoFnTester.java | 495 +++
.../sdk/transforms/DoFnWithContext.java | 416 +++
.../cloud/dataflow/sdk/transforms/Filter.java | 234 ++
.../sdk/transforms/FlatMapElements.java | 145 +
.../cloud/dataflow/sdk/transforms/Flatten.java | 219 ++
.../dataflow/sdk/transforms/GroupByKey.java | 575 ++++
.../transforms/IntraBundleParallelization.java | 346 ++
.../cloud/dataflow/sdk/transforms/Keys.java | 68 +
.../cloud/dataflow/sdk/transforms/KvSwap.java | 73 +
.../dataflow/sdk/transforms/MapElements.java | 112 +
.../cloud/dataflow/sdk/transforms/Max.java | 255 ++
.../cloud/dataflow/sdk/transforms/Mean.java | 202 ++
.../cloud/dataflow/sdk/transforms/Min.java | 255 ++
.../dataflow/sdk/transforms/PTransform.java | 324 ++
.../cloud/dataflow/sdk/transforms/ParDo.java | 1321 ++++++++
.../dataflow/sdk/transforms/Partition.java | 173 +
.../sdk/transforms/RemoveDuplicates.java | 158 +
.../cloud/dataflow/sdk/transforms/Sample.java | 246 ++
.../sdk/transforms/SerializableComparator.java | 28 +
.../sdk/transforms/SerializableFunction.java | 31 +
.../dataflow/sdk/transforms/SimpleFunction.java | 54 +
.../cloud/dataflow/sdk/transforms/Sum.java | 188 ++
.../cloud/dataflow/sdk/transforms/Top.java | 559 ++++
.../cloud/dataflow/sdk/transforms/Values.java | 68 +
.../cloud/dataflow/sdk/transforms/View.java | 470 +++
.../cloud/dataflow/sdk/transforms/WithKeys.java | 140 +
.../dataflow/sdk/transforms/WithTimestamps.java | 129 +
.../cloud/dataflow/sdk/transforms/Write.java | 27 +
.../sdk/transforms/display/DisplayData.java | 530 ++++
.../sdk/transforms/display/HasDisplayData.java | 53 +
.../sdk/transforms/join/CoGbkResult.java | 463 +++
.../sdk/transforms/join/CoGbkResultSchema.java | 134 +
.../sdk/transforms/join/CoGroupByKey.java | 211 ++
.../transforms/join/KeyedPCollectionTuple.java | 247 ++
.../sdk/transforms/join/RawUnionValue.java | 51 +
.../sdk/transforms/join/UnionCoder.java | 147 +
.../sdk/transforms/join/package-info.java | 21 +
.../dataflow/sdk/transforms/package-info.java | 43 +
.../sdk/transforms/windowing/AfterAll.java | 117 +
.../windowing/AfterDelayFromFirstElement.java | 322 ++
.../sdk/transforms/windowing/AfterEach.java | 135 +
.../sdk/transforms/windowing/AfterFirst.java | 119 +
.../sdk/transforms/windowing/AfterPane.java | 145 +
.../windowing/AfterProcessingTime.java | 97 +
.../AfterSynchronizedProcessingTime.java | 75 +
.../transforms/windowing/AfterWatermark.java | 397 +++
.../sdk/transforms/windowing/BoundedWindow.java | 46 +
.../transforms/windowing/CalendarWindows.java | 348 ++
.../transforms/windowing/DefaultTrigger.java | 95 +
.../sdk/transforms/windowing/FixedWindows.java | 116 +
.../sdk/transforms/windowing/GlobalWindow.java | 68 +
.../sdk/transforms/windowing/GlobalWindows.java | 63 +
.../transforms/windowing/IntervalWindow.java | 201 ++
.../transforms/windowing/InvalidWindows.java | 87 +
.../MergeOverlappingIntervalWindows.java | 86 +
.../windowing/NonMergingWindowFn.java | 35 +
.../transforms/windowing/OrFinallyTrigger.java | 100 +
.../sdk/transforms/windowing/OutputTimeFn.java | 319 ++
.../sdk/transforms/windowing/OutputTimeFns.java | 168 +
.../sdk/transforms/windowing/PaneInfo.java | 384 +++
.../windowing/PartitioningWindowFn.java | 61 +
.../sdk/transforms/windowing/Repeatedly.java | 100 +
.../sdk/transforms/windowing/Sessions.java | 112 +
.../transforms/windowing/SlidingWindows.java | 214 ++
.../sdk/transforms/windowing/Trigger.java | 544 ++++
.../transforms/windowing/TriggerBuilder.java | 29 +
.../sdk/transforms/windowing/Window.java | 662 ++++
.../sdk/transforms/windowing/WindowFn.java | 221 ++
.../sdk/transforms/windowing/package-info.java | 49 +
.../dataflow/sdk/util/ActiveWindowSet.java | 171 +
.../cloud/dataflow/sdk/util/ApiSurface.java | 642 ++++
.../dataflow/sdk/util/AppEngineEnvironment.java | 61 +
.../dataflow/sdk/util/AppliedCombineFn.java | 130 +
.../dataflow/sdk/util/AssignWindowsDoFn.java | 67 +
...AttemptAndTimeBoundedExponentialBackOff.java | 168 +
.../util/AttemptBoundedExponentialBackOff.java | 83 +
.../cloud/dataflow/sdk/util/AvroUtils.java | 345 ++
.../dataflow/sdk/util/BaseExecutionContext.java | 155 +
.../dataflow/sdk/util/BatchTimerInternals.java | 138 +
.../sdk/util/BigQueryTableInserter.java | 434 +++
.../sdk/util/BigQueryTableRowIterator.java | 469 +++
.../cloud/dataflow/sdk/util/BitSetCoder.java | 59 +
.../BufferedElementCountingOutputStream.java | 184 ++
.../cloud/dataflow/sdk/util/CloudKnownType.java | 138 +
.../cloud/dataflow/sdk/util/CloudObject.java | 184 ++
.../cloud/dataflow/sdk/util/CoderUtils.java | 327 ++
.../sdk/util/CombineContextFactory.java | 107 +
.../cloud/dataflow/sdk/util/CombineFnUtil.java | 154 +
.../dataflow/sdk/util/CounterAggregator.java | 96 +
.../dataflow/sdk/util/CredentialFactory.java | 29 +
.../cloud/dataflow/sdk/util/Credentials.java | 192 ++
.../sdk/util/DataflowPathValidator.java | 97 +
.../dataflow/sdk/util/DataflowReleaseInfo.java | 87 +
.../sdk/util/DirectModeExecutionContext.java | 130 +
.../sdk/util/DirectSideInputReader.java | 73 +
.../cloud/dataflow/sdk/util/DoFnInfo.java | 67 +
.../cloud/dataflow/sdk/util/DoFnRunner.java | 60 +
.../cloud/dataflow/sdk/util/DoFnRunnerBase.java | 558 ++++
.../cloud/dataflow/sdk/util/DoFnRunners.java | 142 +
.../dataflow/sdk/util/ExecutableTrigger.java | 159 +
.../dataflow/sdk/util/ExecutionContext.java | 102 +
.../sdk/util/ExposedByteArrayInputStream.java | 51 +
.../sdk/util/ExposedByteArrayOutputStream.java | 115 +
.../dataflow/sdk/util/FileIOChannelFactory.java | 135 +
.../dataflow/sdk/util/FinishedTriggers.java | 42 +
.../sdk/util/FinishedTriggersBitSet.java | 68 +
.../dataflow/sdk/util/FinishedTriggersSet.java | 74 +
.../dataflow/sdk/util/GcpCredentialFactory.java | 45 +
.../dataflow/sdk/util/GcsIOChannelFactory.java | 86 +
.../cloud/dataflow/sdk/util/GcsStager.java | 53 +
.../google/cloud/dataflow/sdk/util/GcsUtil.java | 406 +++
.../util/GroupAlsoByWindowViaWindowSetDoFn.java | 104 +
.../sdk/util/GroupAlsoByWindowsDoFn.java | 58 +
.../GroupAlsoByWindowsViaOutputBufferDoFn.java | 98 +
.../dataflow/sdk/util/IOChannelFactory.java | 101 +
.../cloud/dataflow/sdk/util/IOChannelUtils.java | 204 ++
.../sdk/util/IllegalMutationException.java | 52 +
.../dataflow/sdk/util/InstanceBuilder.java | 269 ++
.../util/IntervalBoundedExponentialBackOff.java | 87 +
.../cloud/dataflow/sdk/util/KeyedWorkItem.java | 41 +
.../dataflow/sdk/util/KeyedWorkItemCoder.java | 120 +
.../cloud/dataflow/sdk/util/KeyedWorkItems.java | 120 +
.../sdk/util/LateDataDroppingDoFnRunner.java | 145 +
.../dataflow/sdk/util/MapAggregatorValues.java | 48 +
.../sdk/util/MergingActiveWindowSet.java | 544 ++++
.../cloud/dataflow/sdk/util/MimeTypes.java | 23 +
.../cloud/dataflow/sdk/util/MonitoringUtil.java | 233 ++
.../dataflow/sdk/util/MutationDetector.java | 31 +
.../dataflow/sdk/util/MutationDetectors.java | 182 ++
.../cloud/dataflow/sdk/util/NonEmptyPanes.java | 148 +
.../sdk/util/NonMergingActiveWindowSet.java | 85 +
.../sdk/util/NoopCredentialFactory.java | 38 +
.../dataflow/sdk/util/NoopPathValidator.java | 48 +
.../dataflow/sdk/util/NullSideInputReader.java | 61 +
.../dataflow/sdk/util/OutputReference.java | 42 +
.../sdk/util/PCollectionViewWindow.java | 67 +
.../dataflow/sdk/util/PCollectionViews.java | 426 +++
.../google/cloud/dataflow/sdk/util/PTuple.java | 160 +
.../cloud/dataflow/sdk/util/PackageUtil.java | 327 ++
.../dataflow/sdk/util/PaneInfoTracker.java | 151 +
.../cloud/dataflow/sdk/util/PathValidator.java | 47 +
.../sdk/util/PerKeyCombineFnRunner.java | 147 +
.../sdk/util/PerKeyCombineFnRunners.java | 257 ++
.../cloud/dataflow/sdk/util/PropertyNames.java | 107 +
.../dataflow/sdk/util/RandomAccessData.java | 352 ++
.../cloud/dataflow/sdk/util/ReduceFn.java | 128 +
.../sdk/util/ReduceFnContextFactory.java | 495 +++
.../cloud/dataflow/sdk/util/ReduceFnRunner.java | 843 +++++
.../sdk/util/ReifyTimestampAndWindowsDoFn.java | 46 +
.../cloud/dataflow/sdk/util/Reshuffle.java | 145 +
.../dataflow/sdk/util/ReshuffleTrigger.java | 61 +
.../sdk/util/RetryHttpRequestInitializer.java | 250 ++
.../dataflow/sdk/util/SerializableUtils.java | 159 +
.../cloud/dataflow/sdk/util/Serializer.java | 145 +
.../sdk/util/ShardingWritableByteChannel.java | 118 +
.../dataflow/sdk/util/SideInputReader.java | 48 +
.../dataflow/sdk/util/SimpleDoFnRunner.java | 55 +
.../google/cloud/dataflow/sdk/util/Stager.java | 29 +
.../cloud/dataflow/sdk/util/StreamUtils.java | 68 +
.../cloud/dataflow/sdk/util/StringUtils.java | 242 ++
.../google/cloud/dataflow/sdk/util/Structs.java | 384 +++
.../dataflow/sdk/util/SystemDoFnInternal.java | 37 +
.../cloud/dataflow/sdk/util/SystemReduceFn.java | 133 +
.../cloud/dataflow/sdk/util/TestCredential.java | 51 +
.../cloud/dataflow/sdk/util/TimeDomain.java | 41 +
.../cloud/dataflow/sdk/util/TimeUtil.java | 164 +
.../cloud/dataflow/sdk/util/TimerInternals.java | 269 ++
.../google/cloud/dataflow/sdk/util/Timers.java | 60 +
.../cloud/dataflow/sdk/util/Transport.java | 205 ++
.../sdk/util/TriggerContextFactory.java | 522 +++
.../cloud/dataflow/sdk/util/TriggerRunner.java | 223 ++
.../dataflow/sdk/util/UnownedInputStream.java | 76 +
.../dataflow/sdk/util/UnownedOutputStream.java | 56 +
.../sdk/util/UploadIdResponseInterceptor.java | 61 +
.../dataflow/sdk/util/UserCodeException.java | 94 +
.../dataflow/sdk/util/ValueWithRecordId.java | 154 +
.../google/cloud/dataflow/sdk/util/Values.java | 88 +
.../google/cloud/dataflow/sdk/util/VarInt.java | 115 +
.../cloud/dataflow/sdk/util/WatermarkHold.java | 450 +++
.../cloud/dataflow/sdk/util/Weighted.java | 27 +
.../cloud/dataflow/sdk/util/WeightedValue.java | 45 +
.../cloud/dataflow/sdk/util/WindowTracing.java | 36 +
.../cloud/dataflow/sdk/util/WindowedValue.java | 720 +++++
.../dataflow/sdk/util/WindowingInternals.java | 82 +
.../dataflow/sdk/util/WindowingStrategy.java | 268 ++
.../cloud/dataflow/sdk/util/ZipFiles.java | 294 ++
.../cloud/dataflow/sdk/util/common/Counter.java | 1103 +++++++
.../sdk/util/common/CounterProvider.java | 26 +
.../dataflow/sdk/util/common/CounterSet.java | 177 ++
.../util/common/ElementByteSizeObservable.java | 41 +
.../ElementByteSizeObservableIterable.java | 63 +
.../ElementByteSizeObservableIterator.java | 36 +
.../util/common/ElementByteSizeObserver.java | 92 +
.../sdk/util/common/PeekingReiterator.java | 98 +
.../sdk/util/common/ReflectHelpers.java | 209 ++
.../dataflow/sdk/util/common/Reiterable.java | 27 +
.../dataflow/sdk/util/common/Reiterator.java | 39 +
.../dataflow/sdk/util/common/package-info.java | 18 +
.../sdk/util/common/worker/StateSampler.java | 365 +++
.../sdk/util/common/worker/package-info.java | 18 +
.../cloud/dataflow/sdk/util/gcsfs/GcsPath.java | 619 ++++
.../dataflow/sdk/util/gcsfs/package-info.java | 18 +
.../cloud/dataflow/sdk/util/package-info.java | 18 +
.../util/state/AccumulatorCombiningState.java | 51 +
.../cloud/dataflow/sdk/util/state/BagState.java | 26 +
.../dataflow/sdk/util/state/CombiningState.java | 40 +
.../CopyOnAccessInMemoryStateInternals.java | 454 +++
.../sdk/util/state/InMemoryStateInternals.java | 414 +++
.../sdk/util/state/MergingStateAccessor.java | 40 +
.../dataflow/sdk/util/state/ReadableState.java | 53 +
.../cloud/dataflow/sdk/util/state/State.java | 30 +
.../dataflow/sdk/util/state/StateAccessor.java | 36 +
.../dataflow/sdk/util/state/StateContext.java | 41 +
.../dataflow/sdk/util/state/StateContexts.java | 107 +
.../dataflow/sdk/util/state/StateInternals.java | 55 +
.../dataflow/sdk/util/state/StateMerging.java | 254 ++
.../dataflow/sdk/util/state/StateNamespace.java | 54 +
.../sdk/util/state/StateNamespaceForTest.java | 63 +
.../sdk/util/state/StateNamespaces.java | 277 ++
.../dataflow/sdk/util/state/StateTable.java | 89 +
.../cloud/dataflow/sdk/util/state/StateTag.java | 96 +
.../dataflow/sdk/util/state/StateTags.java | 579 ++++
.../dataflow/sdk/util/state/ValueState.java | 35 +
.../sdk/util/state/WatermarkHoldState.java | 42 +
.../google/cloud/dataflow/sdk/values/KV.java | 130 +
.../cloud/dataflow/sdk/values/PBegin.java | 87 +
.../cloud/dataflow/sdk/values/PCollection.java | 250 ++
.../dataflow/sdk/values/PCollectionList.java | 238 ++
.../dataflow/sdk/values/PCollectionTuple.java | 264 ++
.../dataflow/sdk/values/PCollectionView.java | 64 +
.../google/cloud/dataflow/sdk/values/PDone.java | 47 +
.../cloud/dataflow/sdk/values/PInput.java | 56 +
.../cloud/dataflow/sdk/values/POutput.java | 76 +
.../dataflow/sdk/values/POutputValueBase.java | 102 +
.../cloud/dataflow/sdk/values/PValue.java | 38 +
.../cloud/dataflow/sdk/values/PValueBase.java | 155 +
.../dataflow/sdk/values/TimestampedValue.java | 155 +
.../cloud/dataflow/sdk/values/TupleTag.java | 196 ++
.../cloud/dataflow/sdk/values/TupleTagList.java | 148 +
.../dataflow/sdk/values/TypeDescriptor.java | 351 ++
.../cloud/dataflow/sdk/values/TypedPValue.java | 197 ++
.../cloud/dataflow/sdk/values/package-info.java | 52 +
sdks/java/core/src/main/proto/README.md | 27 +
.../main/proto/proto2_coder_test_messages.proto | 51 +
.../google/cloud/dataflow/sdk/sdk.properties | 5 +
.../cloud/dataflow/sdk/DataflowMatchers.java | 65 +
.../google/cloud/dataflow/sdk/PipelineTest.java | 296 ++
.../google/cloud/dataflow/sdk/TestUtils.java | 213 ++
.../cloud/dataflow/sdk/WindowMatchers.java | 137 +
.../dataflow/sdk/coders/AvroCoderTest.java | 754 +++++
.../sdk/coders/BigEndianIntegerCoderTest.java | 90 +
.../sdk/coders/BigEndianLongCoderTest.java | 94 +
.../dataflow/sdk/coders/ByteArrayCoderTest.java | 144 +
.../dataflow/sdk/coders/ByteCoderTest.java | 91 +
.../sdk/coders/ByteStringCoderTest.java | 121 +
.../dataflow/sdk/coders/CoderFactoriesTest.java | 100 +
.../dataflow/sdk/coders/CoderProvidersTest.java | 71 +
.../dataflow/sdk/coders/CoderRegistryTest.java | 521 +++
.../cloud/dataflow/sdk/coders/CoderTest.java | 78 +
.../sdk/coders/CollectionCoderTest.java | 93 +
.../dataflow/sdk/coders/CustomCoderTest.java | 135 +
.../dataflow/sdk/coders/DefaultCoderTest.java | 128 +
.../dataflow/sdk/coders/DelegateCoderTest.java | 141 +
.../dataflow/sdk/coders/DoubleCoderTest.java | 96 +
.../dataflow/sdk/coders/DurationCoderTest.java | 86 +
.../dataflow/sdk/coders/EntityCoderTest.java | 108 +
.../dataflow/sdk/coders/InstantCoderTest.java | 116 +
.../dataflow/sdk/coders/IterableCoderTest.java | 109 +
.../dataflow/sdk/coders/JAXBCoderTest.java | 99 +
.../cloud/dataflow/sdk/coders/KvCoderTest.java | 118 +
.../dataflow/sdk/coders/ListCoderTest.java | 134 +
.../cloud/dataflow/sdk/coders/MapCoderTest.java | 106 +
.../dataflow/sdk/coders/NullableCoderTest.java | 132 +
.../sdk/coders/PrintBase64Encodings.java | 81 +
.../dataflow/sdk/coders/Proto2CoderTest.java | 145 +
.../sdk/coders/SerializableCoderTest.java | 222 ++
.../cloud/dataflow/sdk/coders/SetCoderTest.java | 86 +
.../dataflow/sdk/coders/StandardCoderTest.java | 176 +
.../sdk/coders/StringDelegateCoderTest.java | 72 +
.../sdk/coders/StringUtf8CoderTest.java | 80 +
.../sdk/coders/StructuralByteArrayTest.java | 39 +
.../sdk/coders/TableRowJsonCoderTest.java | 86 +
.../sdk/coders/TextualIntegerCoderTest.java | 90 +
.../dataflow/sdk/coders/VarIntCoderTest.java | 91 +
.../dataflow/sdk/coders/VarLongCoderTest.java | 94 +
.../sdk/coders/protobuf/ProtoCoderTest.java | 182 ++
.../sdk/coders/protobuf/ProtobufUtilTest.java | 195 ++
.../sdk/io/AvroIOGeneratedClassTest.java | 374 +++
.../cloud/dataflow/sdk/io/AvroIOTest.java | 226 ++
.../cloud/dataflow/sdk/io/AvroSourceTest.java | 692 ++++
.../cloud/dataflow/sdk/io/BigQueryIOTest.java | 445 +++
.../io/BoundedReadFromUnboundedSourceTest.java | 132 +
.../dataflow/sdk/io/CompressedSourceTest.java | 430 +++
.../dataflow/sdk/io/CountingInputTest.java | 125 +
.../dataflow/sdk/io/CountingSourceTest.java | 216 ++
.../cloud/dataflow/sdk/io/DatastoreIOTest.java | 631 ++++
.../dataflow/sdk/io/FileBasedSinkTest.java | 512 +++
.../dataflow/sdk/io/FileBasedSourceTest.java | 914 ++++++
.../dataflow/sdk/io/OffsetBasedSourceTest.java | 278 ++
.../cloud/dataflow/sdk/io/PubsubIOTest.java | 233 ++
.../google/cloud/dataflow/sdk/io/ReadTest.java | 144 +
.../cloud/dataflow/sdk/io/TextIOTest.java | 562 ++++
.../google/cloud/dataflow/sdk/io/WriteTest.java | 341 ++
.../cloud/dataflow/sdk/io/XmlSinkTest.java | 235 ++
.../cloud/dataflow/sdk/io/XmlSourceTest.java | 822 +++++
.../sdk/io/bigtable/BigtableIOTest.java | 688 ++++
.../range/ByteKeyRangeEstimateFractionTest.java | 69 +
.../range/ByteKeyRangeInterpolateKeyTest.java | 73 +
.../dataflow/sdk/io/range/ByteKeyRangeTest.java | 396 +++
.../sdk/io/range/ByteKeyRangeTrackerTest.java | 118 +
.../dataflow/sdk/io/range/ByteKeyTest.java | 178 ++
.../sdk/io/range/OffsetRangeTrackerTest.java | 186 ++
.../com/google/cloud/dataflow/sdk/io/user.avsc | 10 +
.../DataflowPipelineDebugOptionsTest.java | 40 +
.../options/DataflowPipelineOptionsTest.java | 91 +
.../options/DataflowProfilingOptionsTest.java | 47 +
.../DataflowWorkerLoggingOptionsTest.java | 73 +
.../dataflow/sdk/options/GcpOptionsTest.java | 123 +
.../sdk/options/GoogleApiDebugOptionsTest.java | 147 +
.../sdk/options/PipelineOptionsFactoryTest.java | 1154 +++++++
.../sdk/options/PipelineOptionsTest.java | 126 +
.../options/PipelineOptionsValidatorTest.java | 310 ++
.../sdk/options/ProxyInvocationHandlerTest.java | 691 ++++
.../AggregatorPipelineExtractorTest.java | 228 ++
.../BlockingDataflowPipelineRunnerTest.java | 301 ++
.../sdk/runners/DataflowPipelineJobTest.java | 603 ++++
.../runners/DataflowPipelineRegistrarTest.java | 72 +
.../sdk/runners/DataflowPipelineRunnerTest.java | 1370 ++++++++
.../sdk/runners/DataflowPipelineTest.java | 44 +
.../runners/DataflowPipelineTranslatorTest.java | 889 ++++++
.../runners/DirectPipelineRegistrarTest.java | 69 +
.../sdk/runners/DirectPipelineRunnerTest.java | 210 ++
.../sdk/runners/DirectPipelineTest.java | 34 +
.../sdk/runners/PipelineRunnerTest.java | 82 +
.../dataflow/sdk/runners/TransformTreeTest.java | 194 ++
.../sdk/runners/dataflow/CustomSourcesTest.java | 273 ++
.../runners/dataflow/TestCountingSource.java | 235 ++
.../BoundedReadEvaluatorFactoryTest.java | 287 ++
.../ConsumerTrackingPipelineVisitorTest.java | 233 ++
.../inprocess/FlattenEvaluatorFactoryTest.java | 136 +
.../inprocess/ForwardingPTransformTest.java | 100 +
.../GroupByKeyEvaluatorFactoryTest.java | 178 ++
.../inprocess/InMemoryWatermarkManagerTest.java | 1111 +++++++
.../runners/inprocess/InProcessBundleTest.java | 143 +
.../runners/inprocess/InProcessCreateTest.java | 199 ++
.../InProcessEvaluationContextTest.java | 544 ++++
.../inprocess/InProcessPipelineRunnerTest.java | 77 +
.../InProcessSideInputContainerTest.java | 370 +++
.../inprocess/InProcessTimerInternalsTest.java | 131 +
.../KeyedPValueTrackingVisitorTest.java | 189 ++
.../sdk/runners/inprocess/MockClock.java | 60 +
.../ParDoMultiEvaluatorFactoryTest.java | 412 +++
.../ParDoSingleEvaluatorFactoryTest.java | 310 ++
.../TransformExecutorServicesTest.java | 134 +
.../inprocess/TransformExecutorTest.java | 312 ++
.../UnboundedReadEvaluatorFactoryTest.java | 327 ++
.../inprocess/ViewEvaluatorFactoryTest.java | 96 +
.../WatermarkCallbackExecutorTest.java | 126 +
.../sdk/testing/CoderPropertiesTest.java | 214 ++
.../sdk/testing/DataflowAssertTest.java | 326 ++
.../sdk/testing/DataflowJUnitTestRunner.java | 129 +
.../dataflow/sdk/testing/ExpectedLogs.java | 306 ++
.../dataflow/sdk/testing/ExpectedLogsTest.java | 153 +
.../sdk/testing/FastNanoClockAndSleeper.java | 47 +
.../testing/FastNanoClockAndSleeperTest.java | 47 +
.../sdk/testing/PCollectionViewTesting.java | 295 ++
.../sdk/testing/ResetDateTimeProvider.java | 41 +
.../sdk/testing/ResetDateTimeProviderTest.java | 55 +
.../sdk/testing/RestoreSystemProperties.java | 51 +
.../testing/RestoreSystemPropertiesTest.java | 50 +
.../sdk/testing/SerializableMatchersTest.java | 165 +
.../sdk/testing/SystemNanoTimeSleeper.java | 68 +
.../sdk/testing/SystemNanoTimeSleeperTest.java | 53 +
.../testing/TestDataflowPipelineRunnerTest.java | 317 ++
.../dataflow/sdk/testing/TestPipelineTest.java | 93 +
.../transforms/ApproximateQuantilesTest.java | 299 ++
.../sdk/transforms/ApproximateUniqueTest.java | 291 ++
.../dataflow/sdk/transforms/CombineFnsTest.java | 413 +++
.../dataflow/sdk/transforms/CombineTest.java | 1137 +++++++
.../dataflow/sdk/transforms/CountTest.java | 121 +
.../dataflow/sdk/transforms/CreateTest.java | 240 ++
.../sdk/transforms/DoFnContextTest.java | 68 +
.../DoFnDelegatingAggregatorTest.java | 143 +
.../sdk/transforms/DoFnReflectorTest.java | 493 +++
.../cloud/dataflow/sdk/transforms/DoFnTest.java | 206 ++
.../dataflow/sdk/transforms/DoFnTesterTest.java | 253 ++
.../sdk/transforms/DoFnWithContextTest.java | 225 ++
.../dataflow/sdk/transforms/FilterTest.java | 160 +
.../sdk/transforms/FlatMapElementsTest.java | 124 +
.../dataflow/sdk/transforms/FlattenTest.java | 369 +++
.../dataflow/sdk/transforms/GroupByKeyTest.java | 438 +++
.../IntraBundleParallelizationTest.java | 250 ++
.../cloud/dataflow/sdk/transforms/KeysTest.java | 83 +
.../dataflow/sdk/transforms/KvSwapTest.java | 91 +
.../sdk/transforms/MapElementsTest.java | 134 +
.../cloud/dataflow/sdk/transforms/MaxTest.java | 66 +
.../cloud/dataflow/sdk/transforms/MeanTest.java | 72 +
.../cloud/dataflow/sdk/transforms/MinTest.java | 66 +
.../cloud/dataflow/sdk/transforms/NoOpDoFn.java | 143 +
.../dataflow/sdk/transforms/PTransformTest.java | 41 +
.../dataflow/sdk/transforms/ParDoTest.java | 1541 +++++++++
.../dataflow/sdk/transforms/PartitionTest.java | 140 +
.../sdk/transforms/RemoveDuplicatesTest.java | 131 +
.../dataflow/sdk/transforms/SampleTest.java | 260 ++
.../sdk/transforms/SimpleStatsFnsTest.java | 129 +
.../cloud/dataflow/sdk/transforms/SumTest.java | 66 +
.../cloud/dataflow/sdk/transforms/TopTest.java | 259 ++
.../dataflow/sdk/transforms/ValuesTest.java | 93 +
.../cloud/dataflow/sdk/transforms/ViewTest.java | 1548 +++++++++
.../dataflow/sdk/transforms/WithKeysTest.java | 127 +
.../sdk/transforms/WithTimestampsTest.java | 210 ++
.../transforms/display/DisplayDataMatchers.java | 98 +
.../display/DisplayDataMatchersTest.java | 81 +
.../sdk/transforms/display/DisplayDataTest.java | 633 ++++
.../transforms/join/CoGbkResultCoderTest.java | 85 +
.../sdk/transforms/join/CoGbkResultTest.java | 124 +
.../sdk/transforms/join/CoGroupByKeyTest.java | 507 +++
.../sdk/transforms/join/UnionCoderTest.java | 48 +
.../sdk/transforms/windowing/AfterAllTest.java | 151 +
.../sdk/transforms/windowing/AfterEachTest.java | 122 +
.../transforms/windowing/AfterFirstTest.java | 175 +
.../sdk/transforms/windowing/AfterPaneTest.java | 126 +
.../windowing/AfterProcessingTimeTest.java | 157 +
.../AfterSynchronizedProcessingTimeTest.java | 121 +
.../windowing/AfterWatermarkTest.java | 338 ++
.../windowing/CalendarWindowsTest.java | 260 ++
.../windowing/DefaultTriggerTest.java | 176 +
.../transforms/windowing/FixedWindowsTest.java | 124 +
.../windowing/IntervalWindowTest.java | 94 +
.../windowing/OrFinallyTriggerTest.java | 209 ++
.../sdk/transforms/windowing/PaneInfoTest.java | 75 +
.../transforms/windowing/RepeatedlyTest.java | 128 +
.../sdk/transforms/windowing/SessionsTest.java | 156 +
.../windowing/SlidingWindowsTest.java | 193 ++
.../sdk/transforms/windowing/TriggerTest.java | 117 +
.../sdk/transforms/windowing/WindowTest.java | 226 ++
.../sdk/transforms/windowing/WindowingTest.java | 244 ++
.../cloud/dataflow/sdk/util/ApiSurfaceTest.java | 187 ++
...mptAndTimeBoundedExponentialBackOffTest.java | 212 ++
.../AttemptBoundedExponentialBackOffTest.java | 85 +
.../cloud/dataflow/sdk/util/AvroUtilsTest.java | 225 ++
.../sdk/util/BatchTimerInternalsTest.java | 116 +
.../sdk/util/BigQueryTableInserterTest.java | 239 ++
.../sdk/util/BigQueryTableRowIteratorTest.java | 255 ++
.../dataflow/sdk/util/BigQueryUtilTest.java | 479 +++
...BufferedElementCountingOutputStreamTest.java | 205 ++
.../cloud/dataflow/sdk/util/CoderUtilsTest.java | 229 ++
.../dataflow/sdk/util/CombineFnUtilTest.java | 62 +
.../sdk/util/CounterAggregatorTest.java | 253 ++
.../sdk/util/DataflowPathValidatorTest.java | 92 +
.../sdk/util/ExecutableTriggerTest.java | 130 +
.../util/ExposedByteArrayInputStreamTest.java | 78 +
.../util/ExposedByteArrayOutputStreamTest.java | 245 ++
.../sdk/util/FileIOChannelFactoryTest.java | 226 ++
.../sdk/util/FinishedTriggersBitSetTest.java | 54 +
.../sdk/util/FinishedTriggersProperties.java | 109 +
.../sdk/util/FinishedTriggersSetTest.java | 60 +
.../sdk/util/GcsIOChannelFactoryTest.java | 43 +
.../cloud/dataflow/sdk/util/GcsUtilTest.java | 490 +++
.../sdk/util/GroupAlsoByWindowsProperties.java | 718 +++++
...oupAlsoByWindowsViaOutputBufferDoFnTest.java | 111 +
.../dataflow/sdk/util/IOChannelUtilsTest.java | 94 +
.../dataflow/sdk/util/InstanceBuilderTest.java | 115 +
.../IntervalBoundedExponentialBackOffTest.java | 99 +
.../sdk/util/KeyedWorkItemCoderTest.java | 61 +
.../util/LateDataDroppingDoFnRunnerTest.java | 115 +
.../sdk/util/MergingActiveWindowSetTest.java | 175 +
.../dataflow/sdk/util/MonitoringUtilTest.java | 146 +
.../sdk/util/MutationDetectorsTest.java | 148 +
.../cloud/dataflow/sdk/util/PTupleTest.java | 40 +
.../dataflow/sdk/util/PackageUtilTest.java | 482 +++
.../dataflow/sdk/util/RandomAccessDataTest.java | 205 ++
.../dataflow/sdk/util/ReduceFnRunnerTest.java | 1049 ++++++
.../cloud/dataflow/sdk/util/ReduceFnTester.java | 776 +++++
.../cloud/dataflow/sdk/util/ReshuffleTest.java | 208 ++
.../dataflow/sdk/util/ReshuffleTriggerTest.java | 58 +
.../util/RetryHttpRequestInitializerTest.java | 296 ++
.../sdk/util/SerializableUtilsTest.java | 165 +
.../cloud/dataflow/sdk/util/SerializerTest.java | 162 +
.../dataflow/sdk/util/SimpleDoFnRunnerTest.java | 86 +
.../dataflow/sdk/util/StreamUtilsTest.java | 71 +
.../dataflow/sdk/util/StringUtilsTest.java | 145 +
.../cloud/dataflow/sdk/util/StructsTest.java | 206 ++
.../cloud/dataflow/sdk/util/TimeUtilTest.java | 73 +
.../dataflow/sdk/util/TimerInternalsTest.java | 52 +
.../cloud/dataflow/sdk/util/TriggerTester.java | 585 ++++
.../sdk/util/UnownedInputStreamTest.java | 76 +
.../sdk/util/UnownedOutputStreamTest.java | 57 +
.../util/UploadIdResponseInterceptorTest.java | 99 +
.../sdk/util/UserCodeExceptionTest.java | 176 +
.../cloud/dataflow/sdk/util/VarIntTest.java | 277 ++
.../dataflow/sdk/util/WindowedValueTest.java | 57 +
.../cloud/dataflow/sdk/util/ZipFilesTest.java | 311 ++
.../sdk/util/common/CounterSetTest.java | 225 ++
.../dataflow/sdk/util/common/CounterTest.java | 589 ++++
.../sdk/util/common/CounterTestUtils.java | 56 +
.../sdk/util/common/ReflectHelpersTest.java | 126 +
.../dataflow/sdk/util/gcsfs/GcsPathTest.java | 333 ++
.../CopyOnAccessInMemoryStateInternalsTest.java | 553 ++++
.../util/state/InMemoryStateInternalsTest.java | 348 ++
.../sdk/util/state/StateNamespacesTest.java | 129 +
.../dataflow/sdk/util/state/StateTagTest.java | 173 +
.../cloud/dataflow/sdk/values/KVTest.java | 112 +
.../sdk/values/PCollectionListTest.java | 47 +
.../sdk/values/PCollectionTupleTest.java | 93 +
.../cloud/dataflow/sdk/values/PDoneTest.java | 102 +
.../cloud/dataflow/sdk/values/TupleTagTest.java | 87 +
.../dataflow/sdk/values/TypeDescriptorTest.java | 193 ++
.../dataflow/sdk/values/TypedPValueTest.java | 164 +
.../PipelineOptionsFactoryJava8Test.java | 90 +
1451 files changed, 156174 insertions(+), 156174 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index b8944d6..3145c40 100644
--- a/pom.xml
+++ b/pom.xml
@@ -125,7 +125,7 @@
<packaging>pom</packaging>
<modules>
- <module>sdk</module>
+ <module>sdks/java/core</module>
<module>runners</module>
<module>examples</module>
<module>maven-archetypes</module>
[20/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/ParDo.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/ParDo.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/ParDo.java
deleted file mode 100644
index c77ac44..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/ParDo.java
+++ /dev/null
@@ -1,1321 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.display.DisplayData.Builder;
-import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
-import com.google.cloud.dataflow.sdk.util.DirectModeExecutionContext;
-import com.google.cloud.dataflow.sdk.util.DirectSideInputReader;
-import com.google.cloud.dataflow.sdk.util.DoFnRunner;
-import com.google.cloud.dataflow.sdk.util.DoFnRunnerBase;
-import com.google.cloud.dataflow.sdk.util.DoFnRunners;
-import com.google.cloud.dataflow.sdk.util.IllegalMutationException;
-import com.google.cloud.dataflow.sdk.util.MutationDetector;
-import com.google.cloud.dataflow.sdk.util.MutationDetectors;
-import com.google.cloud.dataflow.sdk.util.PTuple;
-import com.google.cloud.dataflow.sdk.util.SerializableUtils;
-import com.google.cloud.dataflow.sdk.util.SideInputReader;
-import com.google.cloud.dataflow.sdk.util.StringUtils;
-import com.google.cloud.dataflow.sdk.util.UserCodeException;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionTuple;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.cloud.dataflow.sdk.values.TupleTagList;
-import com.google.cloud.dataflow.sdk.values.TypedPValue;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Maps;
-
-import java.io.Serializable;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ConcurrentMap;
-
-import javax.annotation.Nullable;
-
-/**
- * {@link ParDo} is the core element-wise transform in Google Cloud
- * Dataflow, invoking a user-specified function on each of the elements of the input
- * {@link PCollection} to produce zero or more output elements, all
- * of which are collected into the output {@link PCollection}.
- *
- * <p>Elements are processed independently, and possibly in parallel across
- * distributed cloud resources.
- *
- * <p>The {@link ParDo} processing style is similar to what happens inside
- * the "Mapper" or "Reducer" class of a MapReduce-style algorithm.
- *
- * <h2>{@link DoFn DoFns}</h2>
- *
- * <p>The function to use to process each element is specified by a
- * {@link DoFn DoFn<InputT, OutputT>}, primarily via its
- * {@link DoFn#processElement processElement} method. The {@link DoFn} may also
- * override the default implementations of {@link DoFn#startBundle startBundle}
- * and {@link DoFn#finishBundle finishBundle}.
- *
- * <p>Conceptually, when a {@link ParDo} transform is executed, the
- * elements of the input {@link PCollection} are first divided up
- * into some number of "bundles". These are farmed off to distributed
- * worker machines (or run locally, if using the {@link DirectPipelineRunner}).
- * For each bundle of input elements processing proceeds as follows:
- *
- * <ol>
- * <li>A fresh instance of the argument {@link DoFn} is created on a worker. This may
- * be through deserialization or other means. If the {@link DoFn} subclass
- * does not override {@link DoFn#startBundle startBundle} or
- * {@link DoFn#finishBundle finishBundle} then this may be optimized since
- * it cannot observe the start and end of a bundle.</li>
- * <li>The {@link DoFn DoFn's} {@link DoFn#startBundle} method is called to
- * initialize it. If this method is not overridden, the call may be optimized
- * away.</li>
- * <li>The {@link DoFn DoFn's} {@link DoFn#processElement} method
- * is called on each of the input elements in the bundle.</li>
- * <li>The {@link DoFn DoFn's} {@link DoFn#finishBundle} method is called
- * to complete its work. After {@link DoFn#finishBundle} is called, the
- * framework will never again invoke any of these three processing methods.
- * If this method is not overridden, this call may be optimized away.</li>
- * </ol>
- *
- * <p>Each of the calls to any of the {@link DoFn DoFn's} processing
- * methods can produce zero or more output elements. All of the
- * output elements from all of the {@link DoFn} instances
- * are included in the output {@link PCollection}.
- *
- * <p>For example:
- *
- * <pre> {@code
- * PCollection<String> lines = ...;
- * PCollection<String> words =
- * lines.apply(ParDo.of(new DoFn<String, String>() {
- * public void processElement(ProcessContext c) {
- * String line = c.element();
- * for (String word : line.split("[^a-zA-Z']+")) {
- * c.output(word);
- * }
- * }}));
- * PCollection<Integer> wordLengths =
- * words.apply(ParDo.of(new DoFn<String, Integer>() {
- * public void processElement(ProcessContext c) {
- * String word = c.element();
- * Integer length = word.length();
- * c.output(length);
- * }}));
- * } </pre>
- *
- * <p>Each output element has the same timestamp and is in the same windows
- * as its corresponding input element, and the output {@code PCollection}
- * has the same {@link WindowFn} associated with it as the input.
- *
- * <h2>Naming {@link ParDo ParDo} transforms</h2>
- *
- * <p>The name of a transform is used to provide a name for any node in the
- * {@link Pipeline} graph resulting from application of the transform.
- * It is best practice to provide a name at the time of application,
- * via {@link PCollection#apply(String, PTransform)}. Otherwise,
- * a unique name - which may not be stable across pipeline revisions -
- * will be generated, based on the transform name.
- *
- * <p>If a {@link ParDo} is applied exactly once inlined, then
- * it can be given a name via {@link #named}. For example:
- *
- * <pre> {@code
- * PCollection<String> words =
- * lines.apply(ParDo.named("ExtractWords")
- * .of(new DoFn<String, String>() { ... }));
- * PCollection<Integer> wordLengths =
- * words.apply(ParDo.named("ComputeWordLengths")
- * .of(new DoFn<String, Integer>() { ... }));
- * } </pre>
- *
- * <h2>Side Inputs</h2>
- *
- * <p>While a {@link ParDo} processes elements from a single "main input"
- * {@link PCollection}, it can take additional "side input"
- * {@link PCollectionView PCollectionViews}. These side input
- * {@link PCollectionView PCollectionViews} express styles of accessing
- * {@link PCollection PCollections} computed by earlier pipeline operations,
- * passed in to the {@link ParDo} transform using
- * {@link #withSideInputs}, and their contents accessible to each of
- * the {@link DoFn} operations via {@link DoFn.ProcessContext#sideInput sideInput}.
- * For example:
- *
- * <pre> {@code
- * PCollection<String> words = ...;
- * PCollection<Integer> maxWordLengthCutOff = ...; // Singleton PCollection
- * final PCollectionView<Integer> maxWordLengthCutOffView =
- * maxWordLengthCutOff.apply(View.<Integer>asSingleton());
- * PCollection<String> wordsBelowCutOff =
- * words.apply(ParDo.withSideInputs(maxWordLengthCutOffView)
- * .of(new DoFn<String, String>() {
- * public void processElement(ProcessContext c) {
- * String word = c.element();
- * int lengthCutOff = c.sideInput(maxWordLengthCutOffView);
- * if (word.length() <= lengthCutOff) {
- * c.output(word);
- * }
- * }}));
- * } </pre>
- *
- * <h2>Side Outputs</h2>
- *
- * <p>Optionally, a {@link ParDo} transform can produce multiple
- * output {@link PCollection PCollections}, both a "main output"
- * {@code PCollection<OutputT>} plus any number of "side output"
- * {@link PCollection PCollections}, each keyed by a distinct {@link TupleTag},
- * and bundled in a {@link PCollectionTuple}. The {@link TupleTag TupleTags}
- * to be used for the output {@link PCollectionTuple} are specified by
- * invoking {@link #withOutputTags}. Unconsumed side outputs do not
- * necessarily need to be explicitly specified, even if the {@link DoFn}
- * generates them. Within the {@link DoFn}, an element is added to the
- * main output {@link PCollection} as normal, using
- * {@link DoFn.Context#output}, while an element is added to a side output
- * {@link PCollection} using {@link DoFn.Context#sideOutput}. For example:
- *
- * <pre> {@code
- * PCollection<String> words = ...;
- * // Select words whose length is below a cut off,
- * // plus the lengths of words that are above the cut off.
- * // Also select words starting with "MARKER".
- * final int wordLengthCutOff = 10;
- * // Create tags to use for the main and side outputs.
- * final TupleTag<String> wordsBelowCutOffTag =
- * new TupleTag<String>(){};
- * final TupleTag<Integer> wordLengthsAboveCutOffTag =
- * new TupleTag<Integer>(){};
- * final TupleTag<String> markedWordsTag =
- * new TupleTag<String>(){};
- * PCollectionTuple results =
- * words.apply(
- * ParDo
- * // Specify the main and consumed side output tags of the
- * // PCollectionTuple result:
- * .withOutputTags(wordsBelowCutOffTag,
- * TupleTagList.of(wordLengthsAboveCutOffTag)
- * .and(markedWordsTag))
- * .of(new DoFn<String, String>() {
- * // Create a tag for the unconsumed side output.
- * final TupleTag<String> specialWordsTag =
- * new TupleTag<String>(){};
- * public void processElement(ProcessContext c) {
- * String word = c.element();
- * if (word.length() <= wordLengthCutOff) {
- * // Emit this short word to the main output.
- * c.output(word);
- * } else {
- * // Emit this long word's length to a side output.
- * c.sideOutput(wordLengthsAboveCutOffTag, word.length());
- * }
- * if (word.startsWith("MARKER")) {
- * // Emit this word to a different side output.
- * c.sideOutput(markedWordsTag, word);
- * }
- * if (word.startsWith("SPECIAL")) {
- * // Emit this word to the unconsumed side output.
- * c.sideOutput(specialWordsTag, word);
- * }
- * }}));
- * // Extract the PCollection results, by tag.
- * PCollection<String> wordsBelowCutOff =
- * results.get(wordsBelowCutOffTag);
- * PCollection<Integer> wordLengthsAboveCutOff =
- * results.get(wordLengthsAboveCutOffTag);
- * PCollection<String> markedWords =
- * results.get(markedWordsTag);
- * } </pre>
- *
- * <h2>Properties May Be Specified In Any Order</h2>
- *
- * <p>Several properties can be specified for a {@link ParDo}
- * {@link PTransform}, including name, side inputs, side output tags,
- * and {@link DoFn} to invoke. Only the {@link DoFn} is required; the
- * name is encouraged but not required, and side inputs and side
- * output tags are only specified when they're needed. These
- * properties can be specified in any order, as long as they're
- * specified before the {@link ParDo} {@link PTransform} is applied.
- *
- * <p>The approach used to allow these properties to be specified in
- * any order, with some properties omitted, is to have each of the
- * property "setter" methods defined as static factory methods on
- * {@link ParDo} itself, which return an instance of either
- * {@link ParDo.Unbound} or
- * {@link ParDo.Bound} nested classes, each of which offers
- * property setter instance methods to enable setting additional
- * properties. {@link ParDo.Bound} is used for {@link ParDo}
- * transforms whose {@link DoFn} is specified and whose input and
- * output static types have been bound. {@link ParDo.Unbound ParDo.Unbound} is used
- * for {@link ParDo} transforms that have not yet had their
- * {@link DoFn} specified. Only {@link ParDo.Bound} instances can be
- * applied.
- *
- * <p>Another benefit of this approach is that it reduces the number
- * of type parameters that need to be specified manually. In
- * particular, the input and output types of the {@link ParDo}
- * {@link PTransform} are inferred automatically from the type
- * parameters of the {@link DoFn} argument passed to {@link ParDo#of}.
- *
- * <h2>Output Coders</h2>
- *
- * <p>By default, the {@link Coder Coder<OutputT>} for the
- * elements of the main output {@link PCollection PCollection<OutputT>} is
- * inferred from the concrete type of the {@link DoFn DoFn<InputT, OutputT>}.
- *
- * <p>By default, the {@link Coder Coder<SideOutputT>} for the elements of
- * a side output {@link PCollection PCollection<SideOutputT>} is inferred
- * from the concrete type of the corresponding {@link TupleTag TupleTag<SideOutputT>}.
- * To be successful, the {@link TupleTag} should be created as an instance
- * of a trivial anonymous subclass, with {@code {}} suffixed to the
- * constructor call. Such uses block Java's generic type parameter
- * inference, so the {@code <X>} argument must be provided explicitly.
- * For example:
- * <pre> {@code
- * // A TupleTag to use for a side input can be written concisely:
- * final TupleTag<Integer> sideInputTag = new TupleTag<>();
- * // A TupleTag to use for a side output should be written with "{}",
- * // and explicit generic parameter type:
- * final TupleTag<String> sideOutputTag = new TupleTag<String>(){};
- * } </pre>
- * This style of {@code TupleTag} instantiation is used in the example of
- * multiple side outputs, above.
- *
- * <h2>Serializability of {@link DoFn DoFns}</h2>
- *
- * <p>A {@link DoFn} passed to a {@link ParDo} transform must be
- * {@link Serializable}. This allows the {@link DoFn} instance
- * created in this "main program" to be sent (in serialized form) to
- * remote worker machines and reconstituted for each bundle of elements
- * of the input {@link PCollection} being processed. A {@link DoFn}
- * can have instance variable state, and non-transient instance
- * variable state will be serialized in the main program and then
- * deserialized on remote worker machines for each bundle of elements
- * to process.
- *
- * <p>To aid in ensuring that {@link DoFn DoFns} are properly
- * {@link Serializable}, even local execution using the
- * {@link DirectPipelineRunner} will serialize and then deserialize
- * {@link DoFn DoFns} before executing them on a bundle.
- *
- * <p>{@link DoFn DoFns} expressed as anonymous inner classes can be
- * convenient, but due to a quirk in Java's rules for serializability,
- * non-static inner or nested classes (including anonymous inner
- * classes) automatically capture their enclosing class's instance in
- * their serialized state. This can lead to including much more than
- * intended in the serialized state of a {@link DoFn}, or even things
- * that aren't {@link Serializable}.
- *
- * <p>There are two ways to avoid unintended serialized state in a
- * {@link DoFn}:
- *
- * <ul>
- *
- * <li>Define the {@link DoFn} as a named, static class.
- *
- * <li>Define the {@link DoFn} as an anonymous inner class inside of
- * a static method.
- *
- * </ul>
- *
- * <p>Both of these approaches ensure that there is no implicit enclosing
- * instance serialized along with the {@link DoFn} instance.
- *
- * <p>Prior to Java 8, any local variables of the enclosing
- * method referenced from within an anonymous inner class need to be
- * marked as {@code final}. If defining the {@link DoFn} as a named
- * static class, such variables would be passed as explicit
- * constructor arguments and stored in explicit instance variables.
- *
- * <p>There are three main ways to initialize the state of a
- * {@link DoFn} instance processing a bundle:
- *
- * <ul>
- *
- * <li>Define instance variable state (including implicit instance
- * variables holding final variables captured by an anonymous inner
- * class), initialized by the {@link DoFn}'s constructor (which is
- * implicit for an anonymous inner class). This state will be
- * automatically serialized and then deserialized in the {@code DoFn}
- * instance created for each bundle. This method is good for state
- * known when the original {@code DoFn} is created in the main
- * program, if it's not overly large.
- *
- * <li>Compute the state as a singleton {@link PCollection} and pass it
- * in as a side input to the {@link DoFn}. This is good if the state
- * needs to be computed by the pipeline, or if the state is very large
- * and so is best read from file(s) rather than sent as part of the
- * {@code DoFn}'s serialized state.
- *
- * <li>Initialize the state in each {@link DoFn} instance, in
- * {@link DoFn#startBundle}. This is good if the initialization
- * doesn't depend on any information known only by the main program or
- * computed by earlier pipeline operations, but is the same for all
- * instances of this {@link DoFn} for all program executions, say
- * setting up empty caches or initializing constant data.
- *
- * </ul>
- *
- * <h2>No Global Shared State</h2>
- *
- * <p>{@link ParDo} operations are intended to be able to run in
- * parallel across multiple worker machines. This precludes easy
- * sharing and updating mutable state across those machines. There is
- * no support in the Google Cloud Dataflow system for communicating
- * and synchronizing updates to shared state across worker machines,
- * so programs should not access any mutable static variable state in
- * their {@link DoFn}, without understanding that the Java processes
- * for the main program and workers will each have its own independent
- * copy of such state, and there won't be any automatic copying of
- * that state across Java processes. All information should be
- * communicated to {@link DoFn} instances via main and side inputs and
- * serialized state, and all output should be communicated from a
- * {@link DoFn} instance via main and side outputs, in the absence of
- * external communication mechanisms written by user code.
- *
- * <h2>Fault Tolerance</h2>
- *
- * <p>In a distributed system, things can fail: machines can crash,
- * machines can be unable to communicate across the network, etc.
- * While individual failures are rare, the larger the job, the greater
- * the chance that something, somewhere, will fail. The Google Cloud
- * Dataflow service strives to mask such failures automatically,
- * principally by retrying failed {@link DoFn} bundles. This means
- * that a {@code DoFn} instance might process a bundle partially, then
- * crash for some reason, then be rerun (often on a different worker
- * machine) on that same bundle and on the same elements as before.
- * Sometimes two or more {@link DoFn} instances will be running on the
- * same bundle simultaneously, with the system taking the results of
- * the first instance to complete successfully. Consequently, the
- * code in a {@link DoFn} needs to be written such that these
- * duplicate (sequential or concurrent) executions do not cause
- * problems. If the outputs of a {@link DoFn} are a pure function of
- * its inputs, then this requirement is satisfied. However, if a
- * {@link DoFn DoFn's} execution has external side-effects, such as performing
- * updates to external HTTP services, then the {@link DoFn DoFn's} code
- * needs to take care to ensure that those updates are idempotent and
- * that concurrent updates are acceptable. This property can be
- * difficult to achieve, so it is advisable to strive to keep
- * {@link DoFn DoFns} as pure functions as much as possible.
- *
- * <h2>Optimization</h2>
- *
- * <p>The Google Cloud Dataflow service automatically optimizes a
- * pipeline before it is executed. A key optimization, <i>fusion</i>,
- * relates to {@link ParDo} operations. If one {@link ParDo} operation produces a
- * {@link PCollection} that is then consumed as the main input of another
- * {@link ParDo} operation, the two {@link ParDo} operations will be <i>fused</i>
- * together into a single ParDo operation and run in a single pass;
- * this is "producer-consumer fusion". Similarly, if
- * two or more ParDo operations have the same {@link PCollection} main input,
- * they will be fused into a single {@link ParDo} that makes just one pass
- * over the input {@link PCollection}; this is "sibling fusion".
- *
- * <p>If after fusion there are no more unfused references to a
- * {@link PCollection} (e.g., one between a producer ParDo and a consumer
- * {@link ParDo}), the {@link PCollection} itself is "fused away" and won't ever be
- * written to disk, saving all the I/O and space expense of
- * constructing it.
- *
- * <p>The Google Cloud Dataflow service applies fusion as much as
- * possible, greatly reducing the cost of executing pipelines. As a
- * result, it is essentially "free" to write {@link ParDo} operations in a
- * very modular, composable style, each {@link ParDo} operation doing one
- * clear task, and stringing together sequences of {@link ParDo} operations to
- * get the desired overall effect. Such programs can be easier to
- * understand, easier to unit-test, easier to extend and evolve, and
- * easier to reuse in new programs. The predefined library of
- * PTransforms that come with Google Cloud Dataflow makes heavy use of
- * this modular, composable style, trusting to the Google Cloud
- * Dataflow service's optimizer to "flatten out" all the compositions
- * into highly optimized stages.
- *
- * @see <a href="https://cloud.google.com/dataflow/model/par-do">the web
- * documentation for ParDo</a>
- */
-public class ParDo {
-
- /**
- * Creates a {@link ParDo} {@link PTransform} with the given name.
- *
- * <p>See the discussion of naming above for more explanation.
- *
- * <p>The resulting {@link PTransform} is incomplete, and its
- * input/output types are not yet bound. Use
- * {@link ParDo.Unbound#of} to specify the {@link DoFn} to
- * invoke, which will also bind the input/output types of this
- * {@link PTransform}.
- */
- public static Unbound named(String name) {
- return new Unbound().named(name);
- }
-
- /**
- * Creates a {@link ParDo} {@link PTransform} with the given
- * side inputs.
- *
- * <p>Side inputs are {@link PCollectionView PCollectionViews}, whose contents are
- * computed during pipeline execution and then made accessible to
- * {@link DoFn} code via {@link DoFn.ProcessContext#sideInput sideInput}. Each
- * invocation of the {@link DoFn} receives the same values for these
- * side inputs.
- *
- * <p>See the discussion of Side Inputs above for more explanation.
- *
- * <p>The resulting {@link PTransform} is incomplete, and its
- * input/output types are not yet bound. Use
- * {@link ParDo.Unbound#of} to specify the {@link DoFn} to
- * invoke, which will also bind the input/output types of this
- * {@link PTransform}.
- */
- public static Unbound withSideInputs(PCollectionView<?>... sideInputs) {
- return new Unbound().withSideInputs(sideInputs);
- }
-
- /**
- * Creates a {@link ParDo} with the given side inputs.
- *
- * <p>Side inputs are {@link PCollectionView}s, whose contents are
- * computed during pipeline execution and then made accessible to
- * {@code DoFn} code via {@link DoFn.ProcessContext#sideInput sideInput}.
- *
- * <p>See the discussion of Side Inputs above for more explanation.
- *
- * <p>The resulting {@link PTransform} is incomplete, and its
- * input/output types are not yet bound. Use
- * {@link ParDo.Unbound#of} to specify the {@link DoFn} to
- * invoke, which will also bind the input/output types of this
- * {@link PTransform}.
- */
- public static Unbound withSideInputs(
- Iterable<? extends PCollectionView<?>> sideInputs) {
- return new Unbound().withSideInputs(sideInputs);
- }
-
- /**
- * Creates a multi-output {@link ParDo} {@link PTransform} whose
- * output {@link PCollection}s will be referenced using the given main
- * output and side output tags.
- *
- * <p>{@link TupleTag TupleTags} are used to name (with its static element
- * type {@code T}) each main and side output {@code PCollection<T>}.
- * This {@link PTransform PTransform's} {@link DoFn} emits elements to the main
- * output {@link PCollection} as normal, using
- * {@link DoFn.Context#output}. The {@link DoFn} emits elements to
- * a side output {@code PCollection} using
- * {@link DoFn.Context#sideOutput}, passing that side output's tag
- * as an argument. The result of invoking this {@link PTransform}
- * will be a {@link PCollectionTuple}, and any of the main and
- * side output {@code PCollection}s can be retrieved from it via
- * {@link PCollectionTuple#get}, passing the output's tag as an
- * argument.
- *
- * <p>See the discussion of Side Outputs above for more explanation.
- *
- * <p>The resulting {@link PTransform} is incomplete, and its input
- * type is not yet bound. Use {@link ParDo.UnboundMulti#of}
- * to specify the {@link DoFn} to invoke, which will also bind the
- * input type of this {@link PTransform}.
- */
- public static <OutputT> UnboundMulti<OutputT> withOutputTags(
- TupleTag<OutputT> mainOutputTag,
- TupleTagList sideOutputTags) {
- return new Unbound().withOutputTags(mainOutputTag, sideOutputTags);
- }
-
- /**
- * Creates a {@link ParDo} {@link PTransform} that will invoke the
- * given {@link DoFn} function.
- *
- * <p>The resulting {@link PTransform PTransform's} types have been bound, with the
- * input being a {@code PCollection<InputT>} and the output a
- * {@code PCollection<OutputT>}, inferred from the types of the argument
- * {@code DoFn<InputT, OutputT>}. It is ready to be applied, or further
- * properties can be set on it first.
- */
- public static <InputT, OutputT> Bound<InputT, OutputT> of(DoFn<InputT, OutputT> fn) {
- return new Unbound().of(fn);
- }
-
- private static <InputT, OutputT> DoFn<InputT, OutputT>
- adapt(DoFnWithContext<InputT, OutputT> fn) {
- return DoFnReflector.of(fn.getClass()).toDoFn(fn);
- }
-
- /**
- * Creates a {@link ParDo} {@link PTransform} that will invoke the
- * given {@link DoFnWithContext} function.
- *
- * <p>The resulting {@link PTransform PTransform's} types have been bound, with the
- * input being a {@code PCollection<InputT>} and the output a
- * {@code PCollection<OutputT>}, inferred from the types of the argument
- * {@code DoFnWithContext<InputT, OutputT>}. It is ready to be applied, or further
- * properties can be set on it first.
- *
- * <p>{@link DoFnWithContext} is an experimental alternative to
- * {@link DoFn} which simplifies accessing the window of the element.
- */
- @Experimental
- public static <InputT, OutputT> Bound<InputT, OutputT> of(DoFnWithContext<InputT, OutputT> fn) {
- return of(adapt(fn));
- }
-
- /**
- * An incomplete {@link ParDo} transform, with unbound input/output types.
- *
- * <p>Before being applied, {@link ParDo.Unbound#of} must be
- * invoked to specify the {@link DoFn} to invoke, which will also
- * bind the input/output types of this {@link PTransform}.
- */
- public static class Unbound {
- private final String name;
- private final List<PCollectionView<?>> sideInputs;
-
- Unbound() {
- this(null, ImmutableList.<PCollectionView<?>>of());
- }
-
- Unbound(String name, List<PCollectionView<?>> sideInputs) {
- this.name = name;
- this.sideInputs = sideInputs;
- }
-
- /**
- * Returns a new {@link ParDo} transform that's like this
- * transform but with the specified name. Does not modify this
- * transform. The resulting transform is still incomplete.
- *
- * <p>See the discussion of naming above for more explanation.
- */
- public Unbound named(String name) {
- return new Unbound(name, sideInputs);
- }
-
- /**
- * Returns a new {@link ParDo} transform that's like this
- * transform but with the specified additional side inputs.
- * Does not modify this transform. The resulting transform is
- * still incomplete.
- *
- * <p>See the discussion of Side Inputs above and on
- * {@link ParDo#withSideInputs} for more explanation.
- */
- public Unbound withSideInputs(PCollectionView<?>... sideInputs) {
- return withSideInputs(Arrays.asList(sideInputs));
- }
-
- /**
- * Returns a new {@link ParDo} transform that is like this
- * transform but with the specified additional side inputs. Does not modify
- * this transform. The resulting transform is still incomplete.
- *
- * <p>See the discussion of Side Inputs above and on
- * {@link ParDo#withSideInputs} for more explanation.
- */
- public Unbound withSideInputs(
- Iterable<? extends PCollectionView<?>> sideInputs) {
- ImmutableList.Builder<PCollectionView<?>> builder = ImmutableList.builder();
- builder.addAll(this.sideInputs);
- builder.addAll(sideInputs);
- return new Unbound(name, builder.build());
- }
-
- /**
- * Returns a new multi-output {@link ParDo} transform that's like
- * this transform but with the specified main and side output
- * tags. Does not modify this transform. The resulting transform
- * is still incomplete.
- *
- * <p>See the discussion of Side Outputs above and on
- * {@link ParDo#withOutputTags} for more explanation.
- */
- public <OutputT> UnboundMulti<OutputT> withOutputTags(TupleTag<OutputT> mainOutputTag,
- TupleTagList sideOutputTags) {
- return new UnboundMulti<>(
- name, sideInputs, mainOutputTag, sideOutputTags);
- }
-
- /**
- * Returns a new {@link ParDo} {@link PTransform} that's like this
- * transform but that will invoke the given {@link DoFn}
- * function, and that has its input and output types bound. Does
- * not modify this transform. The resulting {@link PTransform} is
- * sufficiently specified to be applied, but more properties can
- * still be specified.
- */
- public <InputT, OutputT> Bound<InputT, OutputT> of(DoFn<InputT, OutputT> fn) {
- return new Bound<>(name, sideInputs, fn);
- }
-
- /**
- * Returns a new {@link ParDo} {@link PTransform} that's like this
- * transform but which will invoke the given {@link DoFnWithContext}
- * function, and which has its input and output types bound. Does
- * not modify this transform. The resulting {@link PTransform} is
- * sufficiently specified to be applied, but more properties can
- * still be specified.
- */
- public <InputT, OutputT> Bound<InputT, OutputT> of(DoFnWithContext<InputT, OutputT> fn) {
- return of(adapt(fn));
- }
- }
-
- /**
- * A {@link PTransform} that, when applied to a {@code PCollection<InputT>},
- * invokes a user-specified {@code DoFn<InputT, OutputT>} on all its elements,
- * with all its outputs collected into an output
- * {@code PCollection<OutputT>}.
- *
- * <p>A multi-output form of this transform can be created with
- * {@link ParDo.Bound#withOutputTags}.
- *
- * @param <InputT> the type of the (main) input {@link PCollection} elements
- * @param <OutputT> the type of the (main) output {@link PCollection} elements
- */
- public static class Bound<InputT, OutputT>
- extends PTransform<PCollection<? extends InputT>, PCollection<OutputT>> {
- // Inherits name.
- private final List<PCollectionView<?>> sideInputs;
- private final DoFn<InputT, OutputT> fn;
-
- Bound(String name,
- List<PCollectionView<?>> sideInputs,
- DoFn<InputT, OutputT> fn) {
- super(name);
- this.sideInputs = sideInputs;
- this.fn = SerializableUtils.clone(fn);
- }
-
- /**
- * Returns a new {@link ParDo} {@link PTransform} that's like this
- * {@link PTransform} but with the specified name. Does not
- * modify this {@link PTransform}.
- *
- * <p>See the discussion of Naming above for more explanation.
- */
- public Bound<InputT, OutputT> named(String name) {
- return new Bound<>(name, sideInputs, fn);
- }
-
- /**
- * Returns a new {@link ParDo} {@link PTransform} that's like this
- * {@link PTransform} but with the specified additional side inputs. Does not
- * modify this {@link PTransform}.
- *
- * <p>See the discussion of Side Inputs above and on
- * {@link ParDo#withSideInputs} for more explanation.
- */
- public Bound<InputT, OutputT> withSideInputs(PCollectionView<?>... sideInputs) {
- return withSideInputs(Arrays.asList(sideInputs));
- }
-
- /**
- * Returns a new {@link ParDo} {@link PTransform} that's like this
- * {@link PTransform} but with the specified additional side inputs. Does not
- * modify this {@link PTransform}.
- *
- * <p>See the discussion of Side Inputs above and on
- * {@link ParDo#withSideInputs} for more explanation.
- */
- public Bound<InputT, OutputT> withSideInputs(
- Iterable<? extends PCollectionView<?>> sideInputs) {
- ImmutableList.Builder<PCollectionView<?>> builder = ImmutableList.builder();
- builder.addAll(this.sideInputs);
- builder.addAll(sideInputs);
- return new Bound<>(name, builder.build(), fn);
- }
-
- /**
- * Returns a new multi-output {@link ParDo} {@link PTransform}
- * that's like this {@link PTransform} but with the specified main
- * and side output tags. Does not modify this {@link PTransform}.
- *
- * <p>See the discussion of Side Outputs above and on
- * {@link ParDo#withOutputTags} for more explanation.
- */
- public BoundMulti<InputT, OutputT> withOutputTags(TupleTag<OutputT> mainOutputTag,
- TupleTagList sideOutputTags) {
- return new BoundMulti<>(
- name, sideInputs, mainOutputTag, sideOutputTags, fn);
- }
-
- @Override
- public PCollection<OutputT> apply(PCollection<? extends InputT> input) {
- return PCollection.<OutputT>createPrimitiveOutputInternal(
- input.getPipeline(),
- input.getWindowingStrategy(),
- input.isBounded())
- .setTypeDescriptorInternal(fn.getOutputTypeDescriptor());
- }
-
- @Override
- @SuppressWarnings("unchecked")
- protected Coder<OutputT> getDefaultOutputCoder(PCollection<? extends InputT> input)
- throws CannotProvideCoderException {
- return input.getPipeline().getCoderRegistry().getDefaultCoder(
- fn.getOutputTypeDescriptor(),
- fn.getInputTypeDescriptor(),
- ((PCollection<InputT>) input).getCoder());
- }
-
- @Override
- protected String getKindString() {
- Class<?> clazz = DoFnReflector.getDoFnClass(fn);
- if (clazz.isAnonymousClass()) {
- return "AnonymousParDo";
- } else {
- return String.format("ParDo(%s)", StringUtils.approximateSimpleName(clazz));
- }
- }
-
- /**
- * {@inheritDoc}
- *
- * <p>{@link ParDo} registers its internal {@link DoFn} as a subcomponent for display metadata.
- * {@link DoFn} implementations can register display data by overriding
- * {@link DoFn#populateDisplayData}.
- */
- @Override
- public void populateDisplayData(Builder builder) {
- builder.include(fn);
- }
-
- public DoFn<InputT, OutputT> getFn() {
- return fn;
- }
-
- public List<PCollectionView<?>> getSideInputs() {
- return sideInputs;
- }
- }
-
- /**
- * An incomplete multi-output {@link ParDo} transform, with unbound
- * input type.
- *
- * <p>Before being applied, {@link ParDo.UnboundMulti#of} must be
- * invoked to specify the {@link DoFn} to invoke, which will also
- * bind the input type of this {@link PTransform}.
- *
- * @param <OutputT> the type of the main output {@code PCollection} elements
- */
- public static class UnboundMulti<OutputT> {
- private final String name;
- private final List<PCollectionView<?>> sideInputs;
- private final TupleTag<OutputT> mainOutputTag;
- private final TupleTagList sideOutputTags;
-
- UnboundMulti(String name,
- List<PCollectionView<?>> sideInputs,
- TupleTag<OutputT> mainOutputTag,
- TupleTagList sideOutputTags) {
- this.name = name;
- this.sideInputs = sideInputs;
- this.mainOutputTag = mainOutputTag;
- this.sideOutputTags = sideOutputTags;
- }
-
- /**
- * Returns a new multi-output {@link ParDo} transform that's like
- * this transform but with the specified name. Does not modify
- * this transform. The resulting transform is still incomplete.
- *
- * <p>See the discussion of Naming above for more explanation.
- */
- public UnboundMulti<OutputT> named(String name) {
- return new UnboundMulti<>(
- name, sideInputs, mainOutputTag, sideOutputTags);
- }
-
- /**
- * Returns a new multi-output {@link ParDo} transform that's like
- * this transform but with the specified additional side inputs. Does not
- * modify this transform. The resulting transform is still
- * incomplete.
- *
- * <p>See the discussion of Side Inputs above and on
- * {@link ParDo#withSideInputs} for more explanation.
- */
- public UnboundMulti<OutputT> withSideInputs(
- PCollectionView<?>... sideInputs) {
- return withSideInputs(Arrays.asList(sideInputs));
- }
-
- /**
- * Returns a new multi-output {@link ParDo} transform that's like
- * this transform but with the specified additional side inputs. Does not
- * modify this transform. The resulting transform is still
- * incomplete.
- *
- * <p>See the discussion of Side Inputs above and on
- * {@link ParDo#withSideInputs} for more explanation.
- */
- public UnboundMulti<OutputT> withSideInputs(
- Iterable<? extends PCollectionView<?>> sideInputs) {
- ImmutableList.Builder<PCollectionView<?>> builder = ImmutableList.builder();
- builder.addAll(this.sideInputs);
- builder.addAll(sideInputs);
- return new UnboundMulti<>(
- name, builder.build(),
- mainOutputTag, sideOutputTags);
- }
-
- /**
- * Returns a new multi-output {@link ParDo} {@link PTransform}
- * that's like this transform but that will invoke the given
- * {@link DoFn} function, and that has its input type bound.
- * Does not modify this transform. The resulting
- * {@link PTransform} is sufficiently specified to be applied, but
- * more properties can still be specified.
- */
- public <InputT> BoundMulti<InputT, OutputT> of(DoFn<InputT, OutputT> fn) {
- return new BoundMulti<>(
- name, sideInputs, mainOutputTag, sideOutputTags, fn);
- }
-
- /**
- * Returns a new multi-output {@link ParDo} {@link PTransform}
- * that's like this transform but which will invoke the given
- * {@link DoFnWithContext} function, and which has its input type bound.
- * Does not modify this transform. The resulting
- * {@link PTransform} is sufficiently specified to be applied, but
- * more properties can still be specified.
- */
- public <InputT> BoundMulti<InputT, OutputT> of(DoFnWithContext<InputT, OutputT> fn) {
- return of(adapt(fn));
- }
- }
-
- /**
- * A {@link PTransform} that, when applied to a
- * {@code PCollection<InputT>}, invokes a user-specified
- * {@code DoFn<InputT, OutputT>} on all its elements, which can emit elements
- * to any of the {@link PTransform}'s main and side output
- * {@code PCollection}s, which are bundled into a result
- * {@code PCollectionTuple}.
- *
- * @param <InputT> the type of the (main) input {@code PCollection} elements
- * @param <OutputT> the type of the main output {@code PCollection} elements
- */
- public static class BoundMulti<InputT, OutputT>
- extends PTransform<PCollection<? extends InputT>, PCollectionTuple> {
- // Inherits name.
- private final List<PCollectionView<?>> sideInputs;
- private final TupleTag<OutputT> mainOutputTag;
- private final TupleTagList sideOutputTags;
- private final DoFn<InputT, OutputT> fn;
-
- BoundMulti(String name,
- List<PCollectionView<?>> sideInputs,
- TupleTag<OutputT> mainOutputTag,
- TupleTagList sideOutputTags,
- DoFn<InputT, OutputT> fn) {
- super(name);
- this.sideInputs = sideInputs;
- this.mainOutputTag = mainOutputTag;
- this.sideOutputTags = sideOutputTags;
- this.fn = SerializableUtils.clone(fn);
- }
-
- /**
- * Returns a new multi-output {@link ParDo} {@link PTransform}
- * that's like this {@link PTransform} but with the specified
- * name. Does not modify this {@link PTransform}.
- *
- * <p>See the discussion of Naming above for more explanation.
- */
- public BoundMulti<InputT, OutputT> named(String name) {
- return new BoundMulti<>(
- name, sideInputs, mainOutputTag, sideOutputTags, fn);
- }
-
- /**
- * Returns a new multi-output {@link ParDo} {@link PTransform}
- * that's like this {@link PTransform} but with the specified additional side
- * inputs. Does not modify this {@link PTransform}.
- *
- * <p>See the discussion of Side Inputs above and on
- * {@link ParDo#withSideInputs} for more explanation.
- */
- public BoundMulti<InputT, OutputT> withSideInputs(
- PCollectionView<?>... sideInputs) {
- return withSideInputs(Arrays.asList(sideInputs));
- }
-
- /**
- * Returns a new multi-output {@link ParDo} {@link PTransform}
- * that's like this {@link PTransform} but with the specified additional side
- * inputs. Does not modify this {@link PTransform}.
- *
- * <p>See the discussion of Side Inputs above and on
- * {@link ParDo#withSideInputs} for more explanation.
- */
- public BoundMulti<InputT, OutputT> withSideInputs(
- Iterable<? extends PCollectionView<?>> sideInputs) {
- ImmutableList.Builder<PCollectionView<?>> builder = ImmutableList.builder();
- builder.addAll(this.sideInputs);
- builder.addAll(sideInputs);
- return new BoundMulti<>(
- name, builder.build(),
- mainOutputTag, sideOutputTags, fn);
- }
-
-
- @Override
- public PCollectionTuple apply(PCollection<? extends InputT> input) {
- PCollectionTuple outputs = PCollectionTuple.ofPrimitiveOutputsInternal(
- input.getPipeline(),
- TupleTagList.of(mainOutputTag).and(sideOutputTags.getAll()),
- input.getWindowingStrategy(),
- input.isBounded());
-
- // The fn will likely be an instance of an anonymous subclass
- // such as DoFn<Integer, String> { }, thus will have a high-fidelity
- // TypeDescriptor for the output type.
- outputs.get(mainOutputTag).setTypeDescriptorInternal(fn.getOutputTypeDescriptor());
-
- return outputs;
- }
-
- @Override
- protected Coder<OutputT> getDefaultOutputCoder() {
- throw new RuntimeException(
- "internal error: shouldn't be calling this on a multi-output ParDo");
- }
-
- @Override
- public <T> Coder<T> getDefaultOutputCoder(
- PCollection<? extends InputT> input, TypedPValue<T> output)
- throws CannotProvideCoderException {
- @SuppressWarnings("unchecked")
- Coder<InputT> inputCoder = ((PCollection<InputT>) input).getCoder();
- return input.getPipeline().getCoderRegistry().getDefaultCoder(
- output.getTypeDescriptor(),
- fn.getInputTypeDescriptor(),
- inputCoder);
- }
-
- @Override
- protected String getKindString() {
- // Test the same class that supplies the name, exactly as Bound#getKindString does.
- Class<?> clazz = DoFnReflector.getDoFnClass(fn);
- if (clazz.isAnonymousClass()) {
- return "AnonymousParMultiDo";
- }
- return String.format("ParMultiDo(%s)", StringUtils.approximateSimpleName(clazz));
- }
-
- public DoFn<InputT, OutputT> getFn() {
- return fn;
- }
-
- public TupleTag<OutputT> getMainOutputTag() {
- return mainOutputTag;
- }
-
- public TupleTagList getSideOutputTags() {
- return sideOutputTags;
- }
-
- public List<PCollectionView<?>> getSideInputs() {
- return sideInputs;
- }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- static {
- DirectPipelineRunner.registerDefaultTransformEvaluator(
- Bound.class,
- new DirectPipelineRunner.TransformEvaluator<Bound>() {
- @Override
- public void evaluate(
- Bound transform,
- DirectPipelineRunner.EvaluationContext context) {
- evaluateSingleHelper(transform, context);
- }
- });
- }
-
- private static <InputT, OutputT> void evaluateSingleHelper(
- Bound<InputT, OutputT> transform,
- DirectPipelineRunner.EvaluationContext context) {
- TupleTag<OutputT> mainOutputTag = new TupleTag<>("out");
-
- DirectModeExecutionContext executionContext = DirectModeExecutionContext.create();
- // Wrap the single output in a tuple under the synthetic tag so evaluateHelper can be shared.
- PCollectionTuple outputs = PCollectionTuple.of(mainOutputTag, context.getOutput(transform));
-
- evaluateHelper(
- transform.fn,
- context.getStepName(transform),
- context.getInput(transform),
- transform.sideInputs,
- mainOutputTag,
- Collections.<TupleTag<?>>emptyList(),
- outputs,
- context,
- executionContext);
- // Record what was emitted under the synthetic tag as the values of the transform's output.
- context.setPCollectionValuesWithMetadata(
- context.getOutput(transform),
- executionContext.getOutput(mainOutputTag));
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- static {
- DirectPipelineRunner.registerDefaultTransformEvaluator(
- BoundMulti.class,
- new DirectPipelineRunner.TransformEvaluator<BoundMulti>() {
- @Override
- public void evaluate(
- BoundMulti transform,
- DirectPipelineRunner.EvaluationContext context) {
- evaluateMultiHelper(transform, context);
- }
- });
- }
-
- private static <InputT, OutputT> void evaluateMultiHelper(
- BoundMulti<InputT, OutputT> transform,
- DirectPipelineRunner.EvaluationContext context) {
- // Runs the DoFn once, then records the values emitted to every declared output.
- DirectModeExecutionContext executionContext = DirectModeExecutionContext.create();
-
- evaluateHelper(
- transform.fn,
- context.getStepName(transform),
- context.getInput(transform),
- transform.sideInputs,
- transform.mainOutputTag,
- transform.sideOutputTags.getAll(),
- context.getOutput(transform),
- context,
- executionContext);
-
- for (Map.Entry<TupleTag<?>, PCollection<?>> entry
- : context.getOutput(transform).getAll().entrySet()) {
- @SuppressWarnings("unchecked")
- TupleTag<Object> tag = (TupleTag<Object>) entry.getKey();
- @SuppressWarnings("unchecked")
- PCollection<Object> pc = (PCollection<Object>) entry.getValue();
- // Main and side outputs are read back differently; match the main tag with equals().
- context.setPCollectionValuesWithMetadata(
- pc,
- (transform.mainOutputTag.equals(tag)
- ? executionContext.getOutput(tag)
- : executionContext.getSideOutput(tag)));
- }
- }
-
- /**
- * Evaluates a single-output or multi-output {@link ParDo} directly.
- *
- * <p>This evaluation method is intended for use in testing scenarios; it is designed for clarity
- * and correctness-checking, not speed.
- *
- * <p>Of particular note, this performs best-effort checking that inputs and outputs are not
- * mutated in violation of the requirements upon a {@link DoFn}.
- */
- private static <InputT, OutputT, ActualInputT extends InputT> void evaluateHelper(
- DoFn<InputT, OutputT> doFn,
- String stepName,
- PCollection<ActualInputT> input,
- List<PCollectionView<?>> sideInputs,
- TupleTag<OutputT> mainOutputTag,
- List<TupleTag<?>> sideOutputTags,
- PCollectionTuple outputs,
- DirectPipelineRunner.EvaluationContext context,
- DirectModeExecutionContext executionContext) {
- // TODO: Run multiple shards?
- DoFn<InputT, OutputT> fn = context.ensureSerializable(doFn);
-
- SideInputReader sideInputReader = makeSideInputReader(context, sideInputs);
-
- // When evaluating via the DirectPipelineRunner, this output manager checks each output for
- // illegal mutations when the next output comes along. We then verify again after finishBundle()
- // The common case we expect this to catch is a user mutating an input in order to repeatedly
- // emit "variations".
- ImmutabilityCheckingOutputManager<ActualInputT> outputManager =
- new ImmutabilityCheckingOutputManager<>(
- fn.getClass().getSimpleName(),
- new DoFnRunnerBase.ListOutputManager(),
- outputs);
-
- DoFnRunner<InputT, OutputT> fnRunner =
- DoFnRunners.createDefault(
- context.getPipelineOptions(),
- fn,
- sideInputReader,
- outputManager,
- mainOutputTag,
- sideOutputTags,
- executionContext.getOrCreateStepContext(stepName, stepName, null),
- context.getAddCounterMutator(),
- input.getWindowingStrategy());
-
- fnRunner.startBundle();
-
- for (DirectPipelineRunner.ValueWithMetadata<ActualInputT> elem
- : context.getPCollectionValuesWithMetadata(input)) {
- if (elem.getValue() instanceof KV) {
- // In case the DoFn needs keyed state, set the implicit keys to the keys
- // in the input elements.
- @SuppressWarnings("unchecked")
- KV<?, ?> kvElem = (KV<?, ?>) elem.getValue();
- executionContext.setKey(kvElem.getKey());
- } else {
- executionContext.setKey(elem.getKey());
- }
-
- // We check the input for mutations only through the call span of processElement.
- // This will miss some cases, but the check is ad hoc and best effort. The common case
- // is that the input is mutated to be used for output.
- try {
- MutationDetector inputMutationDetector = MutationDetectors.forValueWithCoder(
- elem.getWindowedValue().getValue(), input.getCoder());
- @SuppressWarnings("unchecked")
- WindowedValue<InputT> windowedElem = ((WindowedValue<InputT>) elem.getWindowedValue());
- fnRunner.processElement(windowedElem);
- inputMutationDetector.verifyUnmodified();
- } catch (CoderException e) {
- throw UserCodeException.wrap(e);
- } catch (IllegalMutationException exn) {
- throw new IllegalMutationException(
- String.format("DoFn %s mutated input value %s of class %s (new value was %s)."
- + " Input values must not be mutated in any way.",
- fn.getClass().getSimpleName(),
- exn.getSavedValue(), exn.getSavedValue().getClass(), exn.getNewValue()),
- exn.getSavedValue(),
- exn.getNewValue(),
- exn);
- }
- }
-
- // Note that the input could have been retained and mutated prior to this final output,
- // but for now it degrades readability too much to be worth trying to catch that particular
- // corner case.
- fnRunner.finishBundle();
- outputManager.verifyLatestOutputsUnmodified();
- }
-
- private static SideInputReader makeSideInputReader(
- DirectPipelineRunner.EvaluationContext context, List<PCollectionView<?>> sideInputs) {
- PTuple sideInputValues = PTuple.empty();
- for (PCollectionView<?> view : sideInputs) {
- sideInputValues = sideInputValues.and(
- view.getTagInternal(),
- context.getPCollectionView(view));
- }
- return DirectSideInputReader.of(sideInputValues);
- }
-
- /**
- * A {@code DoFnRunners.OutputManager} that provides facilities for checking output values for
- * illegal mutations.
- *
- * <p>When used via the try-with-resources pattern, it is guaranteed that every value passed
- * to {@link #output} will have been checked for illegal mutation.
- */
- private static class ImmutabilityCheckingOutputManager<InputT>
- implements DoFnRunners.OutputManager, AutoCloseable {
-
- private final DoFnRunners.OutputManager underlyingOutputManager;
- private final ConcurrentMap<TupleTag<?>, MutationDetector> mutationDetectorForTag;
- private final PCollectionTuple outputs;
- private final String doFnName;
-
- public ImmutabilityCheckingOutputManager(
- String doFnName,
- DoFnRunners.OutputManager underlyingOutputManager,
- PCollectionTuple outputs) {
- this.doFnName = doFnName;
- this.underlyingOutputManager = underlyingOutputManager;
- this.outputs = outputs;
- this.mutationDetectorForTag = Maps.newConcurrentMap();
- }
-
- @Override
- public <T> void output(TupleTag<T> tag, WindowedValue<T> output) {
-
- // Skip verifying undeclared outputs, since we don't have coders for them.
- if (outputs.has(tag)) {
- try {
- MutationDetector newDetector = MutationDetectors.forValueWithCoder(
- output.getValue(), outputs.get(tag).getCoder());
- // put() hands back the detector for this tag's previous output (null for the first one).
- MutationDetector priorDetector = mutationDetectorForTag.put(tag, newDetector);
- verifyOutputUnmodified(priorDetector);
- } catch (CoderException e) {
- throw UserCodeException.wrap(e);
- }
- }
-
- // Actually perform the output.
- underlyingOutputManager.output(tag, output);
- }
-
- /**
- * Throws {@link IllegalMutationException} if the latest output for any tag has been mutated
- * since being output.
- */
- public void verifyLatestOutputsUnmodified() {
- for (MutationDetector detector : mutationDetectorForTag.values()) {
- verifyOutputUnmodified(detector);
- }
- }
-
- /**
- * Verifies the provided {@code detector}, rethrowing any {@link IllegalMutationException}
- * with a message that names the {@code DoFn}.
- *
- * <p>The {@code detector} may be null, in which case no check is performed. This is merely
- * to consolidate null checking to this method.
- */
- private void verifyOutputUnmodified(@Nullable MutationDetector detector) {
- if (detector == null) {
- return;
- }
-
- try {
- detector.verifyUnmodified();
- } catch (IllegalMutationException exn) {
- throw new IllegalMutationException(String.format(
- "DoFn %s mutated value %s after it was output (new value was %s)."
- + " Values must not be mutated in any way after being output.",
- doFnName, exn.getSavedValue(), exn.getNewValue()),
- exn.getSavedValue(), exn.getNewValue(),
- exn);
- }
- }
-
- /**
- * When used in a {@code try}-with-resources block, verifies all of the latest outputs upon
- * {@link #close()}.
- */
- @Override
- public void close() {
- verifyLatestOutputsUnmodified();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Partition.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Partition.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Partition.java
deleted file mode 100644
index bbbccbc..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Partition.java
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionList;
-import com.google.cloud.dataflow.sdk.values.PCollectionTuple;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.cloud.dataflow.sdk.values.TupleTagList;
-
-import java.io.Serializable;
-
-/**
- * {@code Partition} takes a {@code PCollection<T>} and a
- * {@code PartitionFn}, uses the {@code PartitionFn} to split the
- * elements of the input {@code PCollection} into {@code N} partitions, and
- * returns a {@code PCollectionList<T>} that bundles {@code N}
- * {@code PCollection<T>}s containing the split elements.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<Student> students = ...;
- * // Split students up into 10 partitions, by percentile:
- * PCollectionList<Student> studentsByPercentile =
- * students.apply(Partition.of(10, new PartitionFn<Student>() {
- * public int partitionFor(Student student, int numPartitions) {
- * return student.getPercentile() // 0..99
- * * numPartitions / 100;
- * }}))
- * for (int i = 0; i < 10; i++) {
- * PCollection<Student> partition = studentsByPercentile.get(i);
- * ...
- * }
- * } </pre>
- *
- * <p>By default, the {@code Coder} of each of the
- * {@code PCollection}s in the output {@code PCollectionList} is the
- * same as the {@code Coder} of the input {@code PCollection}.
- *
- * <p>Each output element has the same timestamp and is in the same windows
- * as its corresponding input element, and each output {@code PCollection}
- * has the same
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * associated with it as the input.
- *
- * @param <T> the type of the elements of the input and output
- * {@code PCollection}s
- */
-public class Partition<T> extends PTransform<PCollection<T>, PCollectionList<T>> {
-
- /**
- * A function object that chooses an output partition for an element.
- *
- * @param <T> the type of the elements being partitioned
- */
- public interface PartitionFn<T> extends Serializable {
- /**
- * Chooses the partition into which to put the given element.
- *
- * @param elem the element to be partitioned
- * @param numPartitions the total number of partitions ({@code >= 1})
- * @return index of the selected partition, in the range {@code [0..numPartitions-1]};
- * any other index causes an {@link IndexOutOfBoundsException} when the element is processed
- */
- int partitionFor(T elem, int numPartitions);
- }
-
- /**
- * Returns a new {@code Partition} {@code PTransform} that divides
- * its input {@code PCollection} into the given number of partitions,
- * using the given partitioning function.
- *
- * @param numPartitions the number of partitions to divide the input
- * {@code PCollection} into
- * @param partitionFn the function to invoke on each element to
- * choose its output partition
- * @throws IllegalArgumentException if {@code numPartitions <= 0}
- */
- public static <T> Partition<T> of(
- int numPartitions, PartitionFn<? super T> partitionFn) {
- return new Partition<>(new PartitionDoFn<T>(numPartitions, partitionFn));
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- @Override
- public PCollectionList<T> apply(PCollection<T> in) {
- final TupleTagList outputTags = partitionDoFn.getOutputTags();
-
- PCollectionTuple outputs = in.apply(
- ParDo
- .withOutputTags(new TupleTag<Void>(){}, outputTags)
- .of(partitionDoFn));
-
- PCollectionList<T> pcs = PCollectionList.empty(in.getPipeline());
- Coder<T> coder = in.getCoder();
-
- for (TupleTag<?> outputTag : outputTags.getAll()) {
- // All the tuple tags are actually TupleTag<T>
- // And all the collections are actually PCollection<T>
- @SuppressWarnings("unchecked")
- TupleTag<T> typedOutputTag = (TupleTag<T>) outputTag;
- pcs = pcs.and(outputs.get(typedOutputTag).setCoder(coder));
- }
- return pcs;
- }
-
- private final transient PartitionDoFn<T> partitionDoFn;
-
- private Partition(PartitionDoFn<T> partitionDoFn) {
- this.partitionDoFn = partitionDoFn;
- }
-
- private static class PartitionDoFn<X> extends DoFn<X, Void> {
- private final int numPartitions;
- private final PartitionFn<? super X> partitionFn;
- private final TupleTagList outputTags;
-
- /**
- * Constructs a PartitionDoFn.
- *
- * @throws IllegalArgumentException if {@code numPartitions <= 0}
- */
- public PartitionDoFn(int numPartitions, PartitionFn<? super X> partitionFn) {
- if (numPartitions <= 0) {
- throw new IllegalArgumentException("numPartitions must be > 0");
- }
-
- this.numPartitions = numPartitions;
- this.partitionFn = partitionFn;
-
- TupleTagList buildOutputTags = TupleTagList.empty();
- for (int partition = 0; partition < numPartitions; partition++) {
- buildOutputTags = buildOutputTags.and(new TupleTag<X>());
- }
- outputTags = buildOutputTags;
- }
-
- public TupleTagList getOutputTags() {
- return outputTags;
- }
-
- @Override
- public void processElement(ProcessContext c) {
- X element = c.element();
- int index = partitionFn.partitionFor(element, numPartitions);
- if (index < 0 || index >= numPartitions) {
- throw new IndexOutOfBoundsException(
- "Partition function returned out of bounds index: " +
- index + " not in [0.." + numPartitions + ")");
- }
- // Every tag in outputTags was created as a TupleTag<X> by the constructor.
- @SuppressWarnings("unchecked")
- TupleTag<X> partitionTag = (TupleTag<X>) outputTags.get(index);
- c.sideOutput(partitionTag, element);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/RemoveDuplicates.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/RemoveDuplicates.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/RemoveDuplicates.java
deleted file mode 100644
index 8913138..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/RemoveDuplicates.java
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-/**
- * {@code RemoveDuplicates<T>} takes a {@code PCollection<T>} and
- * returns a {@code PCollection<T>} that has all the elements of the
- * input but with duplicate elements removed such that each element is
- * unique within each window.
- *
- * <p>Two values of type {@code T} are compared for equality <b>not</b> by
- * regular Java {@link Object#equals}, but instead by first encoding
- * each of the elements using the {@code PCollection}'s {@code Coder}, and then
- * comparing the encoded bytes. This admits efficient parallel
- * evaluation.
- *
- * <p>Optionally, a function may be provided that maps each element to a representative
- * value. In this case, two elements will be considered duplicates if they have equal
- * representative values, with equality being determined as above.
- *
- * <p>By default, the {@code Coder} of the output {@code PCollection}
- * is the same as the {@code Coder} of the input {@code PCollection}.
- *
- * <p>Each output element is in the same window as its corresponding input
- * element, and has the timestamp of the end of that window. The output
- * {@code PCollection} has the same
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * as the input.
- *
- * <p>Does not preserve any order the input PCollection might have had.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<String> words = ...;
- * PCollection<String> uniqueWords =
- * words.apply(RemoveDuplicates.<String>create());
- * } </pre>
- *
- * @param <T> the type of the elements of the input and output
- * {@code PCollection}s
- */
-public class RemoveDuplicates<T> extends PTransform<PCollection<T>,
- PCollection<T>> {
- /**
- * Creates a {@code RemoveDuplicates<T>} {@code PTransform} that compares whole elements.
- *
- * @param <T> the element type shared by the input and output
- * {@code PCollection}s
- */
- public static <T> RemoveDuplicates<T> create() {
- return new RemoveDuplicates<>();
- }
-
- /**
- * Returns a {@code WithRepresentativeValues<T, IdT>} {@code PTransform}.
- *
- * @param <T> the type of the elements of the input and output {@code PCollection}s
- * @param fn the function mapping each element to its representative value
- * @param <IdT> the type of the representative value used to dedup
- */
- public static <T, IdT> WithRepresentativeValues<T, IdT> withRepresentativeValueFn(
- SerializableFunction<T, IdT> fn) {
- return new WithRepresentativeValues<T, IdT>(fn, null);
- }
-
- @Override
- public PCollection<T> apply(PCollection<T> in) {
- return in
- .apply(ParDo.named("CreateIndex")
- .of(new DoFn<T, KV<T, Void>>() {
- @Override
- public void processElement(ProcessContext c) {
- c.output(KV.of(c.element(), (Void) null));
- }
- }))
- .apply(Combine.<T, Void>perKey(
- new SerializableFunction<Iterable<Void>, Void>() {
- @Override
- public Void apply(Iterable<Void> iter) {
- return null; // ignore input
- }
- }))
- .apply(Keys.<T>create());
- }
-
- /**
- * A {@link RemoveDuplicates} {@link PTransform} that uses a {@link SerializableFunction} to
- * obtain a representative value for each input element.
- *
- * <p>Construct via {@link RemoveDuplicates#withRepresentativeValueFn(SerializableFunction)}.
- *
- * @param <T> the type of input and output element
- * @param <IdT> the type of representative values used to dedup
- */
- public static class WithRepresentativeValues<T, IdT>
- extends PTransform<PCollection<T>, PCollection<T>> {
- private final SerializableFunction<T, IdT> fn;
- private final TypeDescriptor<IdT> representativeType;
-
- private WithRepresentativeValues(
- SerializableFunction<T, IdT> fn, TypeDescriptor<IdT> representativeType) {
- this.fn = fn;
- this.representativeType = representativeType;
- }
-
- @Override
- public PCollection<T> apply(PCollection<T> in) {
- WithKeys<IdT, T> withKeys = WithKeys.of(fn);
- if (representativeType != null) {
- withKeys = withKeys.withKeyType(representativeType);
- }
- return in
- .apply(withKeys)
- .apply(Combine.<IdT, T, T>perKey(
- new Combine.BinaryCombineFn<T>() {
- @Override
- public T apply(T left, T right) {
- return left;
- }
- }))
- .apply(Values.<T>create());
- }
-
- /**
- * Return a {@code WithRepresentativeValues} {@link PTransform} that is like this one, but with
- * the specified representative type descriptor.
- *
- * <p>Required for use of {@link RemoveDuplicates#withRepresentativeValueFn(SerializableFunction)}
- * in Java 8 with a lambda as the fn.
- *
- * @param type a {@link TypeDescriptor} describing the representative type of this
- * {@code WithRepresentativeValues}
- * @return A {@code WithRepresentativeValues} {@link PTransform} that is like this one, but with
- * the specified representative type descriptor.
- */
- public WithRepresentativeValues<T, IdT> withRepresentativeType(TypeDescriptor<IdT> type) {
- return new WithRepresentativeValues<>(fn, type);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Sample.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Sample.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Sample.java
deleted file mode 100644
index c5b6e7e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Sample.java
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.coders.BigEndianIntegerCoder;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.coders.IterableCoder;
-import com.google.cloud.dataflow.sdk.coders.KvCoder;
-import com.google.cloud.dataflow.sdk.coders.VoidCoder;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.common.base.Preconditions;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Random;
-
-/**
- * {@code PTransform}s for taking samples of the elements in a
- * {@code PCollection}, or samples of the values associated with each
- * key in a {@code PCollection} of {@code KV}s.
- **/
-public class Sample {
-
- /**
- * {@code Sample#any(long)} takes a {@code PCollection<T>} and a limit, and
- * produces a new {@code PCollection<T>} containing up to limit
- * elements of the input {@code PCollection}.
- *
- * <p>If limit is less than or equal to the size of the input
- * {@code PCollection}, then all the input's elements will be selected.
- *
- * <p>All of the elements of the output {@code PCollection} should fit into
- * main memory of a single worker machine. This operation does not
- * run in parallel.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<String> input = ...;
- * PCollection<String> output = input.apply(Sample.<String>any(100));
- * } </pre>
- *
- * @param <T> the type of the elements of the input and output
- * {@code PCollection}s
- * @param limit the number of elements to take from the input
- */
- public static <T> PTransform<PCollection<T>, PCollection<T>> any(long limit) {
- return new SampleAny<>(limit);
- }
-
- /**
- * Returns a {@code PTransform} that takes a {@code PCollection<T>},
- * selects {@code sampleSize} elements, uniformly at random, and returns a
- * {@code PCollection<Iterable<T>>} containing the selected elements.
- * If the input {@code PCollection} has fewer than
- * {@code sampleSize} elements, then the output {@code Iterable<T>}
- * will be all the input's elements.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<String> pc = ...;
- * PCollection<Iterable<String>> sampleOfSize10 =
- * pc.apply(Sample.fixedSizeGlobally(10));
- * } </pre>
- *
- * @param sampleSize the number of elements to select; must be {@code >= 0}
- * @param <T> the type of the elements
- */
- public static <T> PTransform<PCollection<T>, PCollection<Iterable<T>>>
- fixedSizeGlobally(int sampleSize) {
- return Combine.globally(new FixedSizedSampleFn<T>(sampleSize));
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<KV<K, V>>} and returns a
- * {@code PCollection<KV<K, Iterable<V>>>} that contains an output
- * element mapping each distinct key in the input
- * {@code PCollection} to a sample of {@code sampleSize} values
- * associated with that key in the input {@code PCollection}, taken
- * uniformly at random. If a key in the input {@code PCollection}
- * has fewer than {@code sampleSize} values associated with it, then
- * the output {@code Iterable<V>} associated with that key will be
- * all the values associated with that key in the input
- * {@code PCollection}.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<KV<String, Integer>> pc = ...;
- * PCollection<KV<String, Iterable<Integer>>> sampleOfSize10PerKey =
- * pc.apply(Sample.<String, Integer>fixedSizePerKey());
- * } </pre>
- *
- * @param sampleSize the number of values to select for each
- * distinct key; must be {@code >= 0}
- * @param <K> the type of the keys
- * @param <V> the type of the values
- */
- public static <K, V> PTransform<PCollection<KV<K, V>>,
- PCollection<KV<K, Iterable<V>>>>
- fixedSizePerKey(int sampleSize) {
- return Combine.perKey(new FixedSizedSampleFn<V>(sampleSize));
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A {@link PTransform} that takes a {@code PCollection<T>} and a limit, and
- * produces a new {@code PCollection<T>} containing up to limit
- * elements of the input {@code PCollection}.
- */
- public static class SampleAny<T> extends PTransform<PCollection<T>, PCollection<T>> {
- private final long limit;
-
- /**
- * Constructs a {@code SampleAny<T>} PTransform that, when applied,
- * produces a new PCollection containing up to {@code limit}
- * elements of its input {@code PCollection}.
- */
- private SampleAny(long limit) {
- Preconditions.checkArgument(limit >= 0, "Expected non-negative limit, received %s.", limit);
- this.limit = limit;
- }
-
- @Override
- public PCollection<T> apply(PCollection<T> in) {
- PCollectionView<Iterable<T>> iterableView = in.apply(View.<T>asIterable());
- return
- in.getPipeline()
- .apply(Create.of((Void) null).withCoder(VoidCoder.of()))
- .apply(ParDo
- .withSideInputs(iterableView)
- .of(new SampleAnyDoFn<>(limit, iterableView)))
- .setCoder(in.getCoder());
- }
- }
-
- /**
- * A {@link DoFn} that returns up to limit elements from the side input PCollection.
- */
- private static class SampleAnyDoFn<T> extends DoFn<Void, T> {
- long limit;
- final PCollectionView<Iterable<T>> iterableView;
-
- public SampleAnyDoFn(long limit, PCollectionView<Iterable<T>> iterableView) {
- this.limit = limit;
- this.iterableView = iterableView;
- }
-
- @Override
- public void processElement(ProcessContext c) {
- for (T i : c.sideInput(iterableView)) {
- if (limit-- <= 0) {
- break;
- }
- c.output(i);
- }
- }
- }
-
- /**
- * {@code CombineFn} that computes a fixed-size sample of a
- * collection of values.
- *
- * @param <T> the type of the elements
- */
- public static class FixedSizedSampleFn<T>
- extends CombineFn<T,
- Top.BoundedHeap<KV<Integer, T>, SerializableComparator<KV<Integer, T>>>,
- Iterable<T>> {
- private final Top.TopCombineFn<KV<Integer, T>, SerializableComparator<KV<Integer, T>>>
- topCombineFn;
- private final Random rand = new Random();
-
- private FixedSizedSampleFn(int sampleSize) {
- if (sampleSize < 0) {
- throw new IllegalArgumentException("sample size must be >= 0");
- }
- topCombineFn = new Top.TopCombineFn<KV<Integer, T>, SerializableComparator<KV<Integer, T>>>(
- sampleSize, new KV.OrderByKey<Integer, T>());
- }
-
- @Override
- public Top.BoundedHeap<KV<Integer, T>, SerializableComparator<KV<Integer, T>>>
- createAccumulator() {
- return topCombineFn.createAccumulator();
- }
-
- @Override
- public Top.BoundedHeap<KV<Integer, T>, SerializableComparator<KV<Integer, T>>> addInput(
- Top.BoundedHeap<KV<Integer, T>, SerializableComparator<KV<Integer, T>>> accumulator,
- T input) {
- accumulator.addInput(KV.of(rand.nextInt(), input));
- return accumulator;
- }
-
- @Override
- public Top.BoundedHeap<KV<Integer, T>, SerializableComparator<KV<Integer, T>>>
- mergeAccumulators(
- Iterable<Top.BoundedHeap<KV<Integer, T>, SerializableComparator<KV<Integer, T>>>>
- accumulators) {
- return topCombineFn.mergeAccumulators(accumulators);
- }
-
- @Override
- public Iterable<T> extractOutput(
- Top.BoundedHeap<KV<Integer, T>, SerializableComparator<KV<Integer, T>>> accumulator) {
- List<T> out = new ArrayList<>();
- for (KV<Integer, T> element : accumulator.extractOutput()) {
- out.add(element.getValue());
- }
- return out;
- }
-
- @Override
- public Coder<Top.BoundedHeap<KV<Integer, T>, SerializableComparator<KV<Integer, T>>>>
- getAccumulatorCoder(CoderRegistry registry, Coder<T> inputCoder) {
- return topCombineFn.getAccumulatorCoder(
- registry, KvCoder.of(BigEndianIntegerCoder.of(), inputCoder));
- }
-
- @Override
- public Coder<Iterable<T>> getDefaultOutputCoder(
- CoderRegistry registry, Coder<T> inputCoder) {
- return IterableCoder.of(inputCoder);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/SerializableComparator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/SerializableComparator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/SerializableComparator.java
deleted file mode 100644
index 7d41917..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/SerializableComparator.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import java.io.Serializable;
-import java.util.Comparator;
-
-/**
- * A {@code Comparator} that is also {@code Serializable}.
- *
- * @param <T> type of values being compared
- */
-public interface SerializableComparator<T> extends Comparator<T>, Serializable {
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/SerializableFunction.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/SerializableFunction.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/SerializableFunction.java
deleted file mode 100644
index 81bf3d4..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/SerializableFunction.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import java.io.Serializable;
-
-/**
- * A function that computes an output value of type {@code OutputT} from an input value of type
- * {@code InputT} and is {@link Serializable}.
- *
- * @param <InputT> input value type
- * @param <OutputT> output value type
- */
-public interface SerializableFunction<InputT, OutputT> extends Serializable {
- /** Returns the result of invoking this function on the given input. */
- public OutputT apply(InputT input);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/SimpleFunction.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/SimpleFunction.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/SimpleFunction.java
deleted file mode 100644
index ef6fd81..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/SimpleFunction.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-/**
- * A {@link SerializableFunction} which is not a <i>functional interface</i>.
- * Concrete subclasses allow us to infer type information, which in turn aids
- * {@link Coder} inference.
- */
-public abstract class SimpleFunction<InputT, OutputT>
- implements SerializableFunction<InputT, OutputT> {
-
- /**
- * Returns a {@link TypeDescriptor} capturing what is known statically
- * about the input type of this {@code DoFn} instance's most-derived
- * class.
- *
- * <p>See {@link #getOutputTypeDescriptor} for more discussion.
- */
- public TypeDescriptor<InputT> getInputTypeDescriptor() {
- return new TypeDescriptor<InputT>(this) {};
- }
-
- /**
- * Returns a {@link TypeDescriptor} capturing what is known statically
- * about the output type of this {@code DoFn} instance's
- * most-derived class.
- *
- * <p>In the normal case of a concrete {@code DoFn} subclass with
- * no generic type parameters of its own (including anonymous inner
- * classes), this will be a complete non-generic type, which is good
- * for choosing a default output {@code Coder<OutputT>} for the output
- * {@code PCollection<OutputT>}.
- */
- public TypeDescriptor<OutputT> getOutputTypeDescriptor() {
- return new TypeDescriptor<OutputT>(this) {};
- }
-}
[58/67] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessions.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessions.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessions.java
new file mode 100644
index 0000000..c57a5f2
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessions.java
@@ -0,0 +1,223 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete;
+
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.coders.TableRowJsonCoder;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.options.Validation;
+import com.google.cloud.dataflow.sdk.transforms.Count;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.DoFn.RequiresWindowAccess;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.SerializableComparator;
+import com.google.cloud.dataflow.sdk.transforms.Top;
+import com.google.cloud.dataflow.sdk.transforms.windowing.CalendarWindows;
+import com.google.cloud.dataflow.sdk.transforms.windowing.IntervalWindow;
+import com.google.cloud.dataflow.sdk.transforms.windowing.Sessions;
+import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+
+import java.util.List;
+
+/**
+ * An example that reads Wikipedia edit data from Cloud Storage and computes the user with
+ * the longest string of edits separated by no more than an hour within each month.
+ *
+ * <p>Concepts: Using Windowing to perform time-based aggregations of data.
+ *
+ * <p>It is not recommended to execute this pipeline locally, given the size of the default input
+ * data.
+ *
+ * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=BlockingDataflowPipelineRunner
+ * }
+ * </pre>
+ * and an output prefix on GCS:
+ * <pre>{@code
+ * --output=gs://YOUR_OUTPUT_PREFIX
+ * }</pre>
+ *
+ * <p>The default input is {@code gs://dataflow-samples/wikipedia_edits/*.json} and can be
+ * overridden with {@code --input}.
+ *
+ * <p>The input for this example is large enough that it's a good place to enable (experimental)
+ * autoscaling:
+ * <pre>{@code
+ * --autoscalingAlgorithm=BASIC
+ * --maxNumWorkers=20
+ * }
+ * </pre>
+ * This will automatically scale the number of workers up over time until the job completes.
+ */
+public class TopWikipediaSessions {
+ private static final String EXPORTED_WIKI_TABLE = "gs://dataflow-samples/wikipedia_edits/*.json";
+
+ /**
+ * Emits each edit's contributor name stamped with the edit time; skips incomplete rows.
+ */
+ static class ExtractUserAndTimestamp extends DoFn<TableRow, String> {
+ @Override
+ public void processElement(ProcessContext c) {
+ TableRow row = c.element();
+ // Read as Number: a missing or Long-valued field must not fail on cast or unboxing.
+ Number timestamp = (Number) row.get("timestamp");
+ String userName = (String) row.get("contributor_username");
+ if (userName != null && timestamp != null) {
+ c.outputWithTimestamp(userName, new Instant(timestamp.longValue() * 1000L));
+ }
+ }
+ }
+
+ /**
+ * Counts the edits in each user session: user names are assigned to session windows with a
+ * one-hour gap duration, and the occurrences of each name are then counted per window.
+ */
+ static class ComputeSessions
+ extends PTransform<PCollection<String>, PCollection<KV<String, Long>>> {
+ @Override
+ public PCollection<KV<String, Long>> apply(PCollection<String> actions) {
+ // Window first, so that the per-element count below is computed per session.
+ PCollection<String> sessionized = actions.apply(
+ Window.<String>into(Sessions.withGapDuration(Duration.standardHours(1))));
+ return sessionized.apply(Count.<String>perElement());
+ }
+ }
+
+ /**
+ * Re-windows sessions into one-month calendar windows and keeps the top entry by value in each.
+ */
+ private static class TopPerMonth
+ extends PTransform<PCollection<KV<String, Long>>, PCollection<List<KV<String, Long>>>> {
+ @Override
+ public PCollection<List<KV<String, Long>>> apply(PCollection<KV<String, Long>> sessions) {
+ SerializableComparator<KV<String, Long>> byEditCount =
+ new SerializableComparator<KV<String, Long>>() {
+ @Override
+ public int compare(KV<String, Long> left, KV<String, Long> right) {
+ return left.getValue().compareTo(right.getValue());
+ }
+ };
+ return sessions.apply(Window.<KV<String, Long>>into(CalendarWindows.months(1)))
+ .apply(Top.of(1, byEditCount).withoutDefaults());
+ }
+ }
+
+ static class SessionsToStringsDoFn extends DoFn<KV<String, Long>, KV<String, Long>>
+ implements RequiresWindowAccess {
+ /** Re-keys each count as "key : window", folding the element's window into the key. */
+ @Override
+ public void processElement(ProcessContext c) {
+ String sessionKey = c.element().getKey() + " : " + c.window();
+ c.output(KV.of(sessionKey, c.element().getValue()));
+ }
+ }
+
+ static class FormatOutputDoFn extends DoFn<List<KV<String, Long>>, String>
+ implements RequiresWindowAccess {
+ /** Emits one "session : count : window start" string for every entry of the input list. */
+ @Override
+ public void processElement(ProcessContext c) {
+ for (KV<String, Long> entry : c.element()) {
+ long editCount = entry.getValue();
+ c.output(entry.getKey() + " : " + editCount + " : " + ((IntervalWindow) c.window()).start());
+ }
+ }
+ }
+
+ /** Samples users by name hash, sessionizes their edits, and formats each month's top session. */
+ static class ComputeTopSessions extends PTransform<PCollection<TableRow>, PCollection<String>> {
+ private final double samplingThreshold;
+
+ public ComputeTopSessions(double samplingThreshold) {
+ this.samplingThreshold = samplingThreshold;
+ }
+
+ @Override
+ public PCollection<String> apply(PCollection<TableRow> input) {
+ // Keep a user when abs(hashCode) <= samplingThreshold * Integer.MAX_VALUE.
+ DoFn<String, String> sampleUsersFn = new DoFn<String, String>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ String user = c.element();
+ if (Math.abs(user.hashCode()) <= Integer.MAX_VALUE * samplingThreshold) {
+ c.output(user);
+ }
+ }
+ };
+
+ return input
+ .apply(ParDo.of(new ExtractUserAndTimestamp()))
+ .apply(ParDo.named("SampleUsers").of(sampleUsersFn))
+ .apply(new ComputeSessions())
+ .apply(ParDo.named("SessionsToStrings").of(new SessionsToStringsDoFn()))
+ .apply(new TopPerMonth())
+ .apply(ParDo.named("FormatOutput").of(new FormatOutputDoFn()));
+ }
+ }
+
+ /**
+ * Options for this example: {@code --input} (defaulted) and the required {@code --output}.
+ *
+ * <p>Extends {@link PipelineOptions}, so the standard pipeline options are accepted too.
+ */
+ private static interface Options extends PipelineOptions {
+ @Description(
+ "Input specified as a GCS path containing a BigQuery table exported as json")
+ @Default.String(EXPORTED_WIKI_TABLE)
+ String getInput();
+ void setInput(String value);
+
+ @Description("File to output results to")
+ @Validation.Required
+ String getOutput();
+ void setOutput(String value);
+ }
+
+ /** Builds the pipeline from the command-line {@code args} and runs it. */
+ public static void main(String[] args) {
+ Options options =
+ PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
+ Pipeline pipeline = Pipeline.create(options.as(DataflowPipelineOptions.class));
+
+ // Threshold handed to ComputeTopSessions for its "SampleUsers" step.
+ double samplingThreshold = 0.1;
+
+ // Read the input as JSON-coded TableRows, compute the top sessions, write them as text.
+ PCollection<TableRow> edits = pipeline.apply(
+ TextIO.Read.from(options.getInput()).withCoder(TableRowJsonCoder.of()));
+ PCollection<String> topSessions = edits.apply(new ComputeTopSessions(samplingThreshold));
+ topSessions.apply(
+ TextIO.Write.named("Write").withoutSharding().to(options.getOutput()));
+
+ pipeline.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficMaxLaneFlow.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficMaxLaneFlow.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficMaxLaneFlow.java
new file mode 100644
index 0000000..2d54252
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficMaxLaneFlow.java
@@ -0,0 +1,425 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete;
+
+import com.google.api.services.bigquery.model.TableFieldSchema;
+import com.google.api.services.bigquery.model.TableReference;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.TableSchema;
+import com.google.cloud.dataflow.examples.common.DataflowExampleOptions;
+import com.google.cloud.dataflow.examples.common.DataflowExampleUtils;
+import com.google.cloud.dataflow.examples.common.ExampleBigQueryTableOptions;
+import com.google.cloud.dataflow.examples.common.ExamplePubsubTopicAndSubscriptionOptions;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.PipelineResult;
+import com.google.cloud.dataflow.sdk.coders.AvroCoder;
+import com.google.cloud.dataflow.sdk.coders.DefaultCoder;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO;
+import com.google.cloud.dataflow.sdk.io.PubsubIO;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.transforms.Combine;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
+import com.google.cloud.dataflow.sdk.transforms.windowing.SlidingWindows;
+import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PBegin;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.common.base.Strings;
+
+import org.apache.avro.reflect.Nullable;
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+import org.joda.time.format.DateTimeFormat;
+import org.joda.time.format.DateTimeFormatter;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A Dataflow Example that runs in both batch and streaming modes with traffic sensor data.
+ * You can configure the running mode by setting {@literal --streaming} to true or false.
+ *
+ * <p>Concepts: The batch and streaming runners, sliding windows, Google Cloud Pub/Sub
+ * topic injection, use of the AvroCoder to encode a custom class, and custom Combine transforms.
+ *
+ * <p>This example analyzes traffic sensor data using SlidingWindows. For each window,
+ * it finds the lane that had the highest flow recorded, for each sensor station. It writes
+ * those max values along with auxiliary info to a BigQuery table.
+ *
+ * <p>In batch mode, the pipeline reads traffic sensor data from {@literal --inputFile}.
+ *
+ * <p>In streaming mode, the pipeline reads the data from a Pub/Sub topic.
+ * By default, the example will run a separate pipeline to inject the data from the default
+ * {@literal --inputFile} to the Pub/Sub {@literal --pubsubTopic}. It will make it available for
+ * the streaming pipeline to process. You may override the default {@literal --inputFile} with the
+ * file of your choosing. You may also set {@literal --inputFile} to an empty string, which will
+ * disable the automatic Pub/Sub injection, and allow you to use a separate tool to control the
+ * input to this example. Example code that publishes traffic sensor data to a Pub/Sub topic
+ * is provided in the
+ * <a href="https://github.com/GoogleCloudPlatform/cloud-pubsub-samples-python/tree/master/gce-cmdline-publisher">gce-cmdline-publisher sample</a>.
+ *
+ * <p>The example is configured to use the default Pub/Sub topic and the default BigQuery table
+ * from the example common package (there are no defaults for a general Dataflow pipeline).
+ * You can override them by using the {@literal --pubsubTopic}, {@literal --bigQueryDataset}, and
+ * {@literal --bigQueryTable} options. If the Pub/Sub topic or the BigQuery table do not exist,
+ * the example will try to create them.
+ *
+ * <p>The example will try to cancel the pipelines on the signal to terminate the process (CTRL-C)
+ * and then exits.
+ */
+public class TrafficMaxLaneFlow {
+
+ private static final String PUBSUB_TIMESTAMP_LABEL_KEY = "timestamp_ms";
+ private static final Integer VALID_INPUTS = 4999;
+
+ static final int WINDOW_DURATION = 60; // Default sliding window duration in minutes
+ static final int WINDOW_SLIDE_EVERY = 5; // Default window 'slide every' setting in minutes
+
+ /**
+ * Value holder for one lane of a station reading, together with general fields taken from the
+ * same reading. All fields are {@code @Nullable}; the default coder is {@link AvroCoder}.
+ */
+ @DefaultCoder(AvroCoder.class)
+ static class LaneInfo {
+ @Nullable String stationId;
+ @Nullable String lane;
+ @Nullable String direction;
+ @Nullable String freeway;
+ @Nullable String recordedTimestamp;
+ @Nullable Integer laneFlow;
+ @Nullable Integer totalFlow;
+ @Nullable Double laneAO;
+ @Nullable Double laneAS;
+
+ public LaneInfo() {}
+
+ public LaneInfo(
+ String stationId, String lane, String direction, String freeway, String timestamp,
+ Integer laneFlow, Double laneAO, Double laneAS, Integer totalFlow) {
+ this.stationId = stationId;
+ this.lane = lane;
+ this.direction = direction;
+ this.freeway = freeway;
+ recordedTimestamp = timestamp;
+ this.laneFlow = laneFlow;
+ this.laneAO = laneAO;
+ this.laneAS = laneAS;
+ this.totalFlow = totalFlow;
+ }
+
+ public String getStationId() {
+ return stationId;
+ }
+ public String getLane() {
+ return lane;
+ }
+ public String getDirection() {
+ return direction;
+ }
+ public String getFreeway() {
+ return freeway;
+ }
+ public String getRecordedTimestamp() {
+ return recordedTimestamp;
+ }
+ public Integer getLaneFlow() {
+ return laneFlow;
+ }
+ public Double getLaneAO() {
+ return laneAO;
+ }
+ public Double getLaneAS() {
+ return laneAS;
+ }
+ public Integer getTotalFlow() {
+ return totalFlow;
+ }
+ }
+
+ /**
+ * Re-emits each input line with its element timestamp parsed from the line's first CSV field.
+ */
+ static class ExtractTimestamps extends DoFn<String, String> {
+ private static final DateTimeFormatter dateTimeFormat =
+ DateTimeFormat.forPattern("MM/dd/yyyy HH:mm:ss");
+
+ @Override
+ public void processElement(DoFn<String, String>.ProcessContext c) throws Exception {
+ String[] fields = c.element().split(",");
+ if (fields.length == 0) {
+ return;
+ }
+ try {
+ c.outputWithTimestamp(c.element(), new Instant(dateTimeFormat.parseMillis(fields[0])));
+ } catch (IllegalArgumentException ignored) {
+ // Skip the invalid input.
+ }
+ }
+ }
+
+ /**
+ * Extract flow information for each of the 8 lanes in a reading, and output as separate tuples
+ * keyed by station id. This lets us determine which lane has the max flow for that station over
+ * the span of the window, and output the other information associated with that max flow.
+ * The number of lanes for which data is present depends upon which freeway the data point comes
+ * from; emission stops at the first lane whose flow, occupancy or speed parses to null.
+ */
+ static class ExtractFlowInfoFn extends DoFn<String, KV<String, LaneInfo>> {
+
+ @Override
+ public void processElement(ProcessContext c) {
+ String[] items = c.element().split(",");
+ if (items.length < 48) {
+ // Skip the invalid input.
+ return;
+ }
+ // Extract the sensor fields; the loop bound keeps items[8 + 5 * i] in range for lane 8.
+ String timestamp = items[0];
+ String stationId = items[1];
+ String freeway = items[2];
+ String direction = items[3];
+ Integer totalFlow = tryIntParse(items[7]);
+ for (int i = 1; i <= 8 && 8 + 5 * i < items.length; ++i) {
+ Integer laneFlow = tryIntParse(items[6 + 5 * i]);
+ Double laneAvgOccupancy = tryDoubleParse(items[7 + 5 * i]);
+ Double laneAvgSpeed = tryDoubleParse(items[8 + 5 * i]);
+ if (laneFlow == null || laneAvgOccupancy == null || laneAvgSpeed == null) {
+ return;
+ }
+ LaneInfo laneInfo = new LaneInfo(stationId, "lane" + i, direction, freeway, timestamp,
+ laneFlow, laneAvgOccupancy, laneAvgSpeed, totalFlow);
+ c.output(KV.of(stationId, laneInfo));
+ }
+ }
+ }
+
+ /**
+ * A custom 'combine function' used with the Combine.perKey transform. Used to find the max lane
+ * flow over all the data points in the Window. Extracts the lane flow from the input string and
+ * determines whether it's the max seen so far. We're using a custom combiner instead of the Max
+ * transform because we want to retain the additional information we've associated with the flow
+ * value.
+ */
+ public static class MaxFlow implements SerializableFunction<Iterable<LaneInfo>, LaneInfo> {
+   /**
+    * Returns the element with the largest non-negative lane flow. On ties the later element
+    * wins. If no element has a non-null flow of at least zero, a default-constructed
+    * {@code LaneInfo} is returned.
+    */
+   @Override
+   public LaneInfo apply(Iterable<LaneInfo> input) {
+     LaneInfo best = new LaneInfo();
+     Integer bestFlow = 0;
+     for (LaneInfo candidate : input) {
+       Integer candidateFlow = candidate.getLaneFlow();
+       if (candidateFlow == null || candidateFlow < bestFlow) {
+         continue;
+       }
+       bestFlow = candidateFlow;
+       best = candidate;
+     }
+     return best;
+   }
+ }
+
+ /**
+ * Format the results of the Max Lane flow calculation to a TableRow, to save to BigQuery.
+ * Add the timestamp from the window context.
+ */
+ static class FormatMaxesFn extends DoFn<KV<String, LaneInfo>, TableRow> {
+   /** Turns one {@code (stationId, LaneInfo)} pair into a row, adding the element timestamp. */
+   @Override
+   public void processElement(ProcessContext c) {
+     KV<String, LaneInfo> stationMax = c.element();
+     LaneInfo info = stationMax.getValue();
+
+     TableRow row = new TableRow();
+     row.set("station_id", stationMax.getKey());
+     row.set("direction", info.getDirection());
+     row.set("freeway", info.getFreeway());
+     row.set("lane_max_flow", info.getLaneFlow());
+     row.set("lane", info.getLane());
+     row.set("avg_occ", info.getLaneAO());
+     row.set("avg_speed", info.getLaneAS());
+     row.set("total_flow", info.getTotalFlow());
+     row.set("recorded_timestamp", info.getRecordedTimestamp());
+     row.set("window_timestamp", c.timestamp().toString());
+     c.output(row);
+   }
+
+   /** Defines the BigQuery schema used for the output. */
+   static TableSchema getSchema() {
+     List<TableFieldSchema> columns = new ArrayList<>();
+     columns.add(column("station_id", "STRING"));
+     columns.add(column("direction", "STRING"));
+     columns.add(column("freeway", "STRING"));
+     columns.add(column("lane_max_flow", "INTEGER"));
+     columns.add(column("lane", "STRING"));
+     columns.add(column("avg_occ", "FLOAT"));
+     columns.add(column("avg_speed", "FLOAT"));
+     columns.add(column("total_flow", "INTEGER"));
+     columns.add(column("window_timestamp", "TIMESTAMP"));
+     columns.add(column("recorded_timestamp", "STRING"));
+     return new TableSchema().setFields(columns);
+   }
+
+   /** Builds a single schema column with the given name and BigQuery type. */
+   private static TableFieldSchema column(String name, String type) {
+     return new TableFieldSchema().setName(name).setType(type);
+   }
+ }
+
+ /**
+ * This PTransform extracts lane info, calculates the max lane flow found for a given station (for
+ * the current Window) using a custom 'combiner', and formats the results for BigQuery.
+ */
+ static class MaxLaneFlow
+     extends PTransform<PCollection<KV<String, LaneInfo>>, PCollection<TableRow>> {
+   /** Combines the lane readings per station with {@link MaxFlow}, then formats them as rows. */
+   @Override
+   public PCollection<TableRow> apply(PCollection<KV<String, LaneInfo>> flowInfo) {
+     return flowInfo
+         // stationId, LaneInfo => stationId + max lane flow info
+         .apply(Combine.<String, LaneInfo>perKey(new MaxFlow()))
+         // <stationId, max lane flow info>... => row...
+         .apply(ParDo.of(new FormatMaxesFn()));
+   }
+ }
+
+ /** Reads lines from a text file and assigns each one the timestamp embedded in it. */
+ static class ReadFileAndExtractTimestamps extends PTransform<PBegin, PCollection<String>> {
+   /** Location of the text file to read. */
+   private final String inputFile;
+
+   public ReadFileAndExtractTimestamps(String inputFile) {
+     this.inputFile = inputFile;
+   }
+
+   @Override
+   public PCollection<String> apply(PBegin begin) {
+     PCollection<String> lines = begin.apply(TextIO.Read.from(inputFile));
+     return lines.apply(ParDo.of(new ExtractTimestamps()));
+   }
+ }
+
+ /**
+ * Options supported by {@link TrafficMaxLaneFlow}.
+ *
+ * <p>Inherits standard configuration options.
+ */
+ private interface TrafficMaxLaneFlowOptions extends DataflowExampleOptions,
+ ExamplePubsubTopicAndSubscriptionOptions, ExampleBigQueryTableOptions {
+ // File whose contents main() injects into the Pub/Sub topic. main() skips the injection
+ // when this (or the Pub/Sub topic) is null or empty.
+ @Description("Input file to inject to Pub/Sub topic")
+ @Default.String("gs://dataflow-samples/traffic_sensor/"
+ + "Freeways-5Minaa2010-01-01_to_2010-02-15_test2.csv")
+ String getInputFile();
+ void setInputFile(String value);
+
+ // Sliding window length in minutes; defaults to WINDOW_DURATION.
+ @Description("Numeric value of sliding window duration, in minutes")
+ @Default.Integer(WINDOW_DURATION)
+ Integer getWindowDuration();
+ void setWindowDuration(Integer value);
+
+ // Interval in minutes at which a new window starts; defaults to WINDOW_SLIDE_EVERY.
+ @Description("Numeric value of window 'slide every' setting, in minutes")
+ @Default.Integer(WINDOW_SLIDE_EVERY)
+ Integer getWindowSlideEvery();
+ void setWindowSlideEvery(Integer value);
+
+ // When false (the default), main() caps the Pub/Sub read at VALID_INPUTS records.
+ @Description("Whether to run the pipeline with unbounded input")
+ @Default.Boolean(false)
+ boolean isUnbounded();
+ void setUnbounded(boolean value);
+ }
+
+ /**
+ * Sets up and starts streaming pipeline.
+ *
+ * @throws IOException if there is a problem setting up resources
+ */
+ public static void main(String[] args) throws IOException {
+ TrafficMaxLaneFlowOptions options = PipelineOptionsFactory.fromArgs(args)
+ .withValidation()
+ .as(TrafficMaxLaneFlowOptions.class);
+ options.setBigQuerySchema(FormatMaxesFn.getSchema());
+ // Using DataflowExampleUtils to set up required resources.
+ DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options, options.isUnbounded());
+
+ Pipeline pipeline = Pipeline.create(options);
+ // Destination table, assembled from the project/dataset/table options.
+ TableReference tableRef = new TableReference();
+ tableRef.setProjectId(options.getProject());
+ tableRef.setDatasetId(options.getBigQueryDataset());
+ tableRef.setTableId(options.getBigQueryTable());
+
+ PCollection<String> input;
+ if (options.isUnbounded()) {
+ // Read an unbounded stream from the Pub/Sub subscription.
+ input = pipeline.apply(PubsubIO.Read
+ .timestampLabel(PUBSUB_TIMESTAMP_LABEL_KEY)
+ .subscription(options.getPubsubSubscription()));
+ } else {
+ // Read from the Pub/Sub subscription, but stop after VALID_INPUTS records.
+ input = pipeline.apply(PubsubIO.Read
+ .timestampLabel(PUBSUB_TIMESTAMP_LABEL_KEY)
+ .subscription(options.getPubsubSubscription()).maxNumRecords(VALID_INPUTS));
+
+ // To read bounded TextIO files, use:
+ // input = pipeline.apply(new ReadFileAndExtractTimestamps(options.getInputFile()));
+ }
+ input
+ // row... => <stationId, lane info> ...
+ .apply(ParDo.of(new ExtractFlowInfoFn()))
+ // map the incoming data stream into sliding windows. The default window duration values
+ // work well if you're running the accompanying Pub/Sub generator script with the
+ // --replay flag, which simulates pauses in the sensor data publication. You may want to
+ // adjust them otherwise.
+ .apply(Window.<KV<String, LaneInfo>>into(SlidingWindows.of(
+ Duration.standardMinutes(options.getWindowDuration())).
+ every(Duration.standardMinutes(options.getWindowSlideEvery()))))
+ .apply(new MaxLaneFlow())
+ .apply(BigQueryIO.Write.to(tableRef)
+ .withSchema(FormatMaxesFn.getSchema()));
+
+ // Inject the data into the Pub/Sub topic with a Dataflow batch pipeline.
+ // Skipped when either the input file or the topic is null or empty.
+ if (!Strings.isNullOrEmpty(options.getInputFile())
+ && !Strings.isNullOrEmpty(options.getPubsubTopic())) {
+ dataflowUtils.runInjectorPipeline(
+ new ReadFileAndExtractTimestamps(options.getInputFile()),
+ options.getPubsubTopic(),
+ PUBSUB_TIMESTAMP_LABEL_KEY);
+ }
+
+ // Run the pipeline.
+ PipelineResult result = pipeline.run();
+
+ // dataflowUtils will try to cancel the pipeline and the injector before the program exits.
+ dataflowUtils.waitToFinish(result);
+ }
+
+ /**
+  * Parses {@code number} as a decimal integer.
+  *
+  * @return the parsed value, or {@code null} if the text is not a valid integer
+  */
+ private static Integer tryIntParse(String number) {
+   Integer parsed;
+   try {
+     parsed = Integer.valueOf(number);
+   } catch (NumberFormatException e) {
+     parsed = null;
+   }
+   return parsed;
+ }
+
+ /**
+  * Parses {@code number} as a floating-point value.
+  *
+  * @return the parsed value, or {@code null} if the text is not a valid number
+  */
+ private static Double tryDoubleParse(String number) {
+   Double parsed;
+   try {
+     parsed = Double.valueOf(number);
+   } catch (NumberFormatException e) {
+     parsed = null;
+   }
+   return parsed;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficRoutes.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficRoutes.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficRoutes.java
new file mode 100644
index 0000000..e3e88c2
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficRoutes.java
@@ -0,0 +1,459 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete;
+
+import com.google.api.services.bigquery.model.TableFieldSchema;
+import com.google.api.services.bigquery.model.TableReference;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.TableSchema;
+import com.google.cloud.dataflow.examples.common.DataflowExampleOptions;
+import com.google.cloud.dataflow.examples.common.DataflowExampleUtils;
+import com.google.cloud.dataflow.examples.common.ExampleBigQueryTableOptions;
+import com.google.cloud.dataflow.examples.common.ExamplePubsubTopicAndSubscriptionOptions;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.PipelineResult;
+import com.google.cloud.dataflow.sdk.coders.AvroCoder;
+import com.google.cloud.dataflow.sdk.coders.DefaultCoder;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO;
+import com.google.cloud.dataflow.sdk.io.PubsubIO;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.windowing.SlidingWindows;
+import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PBegin;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.common.base.Strings;
+import com.google.common.collect.Lists;
+
+import org.apache.avro.reflect.Nullable;
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+import org.joda.time.format.DateTimeFormat;
+import org.joda.time.format.DateTimeFormatter;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Hashtable;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A Dataflow Example that runs in both batch and streaming modes with traffic sensor data.
+ * You can configure the running mode by setting {@literal --streaming} to true or false.
+ *
+ * <p>Concepts: The batch and streaming runners, GroupByKey, sliding windows, and
+ * Google Cloud Pub/Sub topic injection.
+ *
+ * <p>This example analyzes traffic sensor data using SlidingWindows. For each window,
+ * it calculates the average speed over the window for some small set of predefined 'routes',
+ * and looks for 'slowdowns' in those routes. It writes its results to a BigQuery table.
+ *
+ * <p>In batch mode, the pipeline reads traffic sensor data from {@literal --inputFile}.
+ *
+ * <p>In streaming mode, the pipeline reads the data from a Pub/Sub topic.
+ * By default, the example will run a separate pipeline to inject the data from the default
+ * {@literal --inputFile} to the Pub/Sub {@literal --pubsubTopic}. It will make it available for
+ * the streaming pipeline to process. You may override the default {@literal --inputFile} with the
+ * file of your choosing. You may also set {@literal --inputFile} to an empty string, which will
+ * disable the automatic Pub/Sub injection, and allow you to use a separate tool to control the
+ * input to this example. Example code that publishes traffic sensor data to a Pub/Sub topic
+ * is provided in
+ * <a href="https://github.com/GoogleCloudPlatform/cloud-pubsub-samples-python/tree/master/gce-cmdline-publisher">the gce-cmdline-publisher sample</a>.
+ *
+ * <p>The example is configured to use the default Pub/Sub topic and the default BigQuery table
+ * from the example common package (there are no defaults for a general Dataflow pipeline).
+ * You can override them by using the {@literal --pubsubTopic}, {@literal --bigQueryDataset}, and
+ * {@literal --bigQueryTable} options. If the Pub/Sub topic or the BigQuery table do not exist,
+ * the example will try to create them.
+ *
+ * <p>The example will try to cancel the pipelines on the signal to terminate the process (CTRL-C)
+ * and then exits.
+ */
+
+public class TrafficRoutes {
+
+ // Label key passed to PubsubIO.Read.timestampLabel(...) and to the injector pipeline.
+ private static final String PUBSUB_TIMESTAMP_LABEL_KEY = "timestamp_ms";
+ // Cap passed to maxNumRecords(...) when the pipeline runs with bounded input.
+ private static final Integer VALID_INPUTS = 4999;
+
+ // Instantiate some small predefined San Diego routes to analyze
+ // (maps sensor station id => route name; see buildStationInfo()).
+ static Map<String, String> sdStations = buildStationInfo();
+ static final int WINDOW_DURATION = 3; // Default sliding window duration in minutes
+ static final int WINDOW_SLIDE_EVERY = 1; // Default window 'slide every' setting in minutes
+
+ /**
+ * This class holds information about a station reading's average speed.
+ */
+ @DefaultCoder(AvroCoder.class)
+ static class StationSpeed implements Comparable<StationSpeed> {
+ @Nullable String stationId;
+ @Nullable Double avgSpeed;
+ @Nullable Long timestamp;
+
+ public StationSpeed() {}
+
+ public StationSpeed(String stationId, Double avgSpeed, Long timestamp) {
+ this.stationId = stationId;
+ this.avgSpeed = avgSpeed;
+ this.timestamp = timestamp;
+ }
+
+ public String getStationId() {
+ return this.stationId;
+ }
+ public Double getAvgSpeed() {
+ return this.avgSpeed;
+ }
+
+ @Override
+ public int compareTo(StationSpeed other) {
+ return Long.compare(this.timestamp, other.timestamp);
+ }
+ }
+
+ /**
+ * This class holds information about a route's speed/slowdown.
+ */
+ @DefaultCoder(AvroCoder.class)
+ static class RouteInfo {
+   /** Name of the route these statistics describe. */
+   @Nullable String route;
+   /** Average speed computed for the route. */
+   @Nullable Double avgSpeed;
+   /** Whether the route was classified as slowing down. */
+   @Nullable Boolean slowdownEvent;
+
+   /** No-argument constructor; leaves every field {@code null}. */
+   public RouteInfo() {}
+
+   public RouteInfo(String route, Double avgSpeed, Boolean slowdownEvent) {
+     this.route = route;
+     this.avgSpeed = avgSpeed;
+     this.slowdownEvent = slowdownEvent;
+   }
+
+   public String getRoute() {
+     return route;
+   }
+
+   public Double getAvgSpeed() {
+     return avgSpeed;
+   }
+
+   public Boolean getSlowdownEvent() {
+     return slowdownEvent;
+   }
+ }
+
+ /**
+ * Extract the timestamp field from the input string, and use it as the element timestamp.
+ */
+ static class ExtractTimestamps extends DoFn<String, String> {
+   /** Format of the timestamp held in the first comma-separated field of each line. */
+   private static final DateTimeFormatter dateTimeFormat =
+       DateTimeFormat.forPattern("MM/dd/yyyy HH:mm:ss");
+
+   /**
+    * Re-emits the input line unchanged, using the time parsed from its timestamp field as
+    * the element timestamp. Lines without a timestamp field, or whose timestamp is rejected
+    * with an {@link IllegalArgumentException}, are dropped.
+    */
+   @Override
+   public void processElement(DoFn<String, String>.ProcessContext c) throws Exception {
+     String rawTimestamp = tryParseTimestamp(c.element().split(","));
+     if (rawTimestamp == null) {
+       return;
+     }
+     try {
+       long millis = dateTimeFormat.parseMillis(rawTimestamp);
+       c.outputWithTimestamp(c.element(), new Instant(millis));
+     } catch (IllegalArgumentException e) {
+       // Skip the invalid input.
+     }
+   }
+ }
+
+ /**
+ * Filter out readings for the stations along predefined 'routes', and output
+ * (station, speed info) keyed on route.
+ */
+ static class ExtractStationSpeedFn extends DoFn<String, KV<String, StationSpeed>> {
+
+   /**
+    * Emits {@code (route, StationSpeed)} for readings from 'ML' stations that belong to one
+    * of the routes in {@code sdStations}; every other reading is dropped.
+    */
+   @Override
+   public void processElement(ProcessContext c) {
+     String[] fields = c.element().split(",");
+     // For this analysis, use only 'main line' station types
+     if (!"ML".equals(tryParseStationType(fields))) {
+       return;
+     }
+     Double avgSpeed = tryParseAvgSpeed(fields);
+     String stationId = tryParseStationId(fields);
+     // For this simple example, filter out everything but some hardwired routes.
+     if (avgSpeed == null || stationId == null || !sdStations.containsKey(stationId)) {
+       return;
+     }
+     StationSpeed reading = new StationSpeed(stationId, avgSpeed, c.timestamp().getMillis());
+     // The tuple key is the 'route' name stored in the 'sdStations' hash.
+     c.output(KV.of(sdStations.get(stationId), reading));
+   }
+ }
+
+ /**
+ * For a given route, track average speed for the window. Calculate whether
+ * traffic is currently slowing down, via a predefined threshold. If a supermajority of
+ * speeds in this sliding window are less than the previous reading we call this a 'slowdown'.
+ * Note: these calculations are for example purposes only, and are unrealistic and oversimplified.
+ */
+ static class GatherStats
+     extends DoFn<KV<String, Iterable<StationSpeed>>, KV<String, RouteInfo>> {
+   /**
+    * Emits one {@code RouteInfo} per route: the mean of all non-null speeds, plus a slowdown
+    * flag. Each non-null reading is compared with the previous non-null reading of the same
+    * station; a strictly higher speed counts as a speed-up, anything else as a slowdown.
+    * Nothing is emitted when the route has no non-null speeds.
+    */
+   @Override
+   public void processElement(ProcessContext c) throws IOException {
+     String route = c.element().getKey();
+     // StationSpeeds sort by embedded timestamp.
+     List<StationSpeed> readings = Lists.newArrayList(c.element().getValue());
+     Collections.sort(readings);
+
+     Map<String, Double> lastSpeedByStation = new HashMap<>();
+     double total = 0.0;
+     int samples = 0;
+     int faster = 0;
+     int notFaster = 0;
+     for (StationSpeed reading : readings) {
+       Double speed = reading.getAvgSpeed();
+       if (speed == null) {
+         continue;
+       }
+       total += speed;
+       samples++;
+       // put() hands back the station's previous speed, or null for its first reading.
+       Double previous = lastSpeedByStation.put(reading.getStationId(), speed);
+       if (previous == null) {
+         continue;
+       }
+       if (previous < speed) {
+         faster++;
+       } else {
+         notFaster++;
+       }
+     }
+     if (samples == 0) {
+       // No average to compute.
+       return;
+     }
+     // Note: with no comparisons at all (0 >= 2 * 0) the flag is also true.
+     boolean slowdownEvent = notFaster >= 2 * faster;
+     c.output(KV.of(route, new RouteInfo(route, total / samples, slowdownEvent)));
+   }
+ }
+
+ /**
+ * Format the results of the slowdown calculations to a TableRow, to save to BigQuery.
+ */
+ static class FormatStatsFn extends DoFn<KV<String, RouteInfo>, TableRow> {
+   /** Turns one {@code (route, RouteInfo)} pair into a row, adding the element timestamp. */
+   @Override
+   public void processElement(ProcessContext c) {
+     KV<String, RouteInfo> routeStats = c.element();
+     RouteInfo info = routeStats.getValue();
+
+     TableRow row = new TableRow();
+     row.set("avg_speed", info.getAvgSpeed());
+     row.set("slowdown_event", info.getSlowdownEvent());
+     row.set("route", routeStats.getKey());
+     row.set("window_timestamp", c.timestamp().toString());
+     c.output(row);
+   }
+
+   /**
+    * Defines the BigQuery schema used for the output.
+    */
+   static TableSchema getSchema() {
+     List<TableFieldSchema> columns = new ArrayList<>();
+     columns.add(column("route", "STRING"));
+     columns.add(column("avg_speed", "FLOAT"));
+     columns.add(column("slowdown_event", "BOOLEAN"));
+     columns.add(column("window_timestamp", "TIMESTAMP"));
+     return new TableSchema().setFields(columns);
+   }
+
+   /** Builds a single schema column with the given name and BigQuery type. */
+   private static TableFieldSchema column(String name, String type) {
+     return new TableFieldSchema().setName(name).setType(type);
+   }
+ }
+
+ /**
+ * This PTransform extracts speed info from traffic station readings.
+ * It groups the readings by 'route' and analyzes traffic slowdown for that route.
+ * Lastly, it formats the results for BigQuery.
+ */
+ static class TrackSpeed extends
+     PTransform<PCollection<KV<String, StationSpeed>>, PCollection<TableRow>> {
+   /** Groups the readings by route, computes per-route statistics and formats them as rows. */
+   @Override
+   public PCollection<TableRow> apply(PCollection<KV<String, StationSpeed>> stationSpeed) {
+     return stationSpeed
+         // Apply a GroupByKey transform to collect a list of all station
+         // readings for a given route.
+         .apply(GroupByKey.<String, StationSpeed>create())
+         // Analyze 'slowdown' over the route readings.
+         .apply(ParDo.of(new GatherStats()))
+         // Format the results for writing to BigQuery
+         .apply(ParDo.of(new FormatStatsFn()));
+   }
+ }
+
+ /** Reads lines from a text file and assigns each one the timestamp embedded in it. */
+ static class ReadFileAndExtractTimestamps extends PTransform<PBegin, PCollection<String>> {
+   /** Location of the text file to read. */
+   private final String inputFile;
+
+   public ReadFileAndExtractTimestamps(String inputFile) {
+     this.inputFile = inputFile;
+   }
+
+   @Override
+   public PCollection<String> apply(PBegin begin) {
+     PCollection<String> lines = begin.apply(TextIO.Read.from(inputFile));
+     return lines.apply(ParDo.of(new ExtractTimestamps()));
+   }
+ }
+
+ /**
+ * Options supported by {@link TrafficRoutes}.
+ *
+ * <p>Inherits standard configuration options.
+ */
+ private interface TrafficRoutesOptions extends DataflowExampleOptions,
+ ExamplePubsubTopicAndSubscriptionOptions, ExampleBigQueryTableOptions {
+ // File whose contents main() injects into the Pub/Sub topic. main() skips the injection
+ // when this (or the Pub/Sub topic) is null or empty.
+ @Description("Input file to inject to Pub/Sub topic")
+ @Default.String("gs://dataflow-samples/traffic_sensor/"
+ + "Freeways-5Minaa2010-01-01_to_2010-02-15_test2.csv")
+ String getInputFile();
+ void setInputFile(String value);
+
+ // Sliding window length in minutes; defaults to WINDOW_DURATION.
+ @Description("Numeric value of sliding window duration, in minutes")
+ @Default.Integer(WINDOW_DURATION)
+ Integer getWindowDuration();
+ void setWindowDuration(Integer value);
+
+ // Interval in minutes at which a new window starts; defaults to WINDOW_SLIDE_EVERY.
+ @Description("Numeric value of window 'slide every' setting, in minutes")
+ @Default.Integer(WINDOW_SLIDE_EVERY)
+ Integer getWindowSlideEvery();
+ void setWindowSlideEvery(Integer value);
+
+ // When false (the default), main() caps the Pub/Sub read at VALID_INPUTS records.
+ @Description("Whether to run the pipeline with unbounded input")
+ @Default.Boolean(false)
+ boolean isUnbounded();
+ void setUnbounded(boolean value);
+ }
+
+ /**
+ * Sets up and starts streaming pipeline.
+ *
+ * @throws IOException if there is a problem setting up resources
+ */
+ public static void main(String[] args) throws IOException {
+ TrafficRoutesOptions options = PipelineOptionsFactory.fromArgs(args)
+ .withValidation()
+ .as(TrafficRoutesOptions.class);
+
+ options.setBigQuerySchema(FormatStatsFn.getSchema());
+ // Using DataflowExampleUtils to set up required resources.
+ DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options, options.isUnbounded());
+
+ Pipeline pipeline = Pipeline.create(options);
+ // Destination table, assembled from the project/dataset/table options.
+ TableReference tableRef = new TableReference();
+ tableRef.setProjectId(options.getProject());
+ tableRef.setDatasetId(options.getBigQueryDataset());
+ tableRef.setTableId(options.getBigQueryTable());
+
+ PCollection<String> input;
+ if (options.isUnbounded()) {
+ // Read an unbounded stream from the Pub/Sub subscription.
+ input = pipeline.apply(PubsubIO.Read
+ .timestampLabel(PUBSUB_TIMESTAMP_LABEL_KEY)
+ .subscription(options.getPubsubSubscription()));
+ } else {
+ // Read from the Pub/Sub subscription, but stop after VALID_INPUTS records.
+ input = pipeline.apply(PubsubIO.Read
+ .timestampLabel(PUBSUB_TIMESTAMP_LABEL_KEY)
+ .subscription(options.getPubsubSubscription()).maxNumRecords(VALID_INPUTS));
+
+ // To read bounded TextIO files, use:
+ // input = pipeline.apply(TextIO.Read.from(options.getInputFile()))
+ // .apply(ParDo.of(new ExtractTimestamps()));
+ }
+ input
+ // row... => <station route, station speed> ...
+ .apply(ParDo.of(new ExtractStationSpeedFn()))
+ // map the incoming data stream into sliding windows.
+ // The default window duration values work well if you're running the accompanying Pub/Sub
+ // generator script without the --replay flag, so that there are no simulated pauses in
+ // the sensor data publication. You may want to adjust the values otherwise.
+ .apply(Window.<KV<String, StationSpeed>>into(SlidingWindows.of(
+ Duration.standardMinutes(options.getWindowDuration())).
+ every(Duration.standardMinutes(options.getWindowSlideEvery()))))
+ .apply(new TrackSpeed())
+ .apply(BigQueryIO.Write.to(tableRef)
+ .withSchema(FormatStatsFn.getSchema()));
+
+ // Inject the data into the Pub/Sub topic with a Dataflow batch pipeline.
+ // Skipped when either the input file or the topic is null or empty.
+ if (!Strings.isNullOrEmpty(options.getInputFile())
+ && !Strings.isNullOrEmpty(options.getPubsubTopic())) {
+ dataflowUtils.runInjectorPipeline(
+ new ReadFileAndExtractTimestamps(options.getInputFile()),
+ options.getPubsubTopic(),
+ PUBSUB_TIMESTAMP_LABEL_KEY);
+ }
+
+ // Run the pipeline.
+ PipelineResult result = pipeline.run();
+
+ // dataflowUtils will try to cancel the pipeline and the injector before the program exits.
+ dataflowUtils.waitToFinish(result);
+ }
+
+ /**
+  * Reads the average-speed column (index 9) of a split reading.
+  *
+  * @param inputItems the comma-separated fields of one reading
+  * @return the parsed speed, or {@code null} if the column is missing or not a number
+  */
+ private static Double tryParseAvgSpeed(String[] inputItems) {
+   // Explicit null checks replace the former catch of NullPointerException.
+   if (inputItems == null) {
+     return null;
+   }
+   String rawSpeed = tryParseString(inputItems, 9);
+   if (rawSpeed == null) {
+     return null;
+   }
+   try {
+     return Double.parseDouble(rawSpeed);
+   } catch (NumberFormatException e) {
+     return null;
+   }
+ }
+
+ /** Returns the station-type column (index 4) as extracted by {@code tryParseString}. */
+ private static String tryParseStationType(String[] inputItems) {
+ return tryParseString(inputItems, 4);
+ }
+
+ /** Returns the station-id column (index 1) as extracted by {@code tryParseString}. */
+ private static String tryParseStationId(String[] inputItems) {
+ return tryParseString(inputItems, 1);
+ }
+
+ /** Returns the timestamp column (index 0) as extracted by {@code tryParseString}. */
+ private static String tryParseTimestamp(String[] inputItems) {
+ return tryParseString(inputItems, 0);
+ }
+
+ /**
+  * Returns the field at {@code index}, or {@code null} when the row has no such field.
+  *
+  * @param inputItems the comma-separated fields of one reading
+  * @param index zero-based column to read
+  */
+ private static String tryParseString(String[] inputItems, int index) {
+   // A valid index is strictly less than the length; the former "length >= index" test
+   // let index == length through and threw ArrayIndexOutOfBoundsException.
+   return index >= 0 && index < inputItems.length ? inputItems[index] : null;
+ }
+
+ /**
+ * Define some small hard-wired San Diego 'routes' to track based on sensor station ID.
+ */
+ /**
+  * Builds the lookup table from sensor station id to route name.
+  *
+  * @return an unmodifiable map, so the shared table cannot be altered after class
+  *     initialization
+  */
+ private static Map<String, String> buildStationInfo() {
+   // HashMap replaces the legacy synchronized Hashtable: the table is only read after
+   // it has been built.
+   Map<String, String> stations = new HashMap<>();
+   stations.put("1108413", "SDRoute1"); // from freeway 805 S
+   stations.put("1108699", "SDRoute2"); // from freeway 78 E
+   stations.put("1108702", "SDRoute2");
+   return Collections.unmodifiableMap(stations);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoes.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoes.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoes.java
new file mode 100644
index 0000000..503bcad
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoes.java
@@ -0,0 +1,179 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.cookbook;
+
+import com.google.api.services.bigquery.model.TableFieldSchema;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.TableSchema;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.options.Validation;
+import com.google.cloud.dataflow.sdk.transforms.Count;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * An example that reads the public samples of weather data from BigQuery, counts the number of
+ * tornadoes that occur in each month, and writes the results to BigQuery.
+ *
+ * <p>Concepts: Reading/writing BigQuery; counting a PCollection; user-defined PTransforms
+ *
+ * <p>Note: Before running this example, you must create a BigQuery dataset to contain your output
+ * table.
+ *
+ * <p>To execute this pipeline locally, specify general pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * }
+ * </pre>
+ * and the BigQuery table for the output, with the form
+ * <pre>{@code
+ * --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
+ * }</pre>
+ *
+ * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=BlockingDataflowPipelineRunner
+ * }
+ * </pre>
+ * and the BigQuery table for the output:
+ * <pre>{@code
+ * --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
+ * }</pre>
+ *
+ * <p>The BigQuery input table defaults to {@code clouddataflow-readonly:samples.weather_stations}
+ * and can be overridden with {@code --input}.
+ */
+public class BigQueryTornadoes {
+ // Default to using a 1000 row subset of the public weather station table publicdata:samples.gsod.
+ private static final String WEATHER_SAMPLES_TABLE =
+ "clouddataflow-readonly:samples.weather_stations";
+
+ /**
+ * Examines each row in the input table. If a tornado was recorded
+ * in that sample, the month in which it occurred is output.
+ */
+ static class ExtractTornadoesFn extends DoFn<TableRow, Integer> {
+   /**
+    * Outputs the row's month when its "tornado" value is true. Rows whose "tornado" value
+    * is missing or null are skipped.
+    */
+   @Override
+   public void processElement(ProcessContext c) {
+     TableRow row = c.element();
+     // Check for null before unboxing: "if ((Boolean) null)" throws NullPointerException.
+     Object tornado = row.get("tornado");
+     if (tornado != null && (Boolean) tornado) {
+       c.output(Integer.parseInt((String) row.get("month")));
+     }
+   }
+ }
+
+ /**
+ * Prepares the data for writing to BigQuery by building a TableRow object containing an
+ * integer representation of month and the number of tornadoes that occurred in each month.
+ */
+ static class FormatCountsFn extends DoFn<KV<Integer, Long>, TableRow> {
+   /** Converts one {@code (month, count)} pair into a row with the matching two columns. */
+   @Override
+   public void processElement(ProcessContext c) {
+     KV<Integer, Long> monthCount = c.element();
+     TableRow row = new TableRow();
+     row.set("month", monthCount.getKey());
+     row.set("tornado_count", monthCount.getValue());
+     c.output(row);
+   }
+ }
+
+ /**
+ * Takes rows from a table and generates a table of counts.
+ *
+ * <p>The input schema is described by
+ * https://developers.google.com/bigquery/docs/dataset-gsod .
+ * The output contains the total number of tornadoes found in each month in
+ * the following schema:
+ * <ul>
+ * <li>month: integer</li>
+ * <li>tornado_count: integer</li>
+ * </ul>
+ */
+ static class CountTornadoes
+     extends PTransform<PCollection<TableRow>, PCollection<TableRow>> {
+   /** Extracts tornado months, counts the occurrences of each, and formats the counts as rows. */
+   @Override
+   public PCollection<TableRow> apply(PCollection<TableRow> rows) {
+     return rows
+         // row... => month...
+         .apply(ParDo.of(new ExtractTornadoesFn()))
+         // month... => <month,count>...
+         .apply(Count.<Integer>perElement())
+         // <month,count>... => row...
+         .apply(ParDo.of(new FormatCountsFn()));
+   }
+ }
+
+ /**
+ * Options supported by {@link BigQueryTornadoes}.
+ *
+ * <p>Inherits standard configuration options.
+ */
+ private static interface Options extends PipelineOptions {
+ // Source table; defaults to WEATHER_SAMPLES_TABLE.
+ @Description("Table to read from, specified as "
+ + "<project_id>:<dataset_id>.<table_id>")
+ @Default.String(WEATHER_SAMPLES_TABLE)
+ String getInput();
+ void setInput(String value);
+
+ // Destination table; marked required and has no default.
+ @Description("BigQuery table to write to, specified as "
+ + "<project_id>:<dataset_id>.<table_id>. The dataset must already exist.")
+ @Validation.Required
+ String getOutput();
+ void setOutput(String value);
+ }
+
+ /**
+  * Reads the input table, counts tornadoes per month, and writes the counts to the output
+  * table, creating it if needed and truncating any existing contents.
+  */
+ public static void main(String[] args) {
+   Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
+   Pipeline pipeline = Pipeline.create(options);
+
+   // Build the table schema for the output table.
+   List<TableFieldSchema> columns = new ArrayList<>();
+   columns.add(new TableFieldSchema().setName("month").setType("INTEGER"));
+   columns.add(new TableFieldSchema().setName("tornado_count").setType("INTEGER"));
+   TableSchema schema = new TableSchema().setFields(columns);
+
+   PCollection<TableRow> monthlyCounts = pipeline
+       .apply(BigQueryIO.Read.from(options.getInput()))
+       .apply(new CountTornadoes());
+
+   monthlyCounts.apply(BigQueryIO.Write
+       .to(options.getOutput())
+       .withSchema(schema)
+       .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
+       .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
+
+   pipeline.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamples.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamples.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamples.java
new file mode 100644
index 0000000..9540dd4
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamples.java
@@ -0,0 +1,223 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.cookbook;
+
+import com.google.api.services.bigquery.model.TableFieldSchema;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.TableSchema;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.options.Validation;
+import com.google.cloud.dataflow.sdk.transforms.Aggregator;
+import com.google.cloud.dataflow.sdk.transforms.Combine;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
+import com.google.cloud.dataflow.sdk.transforms.Sum;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * An example that reads the public 'Shakespeare' data, and for each word in
+ * the dataset that is over a given length, generates a string containing the
+ * list of play names in which that word appears, and saves this information
+ * to a bigquery table.
+ *
+ * <p>Concepts: the Combine.perKey transform, which lets you combine the values in a
+ * key-grouped Collection, and how to use an Aggregator to track information in the
+ * Monitoring UI.
+ *
+ * <p>Note: Before running this example, you must create a BigQuery dataset to contain your output
+ * table.
+ *
+ * <p>To execute this pipeline locally, specify general pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * }
+ * </pre>
+ * and the BigQuery table for the output:
+ * <pre>{@code
+ * --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
+ * }</pre>
+ *
+ * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://<STAGING DIRECTORY>
+ * --runner=BlockingDataflowPipelineRunner
+ * }
+ * </pre>
+ * and the BigQuery table for the output:
+ * <pre>{@code
+ * --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
+ * }</pre>
+ *
+ * <p>The BigQuery input table defaults to {@code publicdata:samples.shakespeare} and can
+ * be overridden with {@code --input}.
+ */
+public class CombinePerKeyExamples {
+ // Use the shakespeare public BigQuery sample
+ private static final String SHAKESPEARE_TABLE =
+ "publicdata:samples.shakespeare";
+ // We'll track words >= this word length across all plays in the table.
+ private static final int MIN_WORD_LENGTH = 9;
+
+ /**
+ * Examines each row in the input table. If the word's length is at least MIN_WORD_LENGTH,
+ * outputs the pair (word, play_name); otherwise counts the word in the smallerWords aggregator.
+ */
+ static class ExtractLargeWordsFn extends DoFn<TableRow, KV<String, String>> {
+ private final Aggregator<Long, Long> smallerWords =
+ createAggregator("smallerWords", new Sum.SumLongFn());
+
+ @Override
+ public void processElement(ProcessContext c){
+ TableRow row = c.element();
+ String playName = (String) row.get("corpus");
+ String word = (String) row.get("word");
+ if (word.length() >= MIN_WORD_LENGTH) {
+ c.output(KV.of(word, playName));
+ } else {
+ // Track how many smaller words we're not including. This information will be
+ // visible in the Monitoring UI.
+ smallerWords.addValue(1L);
+ }
+ }
+ }
+
+
+ /**
+ * Prepares the data for writing to BigQuery by building a TableRow object
+ * containing a word with a string listing the plays in which it appeared.
+ */
+ static class FormatShakespeareOutputFn extends DoFn<KV<String, String>, TableRow> {
+ @Override
+ public void processElement(ProcessContext c) {
+ TableRow row = new TableRow()
+ .set("word", c.element().getKey())
+ .set("all_plays", c.element().getValue());
+ c.output(row);
+ }
+ }
+
+ /**
+ * Reads the public 'Shakespeare' data, and for each word in the dataset
+ * over a given length, generates a string containing the list of play names
+ * in which that word appears. It does this via the Combine.perKey
+ * transform, with the ConcatWords combine function.
+ *
+ * <p>Combine.perKey is similar to a GroupByKey followed by a ParDo, but
+ * has more restricted semantics that allow it to be executed more
+ * efficiently. These records are then formatted as BQ table rows.
+ */
+ static class PlaysForWord
+ extends PTransform<PCollection<TableRow>, PCollection<TableRow>> {
+ @Override
+ public PCollection<TableRow> apply(PCollection<TableRow> rows) {
+
+ // row... => <word, play_name> ...
+ PCollection<KV<String, String>> words = rows.apply(
+ ParDo.of(new ExtractLargeWordsFn()));
+
+ // word, play_name => word, all_plays ...
+ PCollection<KV<String, String>> wordAllPlays =
+ words.apply(Combine.<String, String>perKey(
+ new ConcatWords()));
+
+ // <word, all_plays>... => row...
+ PCollection<TableRow> results = wordAllPlays.apply(
+ ParDo.of(new FormatShakespeareOutputFn()));
+
+ return results;
+ }
+ }
+
+ /**
+ * A 'combine function' used with the Combine.perKey transform. Builds a
+ * comma-separated string of all input items. So, it will build a string
+ * containing all the different Shakespeare plays in which the given input
+ * word has appeared.
+ */
+ public static class ConcatWords implements SerializableFunction<Iterable<String>, String> {
+ @Override
+ public String apply(Iterable<String> input) {
+ StringBuilder all = new StringBuilder();
+ for (String item : input) {
+ if (!item.isEmpty()) {
+ if (all.length() == 0) {
+ all.append(item);
+ } else {
+ all.append(",");
+ all.append(item);
+ }
+ }
+ }
+ return all.toString();
+ }
+ }
+
+ /**
+ * Options supported by {@link CombinePerKeyExamples}.
+ *
+ * <p>Inherits standard configuration options.
+ */
+ private static interface Options extends PipelineOptions {
+ @Description("Table to read from, specified as "
+ + "<project_id>:<dataset_id>.<table_id>")
+ @Default.String(SHAKESPEARE_TABLE)
+ String getInput();
+ void setInput(String value);
+
+ @Description("Table to write to, specified as "
+ + "<project_id>:<dataset_id>.<table_id>. "
+ + "The dataset_id must already exist")
+ @Validation.Required
+ String getOutput();
+ void setOutput(String value);
+ }
+
+ public static void main(String[] args)
+ throws Exception {
+
+ Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
+ Pipeline p = Pipeline.create(options);
+
+ // Build the table schema for the output table.
+ List<TableFieldSchema> fields = new ArrayList<>();
+ fields.add(new TableFieldSchema().setName("word").setType("STRING"));
+ fields.add(new TableFieldSchema().setName("all_plays").setType("STRING"));
+ TableSchema schema = new TableSchema().setFields(fields);
+
+ p.apply(BigQueryIO.Read.from(options.getInput()))
+ .apply(new PlaysForWord())
+ .apply(BigQueryIO.Write
+ .to(options.getOutput())
+ .withSchema(schema)
+ .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
+ .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
+
+ p.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/DatastoreWordCount.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/DatastoreWordCount.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/DatastoreWordCount.java
new file mode 100644
index 0000000..eaf1e20
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/DatastoreWordCount.java
@@ -0,0 +1,269 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.cookbook;
+
+import static com.google.api.services.datastore.client.DatastoreHelper.getPropertyMap;
+import static com.google.api.services.datastore.client.DatastoreHelper.getString;
+import static com.google.api.services.datastore.client.DatastoreHelper.makeFilter;
+import static com.google.api.services.datastore.client.DatastoreHelper.makeKey;
+import static com.google.api.services.datastore.client.DatastoreHelper.makeValue;
+
+import com.google.api.services.datastore.DatastoreV1.Entity;
+import com.google.api.services.datastore.DatastoreV1.Key;
+import com.google.api.services.datastore.DatastoreV1.Property;
+import com.google.api.services.datastore.DatastoreV1.PropertyFilter;
+import com.google.api.services.datastore.DatastoreV1.Query;
+import com.google.api.services.datastore.DatastoreV1.Value;
+import com.google.cloud.dataflow.examples.WordCount;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.DatastoreIO;
+import com.google.cloud.dataflow.sdk.io.Read;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.options.Validation;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.MapElements;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+
+import java.util.Map;
+import java.util.UUID;
+
+import javax.annotation.Nullable;
+
+/**
+ * A WordCount example using DatastoreIO.
+ *
+ * <p>This example shows how to use DatastoreIO to read from Datastore and
+ * write the results to Cloud Storage. Note that this example will write
+ * data to Datastore, which may incur charges for Datastore operations.
+ *
+ * <p>To run this example, users need to use gcloud to obtain credentials for Datastore:
+ * <pre>{@code
+ * $ gcloud auth login
+ * }</pre>
+ *
+ * <p>To run this pipeline locally, the following options must be provided:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --dataset=YOUR_DATASET_ID
+ * --output=[YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PATH]
+ * }</pre>
+ *
+ * <p>To run this example using Dataflow service, you must additionally
+ * provide either {@literal --stagingLocation} or {@literal --tempLocation}, and
+ * select one of the Dataflow pipeline runners, eg
+ * {@literal --runner=BlockingDataflowPipelineRunner}.
+ *
+ * <p><b>Note:</b> this example creates entities with <i>Ancestor keys</i> to ensure that all
+ * entities created are in the same entity group. Similarly, the query used to read from the Cloud
+ * Datastore uses an <i>Ancestor filter</i>. Ancestors are used to ensure strongly consistent
+ * results in Cloud Datastore. For more information, see the Cloud Datastore documentation on
+ * <a href="https://cloud.google.com/datastore/docs/concepts/structuring_for_strong_consistency">
+ * Structuring Data for Strong Consistency</a>.
+ */
+public class DatastoreWordCount {
+
+ /**
+ * A DoFn that gets the content of an entity (one line in a
+ * Shakespeare play) and converts it to a string.
+ */
+ static class GetContentFn extends DoFn<Entity, String> {
+ @Override
+ public void processElement(ProcessContext c) {
+ Map<String, Value> props = getPropertyMap(c.element());
+ Value value = props.get("content");
+ if (value != null) {
+ c.output(getString(value));
+ }
+ }
+ }
+
+ /**
+ * A helper function to create the ancestor key for all created and queried entities.
+ *
+ * <p>We use ancestor keys and ancestor queries for strong consistency. See
+ * {@link DatastoreWordCount} javadoc for more information.
+ */
+ static Key makeAncestorKey(@Nullable String namespace, String kind) {
+ Key.Builder keyBuilder = makeKey(kind, "root");
+ if (namespace != null) {
+ keyBuilder.getPartitionIdBuilder().setNamespace(namespace);
+ }
+ return keyBuilder.build();
+ }
+
+ /**
+ * A DoFn that creates entity for every line in Shakespeare.
+ */
+ static class CreateEntityFn extends DoFn<String, Entity> {
+ private final String namespace;
+ private final String kind;
+ private final Key ancestorKey;
+
+ CreateEntityFn(String namespace, String kind) {
+ this.namespace = namespace;
+ this.kind = kind;
+
+ // Build the ancestor key for all created entities once, including the namespace.
+ ancestorKey = makeAncestorKey(namespace, kind);
+ }
+
+ public Entity makeEntity(String content) {
+ Entity.Builder entityBuilder = Entity.newBuilder();
+
+ // All created entities have the same ancestor Key.
+ Key.Builder keyBuilder = makeKey(ancestorKey, kind, UUID.randomUUID().toString());
+ // NOTE: Namespace is not inherited between keys created with DatastoreHelper.makeKey, so
+ // we must set the namespace on keyBuilder. TODO: Once partitionId inheritance is added,
+ // we can simplify this code.
+ if (namespace != null) {
+ keyBuilder.getPartitionIdBuilder().setNamespace(namespace);
+ }
+
+ entityBuilder.setKey(keyBuilder.build());
+ entityBuilder.addProperty(Property.newBuilder().setName("content")
+ .setValue(Value.newBuilder().setStringValue(content)));
+ return entityBuilder.build();
+ }
+
+ @Override
+ public void processElement(ProcessContext c) {
+ c.output(makeEntity(c.element()));
+ }
+ }
+
+ /**
+ * Options supported by {@link DatastoreWordCount}.
+ *
+ * <p>Inherits standard configuration options.
+ */
+ public static interface Options extends PipelineOptions {
+ @Description("Path of the file to read from and store to Datastore")
+ @Default.String("gs://dataflow-samples/shakespeare/kinglear.txt")
+ String getInput();
+ void setInput(String value);
+
+ @Description("Path of the file to write to")
+ @Validation.Required
+ String getOutput();
+ void setOutput(String value);
+
+ @Description("Dataset ID to read from datastore")
+ @Validation.Required
+ String getDataset();
+ void setDataset(String value);
+
+ @Description("Dataset entity kind")
+ @Default.String("shakespeare-demo")
+ String getKind();
+ void setKind(String value);
+
+ @Description("Dataset namespace")
+ String getNamespace();
+ void setNamespace(@Nullable String value);
+
+ @Description("Read an existing dataset, do not write first")
+ boolean isReadOnly();
+ void setReadOnly(boolean value);
+
+ @Description("Number of output shards")
+ @Default.Integer(0) // If the system should choose automatically.
+ int getNumShards();
+ void setNumShards(int value);
+ }
+
+ /**
+ * An example that creates a pipeline to populate Datastore from a
+ * text input. The pipeline is created from the given options; no runner is forced here.
+ */
+ public static void writeDataToDatastore(Options options) {
+ Pipeline p = Pipeline.create(options);
+ p.apply(TextIO.Read.named("ReadLines").from(options.getInput()))
+ .apply(ParDo.of(new CreateEntityFn(options.getNamespace(), options.getKind())))
+ .apply(DatastoreIO.writeTo(options.getDataset()));
+
+ p.run();
+ }
+
+ /**
+ * Build a Cloud Datastore ancestor query for the specified {@link Options#getNamespace} and
+ * {@link Options#getKind}.
+ *
+ * <p>We use ancestor keys and ancestor queries for strong consistency. See
+ * {@link DatastoreWordCount} javadoc for more information.
+ *
+ * @see <a href="https://cloud.google.com/datastore/docs/concepts/queries#Datastore_Ancestor_filters">Ancestor filters</a>
+ */
+ static Query makeAncestorKindQuery(Options options) {
+ Query.Builder q = Query.newBuilder();
+ q.addKindBuilder().setName(options.getKind());
+ q.setFilter(makeFilter(
+ "__key__",
+ PropertyFilter.Operator.HAS_ANCESTOR,
+ makeValue(makeAncestorKey(options.getNamespace(), options.getKind()))));
+ return q.build();
+ }
+
+ /**
+ * An example that creates a pipeline to do DatastoreIO.Read from Datastore.
+ */
+ public static void readDataFromDatastore(Options options) {
+ Query query = makeAncestorKindQuery(options);
+
+ // For Datastore sources, the read namespace can be set on the entire query.
+ DatastoreIO.Source source = DatastoreIO.source()
+ .withDataset(options.getDataset())
+ .withQuery(query)
+ .withNamespace(options.getNamespace());
+
+ Pipeline p = Pipeline.create(options);
+ p.apply("ReadShakespeareFromDatastore", Read.from(source))
+ .apply("StringifyEntity", ParDo.of(new GetContentFn()))
+ .apply("CountWords", new WordCount.CountWords())
+ .apply("PrintWordCount", MapElements.via(new WordCount.FormatAsTextFn()))
+ .apply("WriteLines", TextIO.Write.to(options.getOutput())
+ .withNumShards(options.getNumShards()));
+ p.run();
+ }
+
+ /**
+ * An example to demo how to use {@link DatastoreIO}. The runner here is
+ * customizable, which means users could pass either {@code DirectPipelineRunner}
+ * or {@code DataflowPipelineRunner} in the pipeline options.
+ */
+ public static void main(String args[]) {
+ // The options are used in two places, for Dataflow service, and
+ // building DatastoreIO.Read object
+ Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
+
+ if (!options.isReadOnly()) {
+ // First example: write data to Datastore for reading later.
+ //
+ // NOTE: this write does not delete any existing Entities in the Datastore, so if run
+ // multiple times with the same output dataset, there may be duplicate entries. The
+ // Datastore Query tool in the Google Developers Console can be used to inspect or erase all
+ // entries with a particular namespace and/or kind.
+ DatastoreWordCount.writeDataToDatastore(options);
+ }
+
+ // Second example: do parallel read from Datastore.
+ DatastoreWordCount.readDataFromDatastore(options);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/DeDupExample.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/DeDupExample.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/DeDupExample.java
new file mode 100644
index 0000000..9873561
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/DeDupExample.java
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.cookbook;
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.transforms.RemoveDuplicates;
+import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
+
+/**
+ * This example uses as input Shakespeare's plays as plaintext files, and will remove any
+ * duplicate lines across all the files. (The output does not preserve any input order).
+ *
+ * <p>Concepts: the RemoveDuplicates transform, and how to wire transforms together.
+ * Demonstrates {@link com.google.cloud.dataflow.sdk.io.TextIO.Read}/
+ * {@link RemoveDuplicates}/{@link com.google.cloud.dataflow.sdk.io.TextIO.Write}.
+ *
+ * <p>To execute this pipeline locally, specify general pipeline configuration:
+ * --project=YOUR_PROJECT_ID
+ * and a local output file or output prefix on GCS:
+ * --output=[YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PREFIX]
+ *
+ * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=BlockingDataflowPipelineRunner
+ * and an output prefix on GCS:
+ * --output=gs://YOUR_OUTPUT_PREFIX
+ *
+ * <p>The input defaults to {@code gs://dataflow-samples/shakespeare/*} and can be
+ * overridden with {@code --input}.
+ */
+public class DeDupExample {
+
+ /**
+ * Options supported by {@link DeDupExample}.
+ *
+ * <p>Inherits standard configuration options.
+ */
+ private static interface Options extends PipelineOptions {
+ @Description("Path to the directory or GCS prefix containing files to read from")
+ @Default.String("gs://dataflow-samples/shakespeare/*")
+ String getInput();
+ void setInput(String value);
+
+ @Description("Path of the file to write to")
+ @Default.InstanceFactory(OutputFactory.class)
+ String getOutput();
+ void setOutput(String value);
+
+ /** Returns "deduped.txt" resolved against the staging location; throws if none is set. */
+ public static class OutputFactory implements DefaultValueFactory<String> {
+ @Override
+ public String create(PipelineOptions options) {
+ DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
+ if (dataflowOptions.getStagingLocation() != null) {
+ return GcsPath.fromUri(dataflowOptions.getStagingLocation())
+ .resolve("deduped.txt").toString();
+ } else {
+ throw new IllegalArgumentException("Must specify --output or --stagingLocation");
+ }
+ }
+ }
+ }
+
+
+ public static void main(String[] args)
+ throws Exception {
+
+ Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
+ Pipeline p = Pipeline.create(options);
+
+ p.apply(TextIO.Read.named("ReadLines").from(options.getInput()))
+ .apply(RemoveDuplicates.<String>create())
+ .apply(TextIO.Write.named("DedupedShakespeare")
+ .to(options.getOutput()));
+
+ p.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/FilterExamples.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/FilterExamples.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/FilterExamples.java
new file mode 100644
index 0000000..781873a
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/FilterExamples.java
@@ -0,0 +1,266 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.cookbook;
+
+import com.google.api.services.bigquery.model.TableFieldSchema;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.TableSchema;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.options.Validation;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.Mean;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.View;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.PCollectionView;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+/**
+ * This is an example that demonstrates several approaches to filtering, and use of the Mean
+ * transform. It shows how to dynamically set parameters by defining and using new pipeline options,
+ * and how to use a value derived by the pipeline.
+ *
+ * <p>Concepts: The Mean transform; Options configuration; using pipeline-derived data as a side
+ * input; approaches to filtering, selection, and projection.
+ *
+ * <p>The example reads public samples of weather data from BigQuery. It performs a
+ * projection on the data, finds the global mean of the temperature readings, filters on readings
+ * for a single given month, and then outputs only data (for that month) that has a mean temp
+ * smaller than the derived global mean.
+ *
+ * <p>Note: Before running this example, you must create a BigQuery dataset to contain your output
+ * table.
+ *
+ * <p>To execute this pipeline locally, specify general pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * }
+ * </pre>
+ * and the BigQuery table for the output:
+ * <pre>{@code
+ * --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
+ * [--monthFilter=<month_number>]
+ * }
+ * </pre>
+ * where optional parameter {@code --monthFilter} is set to a number 1-12.
+ *
+ * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=BlockingDataflowPipelineRunner
+ * }
+ * </pre>
+ * and the BigQuery table for the output:
+ * <pre>{@code
+ * --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
+ * [--monthFilter=<month_number>]
+ * }
+ * </pre>
+ * where optional parameter {@code --monthFilter} is set to a number 1-12.
+ *
+ * <p>The BigQuery input table defaults to {@code clouddataflow-readonly:samples.weather_stations}
+ * and can be overridden with {@code --input}.
+ */
+public class FilterExamples {
+ // Default to using a 1000 row subset of the public weather station table publicdata:samples.gsod.
+ private static final String WEATHER_SAMPLES_TABLE =
+ "clouddataflow-readonly:samples.weather_stations";
+ static final Logger LOG = Logger.getLogger(FilterExamples.class.getName());
+ static final int MONTH_TO_FILTER = 7;
+
+ /**
+ * Examines each row in the input table. Outputs only the subset of the cells this example
+ * is interested in-- the mean_temp and year, month, and day-- as a bigquery table row.
+ */
+ static class ProjectionFn extends DoFn<TableRow, TableRow> {
+ @Override
+ public void processElement(ProcessContext c){
+ TableRow row = c.element();
+ // Grab year, month, day, mean_temp from the row
+ Integer year = Integer.parseInt((String) row.get("year"));
+ Integer month = Integer.parseInt((String) row.get("month"));
+ Integer day = Integer.parseInt((String) row.get("day"));
+ Double meanTemp = Double.parseDouble(row.get("mean_temp").toString());
+ // Prepares the data for writing to BigQuery by building a TableRow object
+ TableRow outRow = new TableRow()
+ .set("year", year).set("month", month)
+ .set("day", day).set("mean_temp", meanTemp);
+ c.output(outRow);
+ }
+ }
+
+ /**
+ * Implements 'filter' functionality.
+ *
+ * <p>Examines each row in the input table. Outputs only rows from the month
+ * monthFilter, which is passed in as a parameter during construction of this DoFn.
+ */
+ static class FilterSingleMonthDataFn extends DoFn<TableRow, TableRow> {
+ Integer monthFilter;
+
+ public FilterSingleMonthDataFn(Integer monthFilter) {
+ this.monthFilter = monthFilter;
+ }
+
+ @Override
+ public void processElement(ProcessContext c){
+ TableRow row = c.element();
+ Integer month;
+ month = (Integer) row.get("month");
+ if (month.equals(this.monthFilter)) {
+ c.output(row);
+ }
+ }
+ }
+
+ /**
+ * Examines each row (weather reading) in the input table. Output the temperature
+ * reading for that row ('mean_temp').
+ */
+ static class ExtractTempFn extends DoFn<TableRow, Double> {
+ @Override
+ public void processElement(ProcessContext c){
+ TableRow row = c.element();
+ Double meanTemp = Double.parseDouble(row.get("mean_temp").toString());
+ c.output(meanTemp);
+ }
+ }
+
+
+
+ /**
+ * Finds the global mean of the mean_temp for each day/record, and outputs
+ * only data for the given month that has a mean temp smaller than this global mean.
+ */
+ static class BelowGlobalMean
+ extends PTransform<PCollection<TableRow>, PCollection<TableRow>> {
+ Integer monthFilter;
+
+ public BelowGlobalMean(Integer monthFilter) {
+ this.monthFilter = monthFilter;
+ }
+
+
+ @Override
+ public PCollection<TableRow> apply(PCollection<TableRow> rows) {
+
+ // Extract the mean_temp from each row.
+ PCollection<Double> meanTemps = rows.apply(
+ ParDo.of(new ExtractTempFn()));
+
+ // Find the global mean, of all the mean_temp readings in the weather data,
+ // and prepare this singleton PCollectionView for use as a side input.
+ final PCollectionView<Double> globalMeanTemp =
+ meanTemps.apply(Mean.<Double>globally())
+ .apply(View.<Double>asSingleton());
+
+ // Rows filtered to remove all but a single month
+ PCollection<TableRow> monthFilteredRows = rows
+ .apply(ParDo.of(new FilterSingleMonthDataFn(monthFilter)));
+
+ // Then, use the global mean as a side input, to further filter the weather data.
+ // By using a side input to pass in the filtering criteria, we can use a value
+ // that is computed earlier in pipeline execution.
+ // We'll only output readings with temperatures below this mean.
+ PCollection<TableRow> filteredRows = monthFilteredRows
+ .apply(ParDo
+ .named("ParseAndFilter")
+ .withSideInputs(globalMeanTemp)
+ .of(new DoFn<TableRow, TableRow>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ Double meanTemp = Double.parseDouble(c.element().get("mean_temp").toString());
+ Double gTemp = c.sideInput(globalMeanTemp);
+ if (meanTemp < gTemp) {
+ c.output(c.element());
+ }
+ }
+ }));
+
+ return filteredRows;
+ }
+ }
+
+
+ /**
+ * Options supported by {@link FilterExamples}.
+ *
+ * <p>Inherits standard configuration options.
+ */
+ private static interface Options extends PipelineOptions {
+ @Description("Table to read from, specified as "
+ + "<project_id>:<dataset_id>.<table_id>")
+ @Default.String(WEATHER_SAMPLES_TABLE)
+ String getInput();
+ void setInput(String value);
+
+ @Description("Table to write to, specified as "
+ + "<project_id>:<dataset_id>.<table_id>. "
+ + "The dataset_id must already exist")
+ @Validation.Required
+ String getOutput();
+ void setOutput(String value);
+
+ @Description("Numeric value of month to filter on")
+ @Default.Integer(MONTH_TO_FILTER)
+ Integer getMonthFilter();
+ void setMonthFilter(Integer value);
+ }
+
+ /**
+ * Helper method to build the table schema for the output table.
+ */
+ private static TableSchema buildWeatherSchemaProjection() {
+ List<TableFieldSchema> fields = new ArrayList<>();
+ fields.add(new TableFieldSchema().setName("year").setType("INTEGER"));
+ fields.add(new TableFieldSchema().setName("month").setType("INTEGER"));
+ fields.add(new TableFieldSchema().setName("day").setType("INTEGER"));
+ fields.add(new TableFieldSchema().setName("mean_temp").setType("FLOAT"));
+ TableSchema schema = new TableSchema().setFields(fields);
+ return schema;
+ }
+
+ public static void main(String[] args)
+ throws Exception {
+
+ Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
+ Pipeline p = Pipeline.create(options);
+
+ TableSchema schema = buildWeatherSchemaProjection();
+
+ p.apply(BigQueryIO.Read.from(options.getInput()))
+ .apply(ParDo.of(new ProjectionFn()))
+ .apply(new BelowGlobalMean(options.getMonthFilter()))
+ .apply(BigQueryIO.Write
+ .to(options.getOutput())
+ .withSchema(schema)
+ .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
+ .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
+
+ p.run();
+ }
+}
[54/67] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficRoutes.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficRoutes.java b/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficRoutes.java
deleted file mode 100644
index e3e88c2..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficRoutes.java
+++ /dev/null
@@ -1,459 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete;
-
-import com.google.api.services.bigquery.model.TableFieldSchema;
-import com.google.api.services.bigquery.model.TableReference;
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.cloud.dataflow.examples.common.DataflowExampleOptions;
-import com.google.cloud.dataflow.examples.common.DataflowExampleUtils;
-import com.google.cloud.dataflow.examples.common.ExampleBigQueryTableOptions;
-import com.google.cloud.dataflow.examples.common.ExamplePubsubTopicAndSubscriptionOptions;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.PipelineResult;
-import com.google.cloud.dataflow.sdk.coders.AvroCoder;
-import com.google.cloud.dataflow.sdk.coders.DefaultCoder;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.io.PubsubIO;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.windowing.SlidingWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PBegin;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.common.base.Strings;
-import com.google.common.collect.Lists;
-
-import org.apache.avro.reflect.Nullable;
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-import org.joda.time.format.DateTimeFormat;
-import org.joda.time.format.DateTimeFormatter;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Hashtable;
-import java.util.List;
-import java.util.Map;
-
-/**
- * A Dataflow Example that runs in both batch and streaming modes with traffic sensor data.
- * You can configure the running mode by setting {@literal --streaming} to true or false.
- *
- * <p>Concepts: The batch and streaming runners, GroupByKey, sliding windows, and
- * Google Cloud Pub/Sub topic injection.
- *
- * <p>This example analyzes traffic sensor data using SlidingWindows. For each window,
- * it calculates the average speed over the window for some small set of predefined 'routes',
- * and looks for 'slowdowns' in those routes. It writes its results to a BigQuery table.
- *
- * <p>In batch mode, the pipeline reads traffic sensor data from {@literal --inputFile}.
- *
- * <p>In streaming mode, the pipeline reads the data from a Pub/Sub topic.
- * By default, the example will run a separate pipeline to inject the data from the default
- * {@literal --inputFile} to the Pub/Sub {@literal --pubsubTopic}. It will make it available for
- * the streaming pipeline to process. You may override the default {@literal --inputFile} with the
- * file of your choosing. You may also set {@literal --inputFile} to an empty string, which will
- * disable the automatic Pub/Sub injection, and allow you to use a separate tool to control the input
- * to this example. Example code that publishes traffic sensor data to a Pub/Sub topic
- * is provided in
- * <a href="https://github.com/GoogleCloudPlatform/cloud-pubsub-samples-python/tree/master/gce-cmdline-publisher">the gce-cmdline-publisher sample</a>.
- *
- * <p>The example is configured to use the default Pub/Sub topic and the default BigQuery table
- * from the example common package (there are no defaults for a general Dataflow pipeline).
- * You can override them by using the {@literal --pubsubTopic}, {@literal --bigQueryDataset}, and
- * {@literal --bigQueryTable} options. If the Pub/Sub topic or the BigQuery table do not exist,
- * the example will try to create them.
- *
- * <p>The example will try to cancel the pipelines on the signal to terminate the process (CTRL-C)
- * and then exit.
- */
-
-public class TrafficRoutes {
-
- private static final String PUBSUB_TIMESTAMP_LABEL_KEY = "timestamp_ms";
- private static final Integer VALID_INPUTS = 4999;
-
- // Instantiate some small predefined San Diego routes to analyze
- static Map<String, String> sdStations = buildStationInfo();
- static final int WINDOW_DURATION = 3; // Default sliding window duration in minutes
- static final int WINDOW_SLIDE_EVERY = 1; // Default window 'slide every' setting in minutes
-
- /**
-  * One average-speed reading for a station, tagged with the timestamp (in milliseconds) it
-  * was taken at. Instances order themselves by that timestamp.
-  */
- @DefaultCoder(AvroCoder.class)
- static class StationSpeed implements Comparable<StationSpeed> {
-   @Nullable String stationId;
-   @Nullable Double avgSpeed;
-   @Nullable Long timestamp;
-
-   /** Creates a reading with every field left null. */
-   public StationSpeed() {}
-
-   /** Creates a reading for the given station, average speed and timestamp. */
-   public StationSpeed(String stationId, Double avgSpeed, Long timestamp) {
-     this.stationId = stationId;
-     this.avgSpeed = avgSpeed;
-     this.timestamp = timestamp;
-   }
-
-   public String getStationId() {
-     return stationId;
-   }
-
-   public Double getAvgSpeed() {
-     return avgSpeed;
-   }
-
-   /**
-    * Orders readings by timestamp, earliest first. Throws {@link NullPointerException} if
-    * either timestamp is null.
-    */
-   @Override
-   public int compareTo(StationSpeed other) {
-     return timestamp.compareTo(other.timestamp);
-   }
- }
-
- /**
-  * Per-route result: the route name, its average speed, and whether the route was judged
-  * to be in a slowdown.
-  */
- @DefaultCoder(AvroCoder.class)
- static class RouteInfo {
-   @Nullable String route;
-   @Nullable Double avgSpeed;
-   @Nullable Boolean slowdownEvent;
-
-   /** Creates a result with every field left null. */
-   public RouteInfo() {}
-
-   /** Creates a result for the given route, average speed and slowdown flag. */
-   public RouteInfo(String route, Double avgSpeed, Boolean slowdownEvent) {
-     this.route = route;
-     this.avgSpeed = avgSpeed;
-     this.slowdownEvent = slowdownEvent;
-   }
-
-   public String getRoute() {
-     return route;
-   }
-
-   public Double getAvgSpeed() {
-     return avgSpeed;
-   }
-
-   public Boolean getSlowdownEvent() {
-     return slowdownEvent;
-   }
- }
-
- /**
-  * Re-emits each input line with its element timestamp set from the line's first
-  * comma-separated field, parsed as {@code MM/dd/yyyy HH:mm:ss}. Lines whose timestamp is
-  * missing or rejected are dropped.
-  */
- static class ExtractTimestamps extends DoFn<String, String> {
-   private static final DateTimeFormatter dateTimeFormat =
-       DateTimeFormat.forPattern("MM/dd/yyyy HH:mm:ss");
-
-   @Override
-   public void processElement(DoFn<String, String>.ProcessContext c) throws Exception {
-     String line = c.element();
-     String rawTimestamp = tryParseTimestamp(line.split(","));
-     if (rawTimestamp == null) {
-       return;
-     }
-     try {
-       long epochMillis = dateTimeFormat.parseMillis(rawTimestamp);
-       // The output call stays inside the try so that an IllegalArgumentException from
-       // either the parse or the output drops the line.
-       c.outputWithTimestamp(line, new Instant(epochMillis));
-     } catch (IllegalArgumentException e) {
-       // Skip the invalid input.
-     }
-   }
- }
-
- /**
-  * Keeps only 'main line' ("ML") readings from stations listed in {@code sdStations} and
-  * emits each one as a {@link StationSpeed} keyed by the station's route name.
-  */
- static class ExtractStationSpeedFn extends DoFn<String, KV<String, StationSpeed>> {
-
-   @Override
-   public void processElement(ProcessContext c) {
-     String[] fields = c.element().split(",");
-     // For this analysis, use only 'main line' station types.
-     if (!"ML".equals(tryParseStationType(fields))) {
-       return;
-     }
-     Double avgSpeed = tryParseAvgSpeed(fields);
-     String stationId = tryParseStationId(fields);
-     // For this simple example, filter out everything but some hardwired routes.
-     if (avgSpeed == null || stationId == null || !sdStations.containsKey(stationId)) {
-       return;
-     }
-     StationSpeed reading = new StationSpeed(stationId, avgSpeed, c.timestamp().getMillis());
-     // The output key is the route name that 'sdStations' maps this station to.
-     c.output(KV.of(sdStations.get(stationId), reading));
-   }
- }
-
- /**
-  * Summarizes one route's readings for a window: computes the mean of the non-null speeds and
-  * flags a 'slowdown' when, comparing each station's reading with that station's previous
-  * reading (in timestamp order), the count of readings that did not get faster is at least
-  * twice the count of readings that did. Emits nothing when the window has no non-null speed.
-  * Note: these calculations are for example purposes only, and are unrealistic and oversimplified.
-  */
- static class GatherStats
-     extends DoFn<KV<String, Iterable<StationSpeed>>, KV<String, RouteInfo>> {
-   @Override
-   public void processElement(ProcessContext c) throws IOException {
-     String route = c.element().getKey();
-     List<StationSpeed> readings = Lists.newArrayList(c.element().getValue());
-     // StationSpeeds sort by embedded timestamp.
-     Collections.sort(readings);
-
-     Map<String, Double> latestSpeedByStation = new HashMap<>();
-     double total = 0.0;
-     int counted = 0;
-     int gotFaster = 0;
-     int didNotGetFaster = 0;
-     for (StationSpeed reading : readings) {
-       Double speed = reading.getAvgSpeed();
-       if (speed == null) {
-         // Null speeds contribute to neither the average nor the comparison.
-         continue;
-       }
-       total += speed;
-       counted++;
-       // put() hands back the station's previous speed, or null on its first reading.
-       Double previous = latestSpeedByStation.put(reading.getStationId(), speed);
-       if (previous != null) {
-         if (previous < speed) {
-           gotFaster++;
-         } else {
-           didNotGetFaster++;
-         }
-       }
-     }
-     if (counted == 0) {
-       // No average to compute.
-       return;
-     }
-     boolean slowdownEvent = didNotGetFaster >= 2 * gotFaster;
-     c.output(KV.of(route, new RouteInfo(route, total / counted, slowdownEvent)));
-   }
- }
-
- /**
- * Format the results of the slowdown calculations to a TableRow, to save to BigQuery.
- */
- static class FormatStatsFn extends DoFn<KV<String, RouteInfo>, TableRow> {
- @Override
- public void processElement(ProcessContext c) {
- RouteInfo routeInfo = c.element().getValue();
- TableRow row = new TableRow()
- .set("avg_speed", routeInfo.getAvgSpeed())
- .set("slowdown_event", routeInfo.getSlowdownEvent())
- .set("route", c.element().getKey())
- .set("window_timestamp", c.timestamp().toString());
- c.output(row);
- }
-
- /**
- * Defines the BigQuery schema used for the output.
- */
- static TableSchema getSchema() {
- List<TableFieldSchema> fields = new ArrayList<>();
- fields.add(new TableFieldSchema().setName("route").setType("STRING"));
- fields.add(new TableFieldSchema().setName("avg_speed").setType("FLOAT"));
- fields.add(new TableFieldSchema().setName("slowdown_event").setType("BOOLEAN"));
- fields.add(new TableFieldSchema().setName("window_timestamp").setType("TIMESTAMP"));
- TableSchema schema = new TableSchema().setFields(fields);
- return schema;
- }
- }
-
- /**
-  * Composite transform: groups station readings by route, runs {@link GatherStats} over each
-  * route's readings, and formats the results as BigQuery rows with {@link FormatStatsFn}.
-  */
- static class TrackSpeed extends
-     PTransform<PCollection<KV<String, StationSpeed>>, PCollection<TableRow>> {
-   @Override
-   public PCollection<TableRow> apply(PCollection<KV<String, StationSpeed>> stationSpeed) {
-     return stationSpeed
-         // Collect all station readings for a given route.
-         .apply(GroupByKey.<String, StationSpeed>create())
-         // Analyze 'slowdown' over the route readings.
-         .apply(ParDo.of(new GatherStats()))
-         // Format the results for writing to BigQuery.
-         .apply(ParDo.of(new FormatStatsFn()));
-   }
- }
-
- /**
-  * Root transform that reads lines of text from {@code inputFile} and passes them through
-  * {@link ExtractTimestamps}.
-  */
- static class ReadFileAndExtractTimestamps extends PTransform<PBegin, PCollection<String>> {
-   private final String inputFile;
-
-   public ReadFileAndExtractTimestamps(String inputFile) {
-     this.inputFile = inputFile;
-   }
-
-   @Override
-   public PCollection<String> apply(PBegin begin) {
-     PCollection<String> lines = begin.apply(TextIO.Read.from(inputFile));
-     return lines.apply(ParDo.of(new ExtractTimestamps()));
-   }
- }
-
- /**
- * Options supported by {@link TrafficRoutes}.
- *
- * <p>Inherits standard configuration options, and adds the input file, the sliding-window
- * duration and period (both in minutes), and a flag selecting unbounded input.
- */
- private interface TrafficRoutesOptions extends DataflowExampleOptions,
- ExamplePubsubTopicAndSubscriptionOptions, ExampleBigQueryTableOptions {
- // File handed to the injector pipeline in main(); main() skips injection when this is
- // null or empty.
- @Description("Input file to inject to Pub/Sub topic")
- @Default.String("gs://dataflow-samples/traffic_sensor/"
- + "Freeways-5Minaa2010-01-01_to_2010-02-15_test2.csv")
- String getInputFile();
- void setInputFile(String value);
-
- // Sliding window length in minutes; defaults to WINDOW_DURATION.
- @Description("Numeric value of sliding window duration, in minutes")
- @Default.Integer(WINDOW_DURATION)
- Integer getWindowDuration();
- void setWindowDuration(Integer value);
-
- // How often a new window starts, in minutes; defaults to WINDOW_SLIDE_EVERY.
- @Description("Numeric value of window 'slide every' setting, in minutes")
- @Default.Integer(WINDOW_SLIDE_EVERY)
- Integer getWindowSlideEvery();
- void setWindowSlideEvery(Integer value);
-
- // When false (the default), main() caps the Pub/Sub read at VALID_INPUTS records.
- @Description("Whether to run the pipeline with unbounded input")
- @Default.Boolean(false)
- boolean isUnbounded();
- void setUnbounded(boolean value);
- }
-
- /**
- * Sets up and starts the pipeline, in unbounded or bounded mode depending on
- * {@code isUnbounded()}.
- *
- * <p>Both modes read from the Pub/Sub subscription; the bounded mode caps the read at
- * {@code VALID_INPUTS} records. Readings are windowed into sliding windows, analyzed per
- * route by {@link TrackSpeed}, and written to the configured BigQuery table. If both an input
- * file and a Pub/Sub topic are configured, an injector pipeline is also started to publish the
- * file's contents to the topic.
- *
- * @param args command-line arguments, parsed into {@link TrafficRoutesOptions}
- * @throws IOException if there is a problem setting up resources
- */
- public static void main(String[] args) throws IOException {
- TrafficRoutesOptions options = PipelineOptionsFactory.fromArgs(args)
- .withValidation()
- .as(TrafficRoutesOptions.class);
-
- options.setBigQuerySchema(FormatStatsFn.getSchema());
- // Using DataflowExampleUtils to set up required resources.
- DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options, options.isUnbounded());
-
- Pipeline pipeline = Pipeline.create(options);
- // Destination table, assembled from the project, dataset and table options.
- TableReference tableRef = new TableReference();
- tableRef.setProjectId(options.getProject());
- tableRef.setDatasetId(options.getBigQueryDataset());
- tableRef.setTableId(options.getBigQueryTable());
-
- PCollection<String> input;
- if (options.isUnbounded()) {
- // Read unbounded PubSubIO.
- input = pipeline.apply(PubsubIO.Read
- .timestampLabel(PUBSUB_TIMESTAMP_LABEL_KEY)
- .subscription(options.getPubsubSubscription()));
- } else {
- // Read bounded PubSubIO: the same subscription, limited to VALID_INPUTS records.
- input = pipeline.apply(PubsubIO.Read
- .timestampLabel(PUBSUB_TIMESTAMP_LABEL_KEY)
- .subscription(options.getPubsubSubscription()).maxNumRecords(VALID_INPUTS));
-
- // To read bounded TextIO files, use:
- // input = pipeline.apply(TextIO.Read.from(options.getInputFile()))
- // .apply(ParDo.of(new ExtractTimestamps()));
- }
- input
- // row... => <station route, station speed> ...
- .apply(ParDo.of(new ExtractStationSpeedFn()))
- // map the incoming data stream into sliding windows.
- // The default window duration values work well if you're running the accompanying Pub/Sub
- // generator script without the --replay flag, so that there are no simulated pauses in
- // the sensor data publication. You may want to adjust the values otherwise.
- .apply(Window.<KV<String, StationSpeed>>into(SlidingWindows.of(
- Duration.standardMinutes(options.getWindowDuration())).
- every(Duration.standardMinutes(options.getWindowSlideEvery()))))
- .apply(new TrackSpeed())
- .apply(BigQueryIO.Write.to(tableRef)
- .withSchema(FormatStatsFn.getSchema()));
-
- // Inject the data into the Pub/Sub topic with a Dataflow batch pipeline.
- if (!Strings.isNullOrEmpty(options.getInputFile())
- && !Strings.isNullOrEmpty(options.getPubsubTopic())) {
- dataflowUtils.runInjectorPipeline(
- new ReadFileAndExtractTimestamps(options.getInputFile()),
- options.getPubsubTopic(),
- PUBSUB_TIMESTAMP_LABEL_KEY);
- }
-
- // Run the pipeline.
- PipelineResult result = pipeline.run();
-
- // dataflowUtils will try to cancel the pipeline and the injector before the program exits.
- dataflowUtils.waitToFinish(result);
- }
-
- /**
-  * Parses the average speed from column 9 of a split input line.
-  *
-  * @param inputItems the comma-separated fields of one input line
-  * @return the parsed speed, or {@code null} when {@code inputItems} is null, the column is
-  *     absent, or its text is not a valid double
-  */
- private static Double tryParseAvgSpeed(String[] inputItems) {
-   // Explicit null checks replace the former catch of NullPointerException; the results are
-   // the same, but an unrelated NPE can no longer be silently swallowed.
-   if (inputItems == null) {
-     return null;
-   }
-   String rawSpeed = tryParseString(inputItems, 9);
-   if (rawSpeed == null) {
-     return null;
-   }
-   try {
-     return Double.parseDouble(rawSpeed);
-   } catch (NumberFormatException e) {
-     return null;
-   }
- }
-
- /** Returns the station type, taken from column 4 of the split input line via tryParseString. */
- private static String tryParseStationType(String[] inputItems) {
- return tryParseString(inputItems, 4);
- }
-
- /** Returns the station id, taken from column 1 of the split input line via tryParseString. */
- private static String tryParseStationId(String[] inputItems) {
- return tryParseString(inputItems, 1);
- }
-
- /** Returns the timestamp text, taken from column 0 of the split input line via tryParseString. */
- private static String tryParseTimestamp(String[] inputItems) {
- return tryParseString(inputItems, 0);
- }
-
- /**
-  * Returns the field at {@code index}, or {@code null} when the line has no such field.
-  *
-  * @param inputItems the comma-separated fields of one input line
-  * @param index zero-based position of the wanted field
-  * @return {@code inputItems[index]}, or {@code null} if {@code index} is out of range
-  */
- private static String tryParseString(String[] inputItems, int index) {
-   // A valid index is strictly less than the length; the former ">=" test let
-   // index == length through and threw ArrayIndexOutOfBoundsException.
-   return index >= 0 && index < inputItems.length ? inputItems[index] : null;
- }
-
- /**
-  * Builds the hard-wired table of San Diego sensor station ids and the 'route' each one
-  * belongs to.
-  */
- private static Map<String, String> buildStationInfo() {
-   Map<String, String> routeByStation = new Hashtable<String, String>();
-   // from freeway 805 S
-   routeByStation.put("1108413", "SDRoute1");
-   // from freeway 78 E
-   for (String stationId : new String[] {"1108699", "1108702"}) {
-     routeByStation.put(stationId, "SDRoute2");
-   }
-   return routeByStation;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoes.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoes.java b/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoes.java
deleted file mode 100644
index 503bcad..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoes.java
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.cookbook;
-
-import com.google.api.services.bigquery.model.TableFieldSchema;
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.options.Validation;
-import com.google.cloud.dataflow.sdk.transforms.Count;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * An example that reads the public samples of weather data from BigQuery, counts the number of
- * tornadoes that occur in each month, and writes the results to BigQuery.
- *
- * <p>Concepts: Reading/writing BigQuery; counting a PCollection; user-defined PTransforms
- *
- * <p>Note: Before running this example, you must create a BigQuery dataset to contain your output
- * table.
- *
- * <p>To execute this pipeline locally, specify general pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * }
- * </pre>
- * and the BigQuery table for the output, with the form
- * <pre>{@code
- * --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
- * }</pre>
- *
- * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowPipelineRunner
- * }
- * </pre>
- * and the BigQuery table for the output:
- * <pre>{@code
- * --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
- * }</pre>
- *
- * <p>The BigQuery input table defaults to {@code clouddataflow-readonly:samples.weather_stations}
- * and can be overridden with {@code --input}.
- */
-public class BigQueryTornadoes {
- // Default to using a 1000 row subset of the public weather station table publicdata:samples.gsod.
- private static final String WEATHER_SAMPLES_TABLE =
- "clouddataflow-readonly:samples.weather_stations";
-
- /**
-  * Examines each row in the input table. If a tornado was recorded
-  * in that sample, the month in which it occurred is output. Rows whose
-  * {@code tornado} field is absent or null are treated as having no tornado.
-  */
- static class ExtractTornadoesFn extends DoFn<TableRow, Integer> {
-   @Override
-   public void processElement(ProcessContext c) {
-     TableRow row = c.element();
-     // Check for null before unboxing: a row without a tornado flag would otherwise throw
-     // NullPointerException.
-     Boolean tornado = (Boolean) row.get("tornado");
-     if (tornado != null && tornado) {
-       c.output(Integer.parseInt((String) row.get("month")));
-     }
-   }
- }
-
- /**
-  * Turns each (month, count) pair into a BigQuery {@link TableRow} with a {@code month}
-  * field and a {@code tornado_count} field.
-  */
- static class FormatCountsFn extends DoFn<KV<Integer, Long>, TableRow> {
-   @Override
-   public void processElement(ProcessContext c) {
-     KV<Integer, Long> monthAndCount = c.element();
-     TableRow row = new TableRow();
-     row.set("month", monthAndCount.getKey());
-     row.set("tornado_count", monthAndCount.getValue());
-     c.output(row);
-   }
- }
-
- /**
-  * Takes rows from a table and generates a table of counts.
-  *
-  * <p>The input schema is described by
-  * https://developers.google.com/bigquery/docs/dataset-gsod .
-  * The output holds one row per month with the number of tornadoes found in it:
-  * <ul>
-  *   <li>month: integer</li>
-  *   <li>tornado_count: integer</li>
-  * </ul>
-  */
- static class CountTornadoes
-     extends PTransform<PCollection<TableRow>, PCollection<TableRow>> {
-   @Override
-   public PCollection<TableRow> apply(PCollection<TableRow> rows) {
-     return rows
-         // row... => month...
-         .apply(ParDo.of(new ExtractTornadoesFn()))
-         // month... => <month,count>...
-         .apply(Count.<Integer>perElement())
-         // <month,count>... => row...
-         .apply(ParDo.of(new FormatCountsFn()));
-   }
- }
-
- /**
- * Options supported by {@link BigQueryTornadoes}.
- *
- * <p>Inherits standard configuration options, and adds the input table (optional, with a
- * default) and the output table (required).
- */
- private static interface Options extends PipelineOptions {
- // Source table; defaults to WEATHER_SAMPLES_TABLE.
- @Description("Table to read from, specified as "
- + "<project_id>:<dataset_id>.<table_id>")
- @Default.String(WEATHER_SAMPLES_TABLE)
- String getInput();
- void setInput(String value);
-
- // Destination table; marked required, so it has no default.
- @Description("BigQuery table to write to, specified as "
- + "<project_id>:<dataset_id>.<table_id>. The dataset must already exist.")
- @Validation.Required
- String getOutput();
- void setOutput(String value);
- }
-
- /**
-  * Reads the input table, counts tornadoes per month with {@link CountTornadoes}, and writes
-  * the counts to the output table (created if needed, truncated if it already exists).
-  */
- public static void main(String[] args) {
-   Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
-
-   Pipeline pipeline = Pipeline.create(options);
-
-   // Build the table schema for the output table: both columns are integers.
-   List<TableFieldSchema> columns = new ArrayList<>();
-   for (String columnName : new String[] {"month", "tornado_count"}) {
-     columns.add(new TableFieldSchema().setName(columnName).setType("INTEGER"));
-   }
-   TableSchema schema = new TableSchema().setFields(columns);
-
-   PCollection<TableRow> tornadoCounts = pipeline
-       .apply(BigQueryIO.Read.from(options.getInput()))
-       .apply(new CountTornadoes());
-
-   tornadoCounts.apply(BigQueryIO.Write
-       .to(options.getOutput())
-       .withSchema(schema)
-       .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
-       .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
-
-   pipeline.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamples.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamples.java b/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamples.java
deleted file mode 100644
index 9540dd4..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamples.java
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.cookbook;
-
-import com.google.api.services.bigquery.model.TableFieldSchema;
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.options.Validation;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.Combine;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
-import com.google.cloud.dataflow.sdk.transforms.Sum;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * An example that reads the public 'Shakespeare' data, and for each word in
- * the dataset that is over a given length, generates a string containing the
- * list of play names in which that word appears, and saves this information
- * to a bigquery table.
- *
- * <p>Concepts: the Combine.perKey transform, which lets you combine the values in a
- * key-grouped Collection, and how to use an Aggregator to track information in the
- * Monitoring UI.
- *
- * <p>Note: Before running this example, you must create a BigQuery dataset to contain your output
- * table.
- *
- * <p>To execute this pipeline locally, specify general pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * }
- * </pre>
- * and the BigQuery table for the output:
- * <pre>{@code
- * --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
- * }</pre>
- *
- * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://<STAGING DIRECTORY>
- * --runner=BlockingDataflowPipelineRunner
- * }
- * </pre>
- * and the BigQuery table for the output:
- * <pre>{@code
- * --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
- * }</pre>
- *
- * <p>The BigQuery input table defaults to {@code publicdata:samples.shakespeare} and can
- * be overridden with {@code --input}.
- */
-public class CombinePerKeyExamples {
- // Use the shakespeare public BigQuery sample
- private static final String SHAKESPEARE_TABLE =
- "publicdata:samples.shakespeare";
- // We'll track words >= this word length across all plays in the table.
- private static final int MIN_WORD_LENGTH = 9;
-
- /**
-  * Examines each row in the input table. Words of at least MIN_WORD_LENGTH characters are
-  * output as (word, play_name); shorter words are only counted in the
-  * {@code smallerWords} aggregator.
-  */
- static class ExtractLargeWordsFn extends DoFn<TableRow, KV<String, String>> {
-   private final Aggregator<Long, Long> smallerWords =
-       createAggregator("smallerWords", new Sum.SumLongFn());
-
-   @Override
-   public void processElement(ProcessContext c) {
-     TableRow row = c.element();
-     String playName = (String) row.get("corpus");
-     String word = (String) row.get("word");
-     if (word.length() < MIN_WORD_LENGTH) {
-       // Track how many smaller words we're not including. This information will be
-       // visible in the Monitoring UI.
-       smallerWords.addValue(1L);
-       return;
-     }
-     c.output(KV.of(word, playName));
-   }
- }
-
-
- /**
-  * Turns each (word, plays) pair into a BigQuery {@link TableRow} with a {@code word} field
-  * and an {@code all_plays} field.
-  */
- static class FormatShakespeareOutputFn extends DoFn<KV<String, String>, TableRow> {
-   @Override
-   public void processElement(ProcessContext c) {
-     KV<String, String> wordAndPlays = c.element();
-     TableRow row = new TableRow();
-     row.set("word", wordAndPlays.getKey());
-     row.set("all_plays", wordAndPlays.getValue());
-     c.output(row);
-   }
- }
-
- /**
-  * Reads the public 'Shakespeare' data, and for each word in the dataset
-  * over a given length, generates a string containing the list of play names
-  * in which that word appears. The per-word list is built with the Combine.perKey
-  * transform and the ConcatWords combine function.
-  *
-  * <p>Combine.perKey is similar to a GroupByKey followed by a ParDo, but
-  * has more restricted semantics that allow it to be executed more
-  * efficiently. These records are then formatted as BQ table rows.
-  */
- static class PlaysForWord
-     extends PTransform<PCollection<TableRow>, PCollection<TableRow>> {
-   @Override
-   public PCollection<TableRow> apply(PCollection<TableRow> rows) {
-     return rows
-         // row... => <word, play_name> ...
-         .apply(ParDo.of(new ExtractLargeWordsFn()))
-         // word, play_name => word, all_plays ...
-         .apply(Combine.<String, String>perKey(new ConcatWords()))
-         // <word, all_plays>... => row...
-         .apply(ParDo.of(new FormatShakespeareOutputFn()));
-   }
- }
-
- /**
-  * A 'combine function' used with the Combine.perKey transform. Joins all non-empty input
-  * items into one comma-separated string, so for a given word it yields the list of
-  * Shakespeare plays in which that word has appeared.
-  */
- public static class ConcatWords implements SerializableFunction<Iterable<String>, String> {
-   @Override
-   public String apply(Iterable<String> input) {
-     StringBuilder joined = new StringBuilder();
-     for (String item : input) {
-       if (item.isEmpty()) {
-         continue;
-       }
-       // Every item after the first is preceded by a separator.
-       if (joined.length() > 0) {
-         joined.append(",");
-       }
-       joined.append(item);
-     }
-     return joined.toString();
-   }
- }
-
- /**
- * Options supported by {@link CombinePerKeyExamples}.
- *
- * <p>Inherits standard configuration options, and adds the input table (optional, with a
- * default) and the output table (required).
- */
- private static interface Options extends PipelineOptions {
- // Source table; defaults to SHAKESPEARE_TABLE.
- @Description("Table to read from, specified as "
- + "<project_id>:<dataset_id>.<table_id>")
- @Default.String(SHAKESPEARE_TABLE)
- String getInput();
- void setInput(String value);
-
- // Destination table; marked required, so it has no default.
- @Description("Table to write to, specified as "
- + "<project_id>:<dataset_id>.<table_id>. "
- + "The dataset_id must already exist")
- @Validation.Required
- String getOutput();
- void setOutput(String value);
- }
-
- /**
-  * Reads the input table, builds the per-word play list with {@link PlaysForWord}, and writes
-  * the result to the output table (created if needed, truncated if it already exists).
-  */
- public static void main(String[] args)
-     throws Exception {
-
-   Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
-   Pipeline pipeline = Pipeline.create(options);
-
-   // Build the table schema for the output table: both columns are strings.
-   List<TableFieldSchema> columns = new ArrayList<>();
-   for (String columnName : new String[] {"word", "all_plays"}) {
-     columns.add(new TableFieldSchema().setName(columnName).setType("STRING"));
-   }
-   TableSchema schema = new TableSchema().setFields(columns);
-
-   PCollection<TableRow> playsPerWord = pipeline
-       .apply(BigQueryIO.Read.from(options.getInput()))
-       .apply(new PlaysForWord());
-
-   playsPerWord.apply(BigQueryIO.Write
-       .to(options.getOutput())
-       .withSchema(schema)
-       .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
-       .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
-
-   pipeline.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/DatastoreWordCount.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/DatastoreWordCount.java b/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/DatastoreWordCount.java
deleted file mode 100644
index eaf1e20..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/DatastoreWordCount.java
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.cookbook;
-
-import static com.google.api.services.datastore.client.DatastoreHelper.getPropertyMap;
-import static com.google.api.services.datastore.client.DatastoreHelper.getString;
-import static com.google.api.services.datastore.client.DatastoreHelper.makeFilter;
-import static com.google.api.services.datastore.client.DatastoreHelper.makeKey;
-import static com.google.api.services.datastore.client.DatastoreHelper.makeValue;
-
-import com.google.api.services.datastore.DatastoreV1.Entity;
-import com.google.api.services.datastore.DatastoreV1.Key;
-import com.google.api.services.datastore.DatastoreV1.Property;
-import com.google.api.services.datastore.DatastoreV1.PropertyFilter;
-import com.google.api.services.datastore.DatastoreV1.Query;
-import com.google.api.services.datastore.DatastoreV1.Value;
-import com.google.cloud.dataflow.examples.WordCount;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.DatastoreIO;
-import com.google.cloud.dataflow.sdk.io.Read;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.options.Validation;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.MapElements;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-
-import java.util.Map;
-import java.util.UUID;
-
-import javax.annotation.Nullable;
-
-/**
- * A WordCount example using DatastoreIO.
- *
- * <p>This example shows how to use DatastoreIO to read from Datastore and
- * write the results to Cloud Storage. Note that this example will write
- * data to Datastore, which may incur charges for Datastore operations.
- *
- * <p>To run this example, users need to use gcloud to get credentials for Datastore:
- * <pre>{@code
- * $ gcloud auth login
- * }</pre>
- *
- * <p>To run this pipeline locally, the following options must be provided:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --dataset=YOUR_DATASET_ID
- * --output=[YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PATH]
- * }</pre>
- *
- * <p>To run this example using Dataflow service, you must additionally
- * provide either {@literal --stagingLocation} or {@literal --tempLocation}, and
- * select one of the Dataflow pipeline runners, e.g.
- * {@literal --runner=BlockingDataflowPipelineRunner}.
- *
- * <p><b>Note:</b> this example creates entities with <i>Ancestor keys</i> to ensure that all
- * entities created are in the same entity group. Similarly, the query used to read from the Cloud
- * Datastore uses an <i>Ancestor filter</i>. Ancestors are used to ensure strongly consistent
- * results in Cloud Datastore. For more information, see the Cloud Datastore documentation on
- * <a href="https://cloud.google.com/datastore/docs/concepts/structuring_for_strong_consistency">
- * Structuring Data for Strong Consistency</a>.
- */
-public class DatastoreWordCount {
-
- /**
- * A DoFn that gets the content of an entity (one line in a
- * Shakespeare play) and converts it to a string.
- */
- static class GetContentFn extends DoFn<Entity, String> {
- @Override
- public void processElement(ProcessContext c) {
- Map<String, Value> props = getPropertyMap(c.element());
- Value value = props.get("content");
- if (value != null) {
- c.output(getString(value));
- }
- }
- }
-
- /**
- * A helper function to create the ancestor key for all created and queried entities.
- *
- * <p>We use ancestor keys and ancestor queries for strong consistency. See
- * {@link DatastoreWordCount} javadoc for more information.
- */
- static Key makeAncestorKey(@Nullable String namespace, String kind) {
- Key.Builder keyBuilder = makeKey(kind, "root");
- if (namespace != null) {
- keyBuilder.getPartitionIdBuilder().setNamespace(namespace);
- }
- return keyBuilder.build();
- }
-
- /**
- * A DoFn that creates an entity for every line in Shakespeare.
- */
- static class CreateEntityFn extends DoFn<String, Entity> {
- private final String namespace;
- private final String kind;
- private final Key ancestorKey;
-
- CreateEntityFn(String namespace, String kind) {
- this.namespace = namespace;
- this.kind = kind;
-
- // Build the ancestor key for all created entities once, including the namespace.
- ancestorKey = makeAncestorKey(namespace, kind);
- }
-
- public Entity makeEntity(String content) {
- Entity.Builder entityBuilder = Entity.newBuilder();
-
- // All created entities have the same ancestor Key.
- Key.Builder keyBuilder = makeKey(ancestorKey, kind, UUID.randomUUID().toString());
- // NOTE: Namespace is not inherited between keys created with DatastoreHelper.makeKey, so
- // we must set the namespace on keyBuilder. TODO: Once partitionId inheritance is added,
- // we can simplify this code.
- if (namespace != null) {
- keyBuilder.getPartitionIdBuilder().setNamespace(namespace);
- }
-
- entityBuilder.setKey(keyBuilder.build());
- entityBuilder.addProperty(Property.newBuilder().setName("content")
- .setValue(Value.newBuilder().setStringValue(content)));
- return entityBuilder.build();
- }
-
- @Override
- public void processElement(ProcessContext c) {
- c.output(makeEntity(c.element()));
- }
- }
-
- /**
- * Options supported by {@link DatastoreWordCount}.
- *
- * <p>Inherits standard configuration options.
- */
- public static interface Options extends PipelineOptions {
- @Description("Path of the file to read from and store to Datastore")
- @Default.String("gs://dataflow-samples/shakespeare/kinglear.txt")
- String getInput();
- void setInput(String value);
-
- @Description("Path of the file to write to")
- @Validation.Required
- String getOutput();
- void setOutput(String value);
-
- @Description("Dataset ID to read from datastore")
- @Validation.Required
- String getDataset();
- void setDataset(String value);
-
- @Description("Dataset entity kind")
- @Default.String("shakespeare-demo")
- String getKind();
- void setKind(String value);
-
- @Description("Dataset namespace")
- String getNamespace();
- void setNamespace(@Nullable String value);
-
- @Description("Read an existing dataset, do not write first")
- boolean isReadOnly();
- void setReadOnly(boolean value);
-
- @Description("Number of output shards")
- @Default.Integer(0) // If the system should choose automatically.
- int getNumShards();
- void setNumShards(int value);
- }
-
- /**
- * An example that creates a pipeline to populate Datastore (via DatastoreIO) from a
- * text input. The pipeline is created from, and runs with, the supplied options.
- */
- public static void writeDataToDatastore(Options options) {
- Pipeline p = Pipeline.create(options);
- p.apply(TextIO.Read.named("ReadLines").from(options.getInput()))
- .apply(ParDo.of(new CreateEntityFn(options.getNamespace(), options.getKind())))
- .apply(DatastoreIO.writeTo(options.getDataset()));
-
- p.run();
- }
-
- /**
- * Build a Cloud Datastore ancestor query for the specified {@link Options#getNamespace} and
- * {@link Options#getKind}.
- *
- * <p>We use ancestor keys and ancestor queries for strong consistency. See
- * {@link DatastoreWordCount} javadoc for more information.
- *
- * @see <a href="https://cloud.google.com/datastore/docs/concepts/queries#Datastore_Ancestor_filters">Ancestor filters</a>
- */
- static Query makeAncestorKindQuery(Options options) {
- Query.Builder q = Query.newBuilder();
- q.addKindBuilder().setName(options.getKind());
- q.setFilter(makeFilter(
- "__key__",
- PropertyFilter.Operator.HAS_ANCESTOR,
- makeValue(makeAncestorKey(options.getNamespace(), options.getKind()))));
- return q.build();
- }
-
- /**
- * An example that creates a pipeline to do DatastoreIO.Read from Datastore.
- */
- public static void readDataFromDatastore(Options options) {
- Query query = makeAncestorKindQuery(options);
-
- // For Datastore sources, the read namespace can be set on the entire query.
- DatastoreIO.Source source = DatastoreIO.source()
- .withDataset(options.getDataset())
- .withQuery(query)
- .withNamespace(options.getNamespace());
-
- Pipeline p = Pipeline.create(options);
- p.apply("ReadShakespeareFromDatastore", Read.from(source))
- .apply("StringifyEntity", ParDo.of(new GetContentFn()))
- .apply("CountWords", new WordCount.CountWords())
- .apply("PrintWordCount", MapElements.via(new WordCount.FormatAsTextFn()))
- .apply("WriteLines", TextIO.Write.to(options.getOutput())
- .withNumShards(options.getNumShards()));
- p.run();
- }
-
- /**
- * An example to demo how to use {@link DatastoreIO}. The runner here is
- * customizable, which means users could pass either {@code DirectPipelineRunner}
- * or {@code DataflowPipelineRunner} in the pipeline options.
- */
- public static void main(String args[]) {
- // The options are used in two places, for Dataflow service, and
- // building DatastoreIO.Read object
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
-
- if (!options.isReadOnly()) {
- // First example: write data to Datastore for reading later.
- //
- // NOTE: this write does not delete any existing Entities in the Datastore, so if run
- // multiple times with the same output dataset, there may be duplicate entries. The
- // Datastore Query tool in the Google Developers Console can be used to inspect or erase all
- // entries with a particular namespace and/or kind.
- DatastoreWordCount.writeDataToDatastore(options);
- }
-
- // Second example: do parallel read from Datastore.
- DatastoreWordCount.readDataFromDatastore(options);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/DeDupExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/DeDupExample.java b/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/DeDupExample.java
deleted file mode 100644
index 9873561..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/DeDupExample.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.cookbook;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.transforms.RemoveDuplicates;
-import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
-
-/**
- * This example uses as input Shakespeare's plays as plaintext files, and will remove any
- * duplicate lines across all the files. (The output does not preserve any input order).
- *
- * <p>Concepts: the RemoveDuplicates transform, and how to wire transforms together.
- * Demonstrates {@link com.google.cloud.dataflow.sdk.io.TextIO.Read}/
- * {@link RemoveDuplicates}/{@link com.google.cloud.dataflow.sdk.io.TextIO.Write}.
- *
- * <p>To execute this pipeline locally, specify general pipeline configuration:
- * --project=YOUR_PROJECT_ID
- * and a local output file or output prefix on GCS:
- * --output=[YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PREFIX]
- *
- * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowPipelineRunner
- * and an output prefix on GCS:
- * --output=gs://YOUR_OUTPUT_PREFIX
- *
- * <p>The input defaults to {@code gs://dataflow-samples/shakespeare/*} and can be
- * overridden with {@code --input}.
- */
-public class DeDupExample {
-
- /**
- * Options supported by {@link DeDupExample}.
- *
- * <p>Inherits standard configuration options.
- */
- private static interface Options extends PipelineOptions {
- @Description("Path to the directory or GCS prefix containing files to read from")
- @Default.String("gs://dataflow-samples/shakespeare/*")
- String getInput();
- void setInput(String value);
-
- @Description("Path of the file to write to")
- @Default.InstanceFactory(OutputFactory.class)
- String getOutput();
- void setOutput(String value);
-
- /** Returns gs://${STAGING_LOCATION}/"deduped.txt". */
- public static class OutputFactory implements DefaultValueFactory<String> {
- @Override
- public String create(PipelineOptions options) {
- DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
- if (dataflowOptions.getStagingLocation() != null) {
- return GcsPath.fromUri(dataflowOptions.getStagingLocation())
- .resolve("deduped.txt").toString();
- } else {
- throw new IllegalArgumentException("Must specify --output or --stagingLocation");
- }
- }
- }
- }
-
-
- public static void main(String[] args)
- throws Exception {
-
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
- Pipeline p = Pipeline.create(options);
-
- p.apply(TextIO.Read.named("ReadLines").from(options.getInput()))
- .apply(RemoveDuplicates.<String>create())
- .apply(TextIO.Write.named("DedupedShakespeare")
- .to(options.getOutput()));
-
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/FilterExamples.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/FilterExamples.java b/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/FilterExamples.java
deleted file mode 100644
index 781873a..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/FilterExamples.java
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.cookbook;
-
-import com.google.api.services.bigquery.model.TableFieldSchema;
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.options.Validation;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.Mean;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.View;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.logging.Logger;
-
-/**
- * This is an example that demonstrates several approaches to filtering, and use of the Mean
- * transform. It shows how to dynamically set parameters by defining and using new pipeline options,
- * and how to use a value derived by the pipeline.
- *
- * <p>Concepts: The Mean transform; Options configuration; using pipeline-derived data as a side
- * input; approaches to filtering, selection, and projection.
- *
- * <p>The example reads public samples of weather data from BigQuery. It performs a
- * projection on the data, finds the global mean of the temperature readings, filters on readings
- * for a single given month, and then outputs only data (for that month) that has a mean temp
- * smaller than the derived global mean.
- *
- * <p>Note: Before running this example, you must create a BigQuery dataset to contain your output
- * table.
- *
- * <p>To execute this pipeline locally, specify general pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * }
- * </pre>
- * and the BigQuery table for the output:
- * <pre>{@code
- * --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
- * [--monthFilter=<month_number>]
- * }
- * </pre>
- * where optional parameter {@code --monthFilter} is set to a number 1-12.
- *
- * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowPipelineRunner
- * }
- * </pre>
- * and the BigQuery table for the output:
- * <pre>{@code
- * --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
- * [--monthFilter=<month_number>]
- * }
- * </pre>
- * where optional parameter {@code --monthFilter} is set to a number 1-12.
- *
- * <p>The BigQuery input table defaults to {@code clouddataflow-readonly:samples.weather_stations}
- * and can be overridden with {@code --input}.
- */
-public class FilterExamples {
- // Default to using a 1000 row subset of the public weather station table publicdata:samples.gsod.
- private static final String WEATHER_SAMPLES_TABLE =
- "clouddataflow-readonly:samples.weather_stations";
- static final Logger LOG = Logger.getLogger(FilterExamples.class.getName());
- static final int MONTH_TO_FILTER = 7;
-
- /**
- * Examines each row in the input table. Outputs only the subset of the cells this example
- * is interested in -- the mean_temp and year, month, and day -- as a BigQuery table row.
- */
- static class ProjectionFn extends DoFn<TableRow, TableRow> {
- @Override
- public void processElement(ProcessContext c){
- TableRow row = c.element();
- // Grab year, month, day, mean_temp from the row
- Integer year = Integer.parseInt((String) row.get("year"));
- Integer month = Integer.parseInt((String) row.get("month"));
- Integer day = Integer.parseInt((String) row.get("day"));
- Double meanTemp = Double.parseDouble(row.get("mean_temp").toString());
- // Prepares the data for writing to BigQuery by building a TableRow object
- TableRow outRow = new TableRow()
- .set("year", year).set("month", month)
- .set("day", day).set("mean_temp", meanTemp);
- c.output(outRow);
- }
- }
-
- /**
- * Implements 'filter' functionality.
- *
- * <p>Examines each row in the input table. Outputs only rows from the month
- * monthFilter, which is passed in as a parameter during construction of this DoFn.
- */
- static class FilterSingleMonthDataFn extends DoFn<TableRow, TableRow> {
- Integer monthFilter;
-
- public FilterSingleMonthDataFn(Integer monthFilter) {
- this.monthFilter = monthFilter;
- }
-
- @Override
- public void processElement(ProcessContext c){
- TableRow row = c.element();
- Integer month;
- month = (Integer) row.get("month");
- if (month.equals(this.monthFilter)) {
- c.output(row);
- }
- }
- }
-
- /**
- * Examines each row (weather reading) in the input table. Output the temperature
- * reading for that row ('mean_temp').
- */
- static class ExtractTempFn extends DoFn<TableRow, Double> {
- @Override
- public void processElement(ProcessContext c){
- TableRow row = c.element();
- Double meanTemp = Double.parseDouble(row.get("mean_temp").toString());
- c.output(meanTemp);
- }
- }
-
-
-
- /*
- * Finds the global mean of the mean_temp over all input records, and outputs only
- * the rows for the given month whose mean temp is smaller than this global mean.
- **/
- static class BelowGlobalMean
- extends PTransform<PCollection<TableRow>, PCollection<TableRow>> {
- Integer monthFilter;
-
- public BelowGlobalMean(Integer monthFilter) {
- this.monthFilter = monthFilter;
- }
-
-
- @Override
- public PCollection<TableRow> apply(PCollection<TableRow> rows) {
-
- // Extract the mean_temp from each row.
- PCollection<Double> meanTemps = rows.apply(
- ParDo.of(new ExtractTempFn()));
-
- // Find the global mean, of all the mean_temp readings in the weather data,
- // and prepare this singleton PCollectionView for use as a side input.
- final PCollectionView<Double> globalMeanTemp =
- meanTemps.apply(Mean.<Double>globally())
- .apply(View.<Double>asSingleton());
-
- // Rows filtered to remove all but a single month
- PCollection<TableRow> monthFilteredRows = rows
- .apply(ParDo.of(new FilterSingleMonthDataFn(monthFilter)));
-
- // Then, use the global mean as a side input, to further filter the weather data.
- // By using a side input to pass in the filtering criteria, we can use a value
- // that is computed earlier in pipeline execution.
- // We'll only output readings with temperatures below this mean.
- PCollection<TableRow> filteredRows = monthFilteredRows
- .apply(ParDo
- .named("ParseAndFilter")
- .withSideInputs(globalMeanTemp)
- .of(new DoFn<TableRow, TableRow>() {
- @Override
- public void processElement(ProcessContext c) {
- Double meanTemp = Double.parseDouble(c.element().get("mean_temp").toString());
- Double gTemp = c.sideInput(globalMeanTemp);
- if (meanTemp < gTemp) {
- c.output(c.element());
- }
- }
- }));
-
- return filteredRows;
- }
- }
-
-
- /**
- * Options supported by {@link FilterExamples}.
- *
- * <p>Inherits standard configuration options.
- */
- private static interface Options extends PipelineOptions {
- @Description("Table to read from, specified as "
- + "<project_id>:<dataset_id>.<table_id>")
- @Default.String(WEATHER_SAMPLES_TABLE)
- String getInput();
- void setInput(String value);
-
- @Description("Table to write to, specified as "
- + "<project_id>:<dataset_id>.<table_id>. "
- + "The dataset_id must already exist")
- @Validation.Required
- String getOutput();
- void setOutput(String value);
-
- @Description("Numeric value of month to filter on")
- @Default.Integer(MONTH_TO_FILTER)
- Integer getMonthFilter();
- void setMonthFilter(Integer value);
- }
-
- /**
- * Helper method to build the table schema for the output table.
- */
- private static TableSchema buildWeatherSchemaProjection() {
- List<TableFieldSchema> fields = new ArrayList<>();
- fields.add(new TableFieldSchema().setName("year").setType("INTEGER"));
- fields.add(new TableFieldSchema().setName("month").setType("INTEGER"));
- fields.add(new TableFieldSchema().setName("day").setType("INTEGER"));
- fields.add(new TableFieldSchema().setName("mean_temp").setType("FLOAT"));
- TableSchema schema = new TableSchema().setFields(fields);
- return schema;
- }
-
- public static void main(String[] args)
- throws Exception {
-
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
- Pipeline p = Pipeline.create(options);
-
- TableSchema schema = buildWeatherSchemaProjection();
-
- p.apply(BigQueryIO.Read.from(options.getInput()))
- .apply(ParDo.of(new ProjectionFn()))
- .apply(new BelowGlobalMean(options.getMonthFilter()))
- .apply(BigQueryIO.Write
- .to(options.getOutput())
- .withSchema(schema)
- .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
- .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
-
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/JoinExamples.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/JoinExamples.java b/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/JoinExamples.java
deleted file mode 100644
index 745c5d6..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/JoinExamples.java
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.cookbook;
-
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.options.Validation;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.join.CoGbkResult;
-import com.google.cloud.dataflow.sdk.transforms.join.CoGroupByKey;
-import com.google.cloud.dataflow.sdk.transforms.join.KeyedPCollectionTuple;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-
-/**
- * This example shows how to do a join on two collections.
- * It uses a sample of the GDELT 'world event' data (http://goo.gl/OB6oin), joining the event
- * 'action' country code against a table that maps country codes to country names.
- *
- * <p>Concepts: Join operation; multiple input sources.
- *
- * <p>To execute this pipeline locally, specify general pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * }
- * </pre>
- * and a local output file or output prefix on GCS:
- * <pre>{@code
- * --output=[YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PREFIX]
- * }</pre>
- *
- * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowPipelineRunner
- * }
- * </pre>
- * and an output prefix on GCS:
- * <pre>{@code
- * --output=gs://YOUR_OUTPUT_PREFIX
- * }</pre>
- */
-public class JoinExamples {
-
- // A 1000-row sample of the GDELT data here: gdelt-bq:full.events.
- private static final String GDELT_EVENTS_TABLE =
- "clouddataflow-readonly:samples.gdelt_sample";
- // A table that maps country codes to country names.
- private static final String COUNTRY_CODES =
- "gdelt-bq:full.crosswalk_geocountrycodetohuman";
-
- /**
- * Join two collections, using country code as the key.
- */
- static PCollection<String> joinEvents(PCollection<TableRow> eventsTable,
- PCollection<TableRow> countryCodes) throws Exception {
-
- final TupleTag<String> eventInfoTag = new TupleTag<String>();
- final TupleTag<String> countryInfoTag = new TupleTag<String>();
-
- // transform both input collections to tuple collections, where the keys are country
- // codes in both cases.
- PCollection<KV<String, String>> eventInfo = eventsTable.apply(
- ParDo.of(new ExtractEventDataFn()));
- PCollection<KV<String, String>> countryInfo = countryCodes.apply(
- ParDo.of(new ExtractCountryInfoFn()));
-
- // country code 'key' -> CGBKR (<event info>, <country name>)
- PCollection<KV<String, CoGbkResult>> kvpCollection = KeyedPCollectionTuple
- .of(eventInfoTag, eventInfo)
- .and(countryInfoTag, countryInfo)
- .apply(CoGroupByKey.<String>create());
-
- // Process the CoGbkResult elements generated by the CoGroupByKey transform.
- // country code 'key' -> string of <event info>, <country name>
- PCollection<KV<String, String>> finalResultCollection =
- kvpCollection.apply(ParDo.named("Process").of(
- new DoFn<KV<String, CoGbkResult>, KV<String, String>>() {
- @Override
- public void processElement(ProcessContext c) {
- KV<String, CoGbkResult> e = c.element();
- String countryCode = e.getKey();
- String countryName = "none";
- countryName = e.getValue().getOnly(countryInfoTag);
- for (String eventInfo : c.element().getValue().getAll(eventInfoTag)) {
- // Generate a string that combines information from both collection values
- c.output(KV.of(countryCode, "Country name: " + countryName
- + ", Event info: " + eventInfo));
- }
- }
- }));
-
- // format each joined result as a single output string; the actual write happens in main()
- PCollection<String> formattedResults = finalResultCollection
- .apply(ParDo.named("Format").of(new DoFn<KV<String, String>, String>() {
- @Override
- public void processElement(ProcessContext c) {
- String outputstring = "Country code: " + c.element().getKey()
- + ", " + c.element().getValue();
- c.output(outputstring);
- }
- }));
- return formattedResults;
- }
-
- /**
- * Examines each row (event) in the input table. Output a KV with the key the country
- * code of the event, and the value a string encoding event information.
- */
- static class ExtractEventDataFn extends DoFn<TableRow, KV<String, String>> {
- @Override
- public void processElement(ProcessContext c) {
- TableRow row = c.element();
- String countryCode = (String) row.get("ActionGeo_CountryCode");
- String sqlDate = (String) row.get("SQLDATE");
- String actor1Name = (String) row.get("Actor1Name");
- String sourceUrl = (String) row.get("SOURCEURL");
- String eventInfo = "Date: " + sqlDate + ", Actor1: " + actor1Name + ", url: " + sourceUrl;
- c.output(KV.of(countryCode, eventInfo));
- }
- }
-
-
- /**
- * Examines each row (country info) in the input table. Output a KV with the key the country
- * code, and the value the country name.
- */
- static class ExtractCountryInfoFn extends DoFn<TableRow, KV<String, String>> {
- @Override
- public void processElement(ProcessContext c) {
- TableRow row = c.element();
- String countryCode = (String) row.get("FIPSCC");
- String countryName = (String) row.get("HumanName");
- c.output(KV.of(countryCode, countryName));
- }
- }
-
-
- /**
- * Options supported by {@link JoinExamples}.
- *
- * <p>Inherits standard configuration options.
- */
- private static interface Options extends PipelineOptions {
- @Description("Path of the file to write to")
- @Validation.Required
- String getOutput();
- void setOutput(String value);
- }
-
- public static void main(String[] args) throws Exception {
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
- Pipeline p = Pipeline.create(options);
- // the following two 'applys' create multiple inputs to our pipeline, one for each
- // of our two input sources.
- PCollection<TableRow> eventsTable = p.apply(BigQueryIO.Read.from(GDELT_EVENTS_TABLE));
- PCollection<TableRow> countryCodes = p.apply(BigQueryIO.Read.from(COUNTRY_CODES));
- PCollection<String> formattedResults = joinEvents(eventsTable, countryCodes);
- formattedResults.apply(TextIO.Write.to(options.getOutput()));
- p.run();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamples.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamples.java b/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamples.java
deleted file mode 100644
index 1c26d0f..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamples.java
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.cookbook;
-
-import com.google.api.services.bigquery.model.TableFieldSchema;
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.options.Validation;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.Max;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * An example that reads the public samples of weather data from BigQuery, and finds
- * the maximum temperature ('mean_temp') for each month.
- *
- * <p>Concepts: The 'Max' statistical combination function, and how to find the max per
- * key group.
- *
- * <p>Note: Before running this example, you must create a BigQuery dataset to contain your output
- * table.
- *
- * <p>To execute this pipeline locally, specify general pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * }
- * </pre>
- * and the BigQuery table for the output, with the form
- * <pre>{@code
- * --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
- * }</pre>
- *
- * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowPipelineRunner
- * }
- * </pre>
- * and the BigQuery table for the output:
- * <pre>{@code
- * --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
- * }</pre>
- *
- * <p>The BigQuery input table defaults to {@code clouddataflow-readonly:samples.weather_stations }
- * and can be overridden with {@code --input}.
- */
-public class MaxPerKeyExamples {
- // Input table used when --input is not supplied.
- private static final String WEATHER_SAMPLES_TABLE =
- "clouddataflow-readonly:samples.weather_stations";
-
- /**
- * Turns one weather-reading row into a (month, mean_temp) pair.
- */
- static class ExtractTempFn extends DoFn<TableRow, KV<Integer, Double>> {
- @Override
- public void processElement(ProcessContext c) {
- TableRow reading = c.element();
- // "month" is cast to String before parsing; "mean_temp" is parsed from its toString() form.
- Integer month = Integer.valueOf((String) reading.get("month"));
- Double meanTemp = Double.valueOf(reading.get("mean_temp").toString());
- c.output(KV.of(month, meanTemp));
- }
- }
-
- /**
- * Turns one (month, max mean_temp) pair into a TableRow with the columns
- * "month" and "max_mean_temp".
- */
- static class FormatMaxesFn extends DoFn<KV<Integer, Double>, TableRow> {
- @Override
- public void processElement(ProcessContext c) {
- KV<Integer, Double> monthlyMax = c.element();
- c.output(new TableRow()
- .set("month", monthlyMax.getKey())
- .set("max_mean_temp", monthlyMax.getValue()));
- }
- }
-
- /**
- * Composite transform: extracts (month, mean_temp) from every input row, keeps the maximum
- * mean_temp per month using the 'Max' combination function, and formats the result as rows.
- */
- static class MaxMeanTemp
- extends PTransform<PCollection<TableRow>, PCollection<TableRow>> {
- @Override
- public PCollection<TableRow> apply(PCollection<TableRow> rows) {
- return rows
- // row... => <month, mean_temp>...
- .apply(ParDo.of(new ExtractTempFn()))
- // <month, mean_temp>... => <month, max mean_temp>...
- .apply(Max.<Integer>doublesPerKey())
- // <month, max mean_temp>... => row...
- .apply(ParDo.of(new FormatMaxesFn()));
- }
- }
-
- /**
- * Options supported by {@link MaxPerKeyExamples}.
- *
- * <p>Inherits standard configuration options.
- */
- private interface Options extends PipelineOptions {
- // Source table; falls back to WEATHER_SAMPLES_TABLE.
- @Description("Table to read from, specified as "
- + "<project_id>:<dataset_id>.<table_id>")
- @Default.String(WEATHER_SAMPLES_TABLE)
- String getInput();
-
- void setInput(String value);
-
- // Destination table; marked as required.
- @Description("Table to write to, specified as "
- + "<project_id>:<dataset_id>.<table_id>")
- @Validation.Required
- String getOutput();
-
- void setOutput(String value);
- }
-
- public static void main(String[] args)
- throws Exception {
- Options opts = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
- Pipeline pipeline = Pipeline.create(opts);
-
- // Schema of the output table: an INTEGER "month" and a FLOAT "max_mean_temp" column.
- List<TableFieldSchema> outputFields = new ArrayList<>();
- outputFields.add(new TableFieldSchema().setName("month").setType("INTEGER"));
- outputFields.add(new TableFieldSchema().setName("max_mean_temp").setType("FLOAT"));
- TableSchema outputSchema = new TableSchema().setFields(outputFields);
-
- PCollection<TableRow> weatherRows = pipeline.apply(BigQueryIO.Read.from(opts.getInput()));
- PCollection<TableRow> monthlyMaxes = weatherRows.apply(new MaxMeanTemp());
- monthlyMaxes.apply(BigQueryIO.Write
- .to(opts.getOutput())
- .withSchema(outputSchema)
- .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
- .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
-
- pipeline.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/README.md
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/README.md b/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/README.md
deleted file mode 100644
index 99f3080..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/README.md
+++ /dev/null
@@ -1,55 +0,0 @@
-
-# "Cookbook" Examples
-
-This directory holds simple "cookbook" examples, which show how to define
-commonly-used data analysis patterns that you would likely incorporate into a
-larger Dataflow pipeline. They include:
-
- <ul>
- <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoes.java">BigQueryTornadoes</a>
- — An example that reads the public samples of weather data from Google
- BigQuery, counts the number of tornadoes that occur in each month, and
- writes the results to BigQuery. Demonstrates reading/writing BigQuery,
- counting a <code>PCollection</code>, and user-defined <code>PTransforms</code>.</li>
- <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamples.java">CombinePerKeyExamples</a>
- — An example that reads the public "Shakespeare" data, and for
- each word in the dataset that exceeds a given length, generates a string
- containing the list of play names in which that word appears.
- Demonstrates the <code>Combine.perKey</code>
- transform, which lets you combine the values in a key-grouped
- <code>PCollection</code>.
- </li>
- <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/DatastoreWordCount.java">DatastoreWordCount</a>
- — An example that shows you how to read from Google Cloud Datastore.</li>
- <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/DeDupExample.java">DeDupExample</a>
- — An example that uses Shakespeare's plays as plain text files, and
- removes duplicate lines across all the files. Demonstrates the
- <code>RemoveDuplicates</code>, <code>TextIO.Read</code>,
- and <code>TextIO.Write</code> transforms, and how to wire transforms together.
- </li>
- <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/FilterExamples.java">FilterExamples</a>
- — An example that shows different approaches to filtering, including
- selection and projection. It also shows how to dynamically set parameters
- by defining and using new pipeline options, and how to use a value derived
- by a pipeline. Demonstrates the <code>Mean</code> transform,
- <code>Options</code> configuration, and using pipeline-derived data as a side
- input.
- </li>
- <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/JoinExamples.java">JoinExamples</a>
- — An example that shows how to join two collections. It uses a
- sample of the <a href="http://goo.gl/OB6oin">GDELT "world event"
- data</a>, joining the event <code>action</code> country code against a table
- that maps country codes to country names. Demonstrates the <code>Join</code>
- operation, and using multiple input sources.
- </li>
- <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamples.java">MaxPerKeyExamples</a>
- — An example that reads the public samples of weather data from BigQuery,
- and finds the maximum temperature (<code>mean_temp</code>) for each month.
- Demonstrates the <code>Max</code> statistical combination transform, and how to
- find the max-per-key group.
- </li>
- </ul>
-
-See the [documentation](https://cloud.google.com/dataflow/getting-started) and the [Examples
-README](../../../../../../../../../README.md) for
-information about how to run these examples.
[27/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/SourceTestUtils.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/SourceTestUtils.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/SourceTestUtils.java
deleted file mode 100644
index b8f9b0b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/SourceTestUtils.java
+++ /dev/null
@@ -1,642 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.testing;
-
-import static org.hamcrest.Matchers.containsInAnyOrder;
-import static org.hamcrest.Matchers.equalTo;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertThat;
-import static org.junit.Assert.assertTrue;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.io.BoundedSource;
-import com.google.cloud.dataflow.sdk.io.Source;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.values.KV;
-
-import org.junit.Assert;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Objects;
-import java.util.concurrent.Callable;
-import java.util.concurrent.CountDownLatch;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-
-/**
- * Helper functions and test harnesses for checking correctness of {@link Source}
- * implementations.
- *
- * <p>Contains a few lightweight utilities (e.g. reading items from a source or a reader,
- * such as {@link #readFromSource} and {@link #readFromUnstartedReader}), as well as
- * heavyweight property testing and stress testing harnesses that help achieve a large
- * amount of test coverage with little code. The most notable ones are:
- * <ul>
- * <li>{@link #assertSourcesEqualReferenceSource} helps testing that the data read
- * by the union of sources produced by {@link BoundedSource#splitIntoBundles}
- * is the same as data read by the original source.
- * <li>If your source implements dynamic work rebalancing, use the
- * {@code assertSplitAtFraction} family of functions - they test behavior of
- * {@link BoundedSource.BoundedReader#splitAtFraction}, in particular, that
- * various consistency properties are respected and the total set of data read
- * by the source is preserved when splits happen.
- * Use {@link #assertSplitAtFractionBehavior} to test individual cases
- * of {@code splitAtFraction} and use {@link #assertSplitAtFractionExhaustive}
- * as a heavy-weight stress test including concurrency. We strongly recommend
- * using both.
- * </ul>
- * For example usages, see the unit tests of classes such as
- * {@link com.google.cloud.dataflow.sdk.io.AvroSource} or
- * {@link com.google.cloud.dataflow.sdk.io.XmlSource}.
- *
- * <p>Like {@link DataflowAssert}, requires JUnit and Hamcrest to be present in the classpath.
- */
-public class SourceTestUtils {
- // A wrapper around a value of type T that compares according to the structural
- // value provided by a Coder<T>, but prints both the original and structural value,
- // to help get good error messages from JUnit equality assertion failures and such.
- private static class ReadableStructuralValue<T> {
- private T originalValue;
- private Object structuralValue;
-
- public ReadableStructuralValue(T originalValue, Object structuralValue) {
- this.originalValue = originalValue;
- this.structuralValue = structuralValue;
- }
-
- @Override
- public int hashCode() {
- return Objects.hashCode(structuralValue);
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj == null || !(obj instanceof ReadableStructuralValue)) {
- return false;
- }
- return Objects.equals(structuralValue, ((ReadableStructuralValue) obj).structuralValue);
- }
-
- @Override
- public String toString() {
- return String.format("[%s (structural %s)]", originalValue, structuralValue);
- }
- }
-
- /**
- * Pairs every element of {@code list} with the {@link Coder#structuralValue} that
- * {@code coder} computes for it.
- *
- * <p>The testing utilities in this class compare elements read by sources using standard
- * assertions and matchers. Those elements may not implement {@code equals}/{@code hashCode}
- * properly, whereas every source has a {@link Coder} and every {@code Coder} can produce a
- * structural value whose {@code equals}/{@code hashCode} is consistent with equality of the
- * encoded format. The returned wrappers therefore compare by structural value.
- */
- public static <T> List<ReadableStructuralValue<T>> createStructuralValues(
- Coder<T> coder, List<T> list)
- throws Exception {
- List<ReadableStructuralValue<T>> wrapped = new ArrayList<>();
- for (T item : list) {
- Object structural = coder.structuralValue(item);
- wrapped.add(new ReadableStructuralValue<>(item, structural));
- }
- return wrapped;
- }
-
- /**
- * Creates a reader for the given {@link BoundedSource}, reads every element from it and
- * closes the reader again.
- */
- public static <T> List<T> readFromSource(BoundedSource<T> source, PipelineOptions options)
- throws IOException {
- try (BoundedSource.BoundedReader<T> freshReader = source.createReader(options)) {
- List<T> allItems = readFromUnstartedReader(freshReader);
- return allItems;
- }
- }
-
- /**
- * Starts the given {@link Source.Reader} and returns every element it produces.
- */
- public static <T> List<T> readFromUnstartedReader(Source.Reader<T> reader) throws IOException {
- // Integer.MAX_VALUE is the "read everything" marker understood by readNItemsFromReader.
- List<T> everything = readNItemsFromReader(reader, Integer.MAX_VALUE, false);
- return everything;
- }
-
- /**
- * Returns every remaining element of a {@link Source.Reader} that has already been started.
- */
- public static <T> List<T> readFromStartedReader(Source.Reader<T> reader) throws IOException {
- // Integer.MAX_VALUE is the "read everything" marker understood by readNItemsFromReader.
- List<T> remaining = readNItemsFromReader(reader, Integer.MAX_VALUE, true);
- return remaining;
- }
-
- /**
- * Starts the given {@link Source.Reader} and reads exactly {@code n} elements from it.
- */
- public static <T> List<T> readNItemsFromUnstartedReader(Source.Reader<T> reader, int n)
- throws IOException {
- final boolean alreadyStarted = false;
- return readNItemsFromReader(reader, n, alreadyStarted);
- }
-
- /**
- * Reads {@code n} further elements from a {@link Source.Reader} on which
- * {@link Source.Reader#start} has already been called.
- */
- public static <T> List<T> readNItemsFromStartedReader(Source.Reader<T> reader, int n)
- throws IOException {
- final boolean alreadyStarted = true;
- return readNItemsFromReader(reader, n, alreadyStarted);
- }
-
- /**
- * Read elements from a {@link Source.Reader} until n elements are read.
- *
- * <p>There must be at least n elements remaining in the reader, except for
- * the case when n is {@code Integer.MAX_VALUE}, which means "read all
- * remaining elements".
- */
- private static <T> List<T> readNItemsFromReader(Source.Reader<T> reader, int n, boolean started)
- throws IOException {
- List<T> res = new ArrayList<>();
- for (int i = 0; i < n; i++) {
- boolean shouldStart = (i == 0 && !started);
- boolean more = shouldStart ? reader.start() : reader.advance();
- if (n != Integer.MAX_VALUE) {
- assertTrue(more);
- }
- if (!more) {
- break;
- }
- res.add(reader.getCurrent());
- }
- return res;
- }
-
- /**
- * Reads everything that is left in a {@link Source.Reader}; {@code started} tells whether
- * the reader has already been started.
- */
- public static <T> List<T> readRemainingFromReader(Source.Reader<T> reader, boolean started)
- throws IOException {
- // Integer.MAX_VALUE is the "read everything" marker understood by readNItemsFromReader.
- final int noLimit = Integer.MAX_VALUE;
- return readNItemsFromReader(reader, noLimit, started);
- }
-
- /**
- * Asserts that reading all of {@code sources} yields, in any order, exactly the records
- * read from {@code referenceSource}.
- *
- * <p>Also asserts that every source in the list reports a default output coder equal to
- * that of the reference source. Records are compared by the coder's structural value.
- */
- public static <T> void assertSourcesEqualReferenceSource(
- BoundedSource<T> referenceSource,
- List<? extends BoundedSource<T>> sources,
- PipelineOptions options)
- throws Exception {
- Coder<T> referenceCoder = referenceSource.getDefaultOutputCoder();
- List<T> expectedRecords = readFromSource(referenceSource, options);
- List<T> unionOfBundles = new ArrayList<>();
- for (BoundedSource<T> bundle : sources) {
- String coderMismatch =
- "Coder type for source "
- + bundle
- + " is not compatible with Coder type for referenceSource "
- + referenceSource;
- assertThat(coderMismatch, bundle.getDefaultOutputCoder(), equalTo(referenceCoder));
- unionOfBundles.addAll(readFromSource(bundle, options));
- }
- List<ReadableStructuralValue<T>> actualValues =
- createStructuralValues(referenceCoder, unionOfBundles);
- List<ReadableStructuralValue<T>> expectedValues =
- createStructuralValues(referenceCoder, expectedRecords);
- assertThat(actualValues, containsInAnyOrder(expectedValues.toArray()));
- }
-
- /**
- * Asserts that the source reported by {@code reader.getCurrentSource()} produces, in any
- * order, the same records as reading the unstarted {@code reader} to the end.
- */
- public static <T> void assertUnstartedReaderReadsSameAsItsSource(
- BoundedSource.BoundedReader<T> reader, PipelineOptions options) throws Exception {
- Coder<T> coder = reader.getCurrentSource().getDefaultOutputCoder();
- List<T> fromReader = readFromUnstartedReader(reader);
- // The current source is queried again only after the reader has been drained.
- List<T> fromSource = readFromSource(reader.getCurrentSource(), options);
- List<ReadableStructuralValue<T>> readerValues = createStructuralValues(coder, fromReader);
- List<ReadableStructuralValue<T>> sourceValues = createStructuralValues(coder, fromSource);
- assertThat(sourceValues, containsInAnyOrder(readerValues.toArray()));
- }
-
- /**
- * Expected outcome of
- * {@link com.google.cloud.dataflow.sdk.io.BoundedSource.BoundedReader#splitAtFraction},
- * passed to the {@code assertSplitAtFraction*} helpers to select which result they accept.
- */
- public enum ExpectedSplitOutcome {
- /**
- * The split must succeed (return a non-null residual source) and the results must be
- * consistent.
- */
- MUST_SUCCEED_AND_BE_CONSISTENT,
- /**
- * The split must fail, i.e. {@code splitAtFraction} must return {@code null}.
- */
- MUST_FAIL,
- /**
- * The split may either fail or succeed; if it succeeds, the results must be consistent.
- */
- MUST_BE_CONSISTENT_IF_SUCCEEDS
- }
-
- /**
- * Item counts observed for one {@code splitAtFraction} attempt: the number of items in the
- * primary source and the number of items in the residual source. The residual count is -1
- * if the split failed.
- */
- private static class SplitAtFractionResult {
- // Number of items read from the primary source.
- public int numPrimaryItems;
- // Number of items read from the residual source, or -1 when the split failed.
- public int numResidualItems;
-
- public SplitAtFractionResult(int numPrimaryItems, int numResidualItems) {
- this.numPrimaryItems = numPrimaryItems;
- this.numResidualItems = numResidualItems;
- }
- }
-
- /**
- * Asserts that calling {@code splitAtFraction(splitFraction)} on the {@code source}'s reader,
- * after reading {@code numItemsToReadBeforeSplit} items, behaves as described by
- * {@code expectedOutcome}: it must succeed, must fail, or may do either. Whenever the split
- * succeeds the result is additionally checked for consistency, as described in
- * {@link #assertSplitAtFractionSucceedsAndConsistent}.
- *
- * <p>The expected items are obtained by first reading {@code source} completely.
- *
- * <p>Returns a SplitAtFractionResult holding the number of items in the primary and in the
- * residual source (-1 for the residual if the split failed).
- */
-
- public static <T> SplitAtFractionResult assertSplitAtFractionBehavior(
- BoundedSource<T> source,
- int numItemsToReadBeforeSplit,
- double splitFraction,
- ExpectedSplitOutcome expectedOutcome,
- PipelineOptions options)
- throws Exception {
- // Read the whole source once up front; the result is the reference for the consistency checks.
- return assertSplitAtFractionBehaviorImpl(
- source, readFromSource(source, options), numItemsToReadBeforeSplit, splitFraction,
- expectedOutcome, options);
- }
-
- /**
- * Asserts that {@code expected} and {@code actual} hold equal items at every position.
- *
- * <p>On a mismatch the failure message names the first differing position and both items;
- * when one list is merely longer, it shows up to five of the surplus items. The labels are
- * used to refer to the two lists in the message.
- */
- private static <T> void assertListsEqualInOrder(
- String message, String expectedLabel, List<T> expected, String actualLabel, List<T> actual) {
- int expectedSize = expected.size();
- int actualSize = actual.size();
- int commonLength = Math.min(expectedSize, actualSize);
- // Walk the shared prefix; the first mismatching position fails the assertion.
- for (int pos = 0; pos < commonLength; pos++) {
- T expectedItem = expected.get(pos);
- T actualItem = actual.get(pos);
- if (Objects.equals(expectedItem, actualItem)) {
- continue;
- }
- Assert.fail(String.format(
- "%s: %s and %s have %d items in common and then differ. "
- + "Item in %s (%d more): %s, item in %s (%d more): %s",
- message, expectedLabel, actualLabel, pos,
- expectedLabel, expectedSize - pos - 1, expectedItem,
- actualLabel, actualSize - pos - 1, actualItem));
- }
- // The shared prefix matched, so any remaining difference is surplus items on one side.
- if (expectedSize > actualSize) {
- Assert.fail(String.format(
- "%s: %s has %d more items after matching all %d from %s. First 5: %s",
- message, expectedLabel, expectedSize - actualSize, actualSize, actualLabel,
- expected.subList(actualSize, Math.min(expectedSize, actualSize + 5))));
- }
- if (actualSize > expectedSize) {
- Assert.fail(String.format(
- "%s: %s has %d more items after matching all %d from %s. First 5: %s",
- message, actualLabel, actualSize - expectedSize, expectedSize, expectedLabel,
- actual.subList(expectedSize, Math.min(actualSize, expectedSize + 5))));
- }
- }
-
- /**
- * Performs one split attempt on a fresh reader of {@code source} and verifies it.
- *
- * <p>Reads {@code numItemsToReadBeforeSplit} items, calls {@code splitAtFraction(splitFraction)},
- * checks the outcome against {@code expectedOutcome}, reads the rest of the reader and then
- * hands everything to {@code verifySingleSplitAtFractionResult}, whose result is returned.
- * {@code expectedItems} are the items of the unsplit {@code source}.
- */
- private static <T> SourceTestUtils.SplitAtFractionResult assertSplitAtFractionBehaviorImpl(
- BoundedSource<T> source, List<T> expectedItems, int numItemsToReadBeforeSplit,
- double splitFraction, ExpectedSplitOutcome expectedOutcome, PipelineOptions options)
- throws Exception {
- try (BoundedSource.BoundedReader<T> reader = source.createReader(options)) {
- // Remember the reader's source before the split so a change can be detected afterwards.
- BoundedSource<T> originalSource = reader.getCurrentSource();
- List<T> currentItems = readNItemsFromUnstartedReader(reader, numItemsToReadBeforeSplit);
- // A null residual means the split failed.
- BoundedSource<T> residual = reader.splitAtFraction(splitFraction);
- if (residual != null) {
- // After a successful split the reader must report a different (new) source object.
- assertFalse(
- String.format(
- "Primary source didn't change after a successful split of %s at %f "
- + "after reading %d items. "
- + "Was the source object mutated instead of creating a new one? "
- + "Source objects MUST be immutable.",
- source, splitFraction, numItemsToReadBeforeSplit),
- reader.getCurrentSource() == originalSource);
- // NOTE(review): the message below speaks of the "original source", but the condition
- // compares the reader's current (primary) source with the residual - confirm which
- // of the two was intended.
- assertFalse(
- String.format(
- "Residual source equal to original source after a successful split of %s at %f "
- + "after reading %d items. "
- + "Was the source object mutated instead of creating a new one? "
- + "Source objects MUST be immutable.",
- source, splitFraction, numItemsToReadBeforeSplit),
- reader.getCurrentSource() == residual);
- }
- // Failure cases are: must succeed but fails; must fail but succeeds.
- switch (expectedOutcome) {
- case MUST_SUCCEED_AND_BE_CONSISTENT:
- assertNotNull(
- "Failed to split reader of source: "
- + source
- + " at "
- + splitFraction
- + " after reading "
- + numItemsToReadBeforeSplit
- + " items",
- residual);
- break;
- case MUST_FAIL:
- assertEquals(null, residual);
- break;
- case MUST_BE_CONSISTENT_IF_SUCCEEDS:
- // Nothing.
- break;
- }
- // The reader has been started only if at least one item was read before the split.
- currentItems.addAll(readRemainingFromReader(reader, numItemsToReadBeforeSplit > 0));
- BoundedSource<T> primary = reader.getCurrentSource();
- return verifySingleSplitAtFractionResult(
- source, expectedItems, currentItems, primary, residual,
- numItemsToReadBeforeSplit, splitFraction, options);
- }
- }
-
- /**
- * Checks the result of one split attempt and reports the item counts.
- *
- * <p>When {@code residual} is null (the split failed) nothing is compared and the residual
- * count is reported as -1. Otherwise two in-order comparisons on structural values are made:
- * {@code currentItems} against the items of {@code primary}, and {@code expectedItems}
- * against the items of {@code primary} followed by those of {@code residual}.
- */
- private static <T> SourceTestUtils.SplitAtFractionResult verifySingleSplitAtFractionResult(
- BoundedSource<T> source, List<T> expectedItems, List<T> currentItems,
- BoundedSource<T> primary, BoundedSource<T> residual,
- int numItemsToReadBeforeSplit, double splitFraction, PipelineOptions options)
- throws Exception {
- List<T> itemsInPrimary = readFromSource(primary, options);
- if (residual == null) {
- return new SplitAtFractionResult(itemsInPrimary.size(), -1);
- }
-
- List<T> itemsInResidual = readFromSource(residual, options);
- List<T> primaryThenResidual = new ArrayList<>(itemsInPrimary);
- primaryThenResidual.addAll(itemsInResidual);
-
- String primaryMismatch =
- String.format(
- "Continued reading after split yielded different items than primary source: "
- + "split at %s after reading %s items, original source: %s, primary source: %s",
- splitFraction, numItemsToReadBeforeSplit, source, primary);
- String totalMismatch =
- String.format(
- "Items in primary and residual sources after split do not add up to items "
- + "in the original source. Split at %s after reading %s items; "
- + "original source: %s, primary: %s, residual: %s",
- splitFraction, numItemsToReadBeforeSplit, source, primary, residual);
-
- Coder<T> coder = primary.getDefaultOutputCoder();
- List<ReadableStructuralValue<T>> primaryValues =
- createStructuralValues(coder, itemsInPrimary);
- List<ReadableStructuralValue<T>> currentValues =
- createStructuralValues(coder, currentItems);
- List<ReadableStructuralValue<T>> expectedValues =
- createStructuralValues(coder, expectedItems);
- List<ReadableStructuralValue<T>> totalValues =
- createStructuralValues(coder, primaryThenResidual);
-
- assertListsEqualInOrder(
- primaryMismatch, "current", currentValues, "primary", primaryValues);
- assertListsEqualInOrder(
- totalMismatch, "total", expectedValues, "primary+residual", totalValues);
- return new SplitAtFractionResult(itemsInPrimary.size(), itemsInResidual.size());
- }
-
- /**
- * Asserts that {@link BoundedSource.BoundedReader#splitAtFraction} succeeds on the given
- * source and that the result is consistent. In pseudocode:
- * <pre>
- * Reader reader = source.createReader();
- * read N items from reader;
- * Source residual = reader.splitAtFraction(splitFraction);
- * Source primary = reader.getCurrentSource();
- * assert: items in primary == items we read so far
- * + items we'll get by continuing to read from reader;
- * assert: items in original source == items in primary + items in residual
- * </pre>
- */
- public static <T> void assertSplitAtFractionSucceedsAndConsistent(
- BoundedSource<T> source,
- int numItemsToReadBeforeSplit,
- double splitFraction,
- PipelineOptions options)
- throws Exception {
- ExpectedSplitOutcome requiredOutcome = ExpectedSplitOutcome.MUST_SUCCEED_AND_BE_CONSISTENT;
- assertSplitAtFractionBehavior(
- source, numItemsToReadBeforeSplit, splitFraction, requiredOutcome, options);
- }
-
- /**
- * Asserts that {@code splitAtFraction(splitFraction)} fails on the {@code source}'s reader
- * once {@code numItemsToReadBeforeSplit} items have been read from it.
- */
- public static <T> void assertSplitAtFractionFails(
- BoundedSource<T> source,
- int numItemsToReadBeforeSplit,
- double splitFraction,
- PipelineOptions options)
- throws Exception {
- ExpectedSplitOutcome requiredOutcome = ExpectedSplitOutcome.MUST_FAIL;
- assertSplitAtFractionBehavior(
- source, numItemsToReadBeforeSplit, splitFraction, requiredOutcome, options);
- }
-
- // Collects the fractions tried by assertSplitAtFractionBinary that led to a split.
- private static class SplitFractionStatistics {
- // Fractions at which the split succeeded (residual item count other than -1).
- List<Double> successfulFractions = new ArrayList<>();
- // Fractions at which the split produced a residual with at least one item.
- List<Double> nonTrivialFractions = new ArrayList<>();
- }
-
- /**
- * Asserts that given a start position,
- * {@link BoundedSource.BoundedReader#splitAtFraction} at every interesting fraction (halfway
- * between two fractions that differ by at least one item) can be called successfully and the
- * results are consistent if a split succeeds.
- *
- * <p>Works by bisecting the interval between {@code leftFraction} and {@code rightFraction}.
- * {@code leftResult} and {@code rightResult} are the already known results for the two ends
- * and may be null, in which case they are computed here. Fractions at which the split
- * succeeded are recorded in {@code stats}.
- */
- private static <T> void assertSplitAtFractionBinary(
- BoundedSource<T> source,
- List<T> expectedItems,
- int numItemsToBeReadBeforeSplit,
- double leftFraction,
- SplitAtFractionResult leftResult,
- double rightFraction,
- SplitAtFractionResult rightResult,
- PipelineOptions options,
- SplitFractionStatistics stats)
- throws Exception {
- if (rightFraction - leftFraction < 0.001) {
- // Do not recurse too deeply. Otherwise we will end up in infinite
- // recursion, e.g., while trying to find the exact minimal fraction s.t.
- // split succeeds. A precision of 0.001 when looking for such a fraction
- // ought to be enough for everybody.
- return;
- }
- double middleFraction = (rightFraction + leftFraction) / 2;
- // Results for the interval ends are computed on demand when the caller did not supply them.
- if (leftResult == null) {
- leftResult = assertSplitAtFractionBehaviorImpl(
- source, expectedItems, numItemsToBeReadBeforeSplit, leftFraction,
- ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
- }
- if (rightResult == null) {
- rightResult = assertSplitAtFractionBehaviorImpl(
- source, expectedItems, numItemsToBeReadBeforeSplit, rightFraction,
- ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
- }
- SplitAtFractionResult middleResult = assertSplitAtFractionBehaviorImpl(
- source, expectedItems, numItemsToBeReadBeforeSplit, middleFraction,
- ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
- // -1 residual items means the split failed; 0 means it succeeded with an empty residual.
- if (middleResult.numResidualItems != -1) {
- stats.successfulFractions.add(middleFraction);
- }
- if (middleResult.numResidualItems > 0) {
- stats.nonTrivialFractions.add(middleFraction);
- }
- // Two split fractions are equivalent if they yield the same number of
- // items in primary vs. residual source. Left and right are already not
- // equivalent. Recurse into [left, middle) and [middle, right) respectively
- // if middle is not equivalent to left or right.
- if (leftResult.numPrimaryItems != middleResult.numPrimaryItems) {
- assertSplitAtFractionBinary(
- source, expectedItems, numItemsToBeReadBeforeSplit,
- leftFraction, leftResult, middleFraction, middleResult, options, stats);
- }
- if (rightResult.numPrimaryItems != middleResult.numPrimaryItems) {
- assertSplitAtFractionBinary(
- source, expectedItems, numItemsToBeReadBeforeSplit,
- middleFraction, middleResult, rightFraction, rightResult, options, stats);
- }
- }
-
- /**
- * Asserts that for each possible start position,
- * {@link BoundedSource.BoundedReader#splitAtFraction} at every interesting fraction (halfway
- * between two fractions that differ by at least one item) can be called successfully and the
- * results are consistent if a split succeeds. Verifies multithreaded splitting as well.
- */
- public static <T> void assertSplitAtFractionExhaustive(
- BoundedSource<T> source, PipelineOptions options) throws Exception {
- List<T> expectedItems = readFromSource(source, options);
- assertFalse("Empty source", expectedItems.isEmpty());
- assertFalse("Source reads a single item", expectedItems.size() == 1);
- List<List<Double>> allNonTrivialFractions = new ArrayList<>();
- {
- boolean anySuccessfulFractions = false;
- boolean anyNonTrivialFractions = false;
- for (int i = 0; i < expectedItems.size(); i++) {
- SplitFractionStatistics stats = new SplitFractionStatistics();
- assertSplitAtFractionBinary(source, expectedItems, i,
- 0.0, null, 1.0, null, options, stats);
- if (!stats.successfulFractions.isEmpty()) {
- anySuccessfulFractions = true;
- }
- if (!stats.nonTrivialFractions.isEmpty()) {
- anyNonTrivialFractions = true;
- }
- allNonTrivialFractions.add(stats.nonTrivialFractions);
- }
- assertTrue(
- "splitAtFraction test completed vacuously: no successful split fractions found",
- anySuccessfulFractions);
- assertTrue(
- "splitAtFraction test completed vacuously: no non-trivial split fractions found",
- anyNonTrivialFractions);
- }
- {
- // Perform a stress test of "racy" concurrent splitting:
- // for every position (number of items read), try to split at the minimum nontrivial
- // split fraction for that position concurrently with reading the record at that position.
- // To ensure that the test is non-vacuous, make sure that the splitting succeeds
- // at least once and fails at least once.
- ExecutorService executor = Executors.newFixedThreadPool(2);
- for (int i = 0; i < expectedItems.size(); i++) {
- double minNonTrivialFraction = 2.0; // Greater than any possible fraction.
- for (double fraction : allNonTrivialFractions.get(i)) {
- minNonTrivialFraction = Math.min(minNonTrivialFraction, fraction);
- }
- if (minNonTrivialFraction == 2.0) {
- // This will not happen all the time because otherwise the test above would
- // detect vacuousness.
- continue;
- }
- boolean haveSuccess = false, haveFailure = false;
- while (!haveSuccess || !haveFailure) {
- if (assertSplitAtFractionConcurrent(
- executor, source, expectedItems, i, minNonTrivialFraction, options)) {
- haveSuccess = true;
- } else {
- haveFailure = true;
- }
- }
- }
- }
- }
-
- private static <T> boolean assertSplitAtFractionConcurrent(
- ExecutorService executor, BoundedSource<T> source, List<T> expectedItems,
- final int numItemsToReadBeforeSplitting, final double fraction, PipelineOptions options)
- throws Exception {
- @SuppressWarnings("resource") // Closed in readerThread
- final BoundedSource.BoundedReader<T> reader = source.createReader(options);
- final CountDownLatch unblockSplitter = new CountDownLatch(1);
- Future<List<T>> readerThread =
- executor.submit(
- new Callable<List<T>>() {
- @Override
- public List<T> call() throws Exception {
- try {
- List<T> items =
- readNItemsFromUnstartedReader(reader, numItemsToReadBeforeSplitting);
- unblockSplitter.countDown();
- items.addAll(readRemainingFromReader(reader, numItemsToReadBeforeSplitting > 0));
- return items;
- } finally {
- reader.close();
- }
- }
- });
- Future<KV<BoundedSource<T>, BoundedSource<T>>> splitterThread = executor.submit(
- new Callable<KV<BoundedSource<T>, BoundedSource<T>>>() {
- @Override
- public KV<BoundedSource<T>, BoundedSource<T>> call() throws Exception {
- unblockSplitter.await();
- BoundedSource<T> residual = reader.splitAtFraction(fraction);
- if (residual == null) {
- return null;
- }
- return KV.of(reader.getCurrentSource(), residual);
- }
- });
- List<T> currentItems = readerThread.get();
- KV<BoundedSource<T>, BoundedSource<T>> splitSources = splitterThread.get();
- if (splitSources == null) {
- return false;
- }
- SplitAtFractionResult res = verifySingleSplitAtFractionResult(
- source, expectedItems, currentItems, splitSources.getKey(), splitSources.getValue(),
- numItemsToReadBeforeSplitting, fraction, options);
- return (res.numResidualItems > 0);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/TestDataflowPipelineOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/TestDataflowPipelineOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/TestDataflowPipelineOptions.java
deleted file mode 100644
index 1afb691..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/TestDataflowPipelineOptions.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.testing;
-
-import com.google.cloud.dataflow.sdk.options.BlockingDataflowPipelineOptions;
-
-/**
- * A set of options used to configure the {@link TestPipeline}.
- */
-public interface TestDataflowPipelineOptions extends BlockingDataflowPipelineOptions {
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/TestDataflowPipelineRunner.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/TestDataflowPipelineRunner.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/TestDataflowPipelineRunner.java
deleted file mode 100644
index 9fff070..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/TestDataflowPipelineRunner.java
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.testing;
-
-import com.google.api.services.dataflow.model.JobMessage;
-import com.google.api.services.dataflow.model.JobMetrics;
-import com.google.api.services.dataflow.model.MetricUpdate;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.PipelineResult.State;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.runners.DataflowJobExecutionException;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineJob;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
-import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.util.MonitoringUtil;
-import com.google.cloud.dataflow.sdk.util.MonitoringUtil.JobMessagesHandler;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.POutput;
-import com.google.common.base.Optional;
-import com.google.common.base.Throwables;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.math.BigDecimal;
-import java.util.List;
-import java.util.concurrent.Callable;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
-
-/**
- * {@link TestDataflowPipelineRunner} is a pipeline runner that wraps a
- * {@link DataflowPipelineRunner} when running tests against the {@link TestPipeline}.
- *
- * @see TestPipeline
- */
-public class TestDataflowPipelineRunner extends PipelineRunner<DataflowPipelineJob> {
- private static final String TENTATIVE_COUNTER = "tentative";
- private static final Logger LOG = LoggerFactory.getLogger(TestDataflowPipelineRunner.class);
-
- private final TestDataflowPipelineOptions options;
- private final DataflowPipelineRunner runner;
- private int expectedNumberOfAssertions = 0;
-
- TestDataflowPipelineRunner(TestDataflowPipelineOptions options) {
- this.options = options;
- this.runner = DataflowPipelineRunner.fromOptions(options);
- }
-
- /**
- * Constructs a runner from the provided options.
- */
- public static TestDataflowPipelineRunner fromOptions(
- PipelineOptions options) {
- TestDataflowPipelineOptions dataflowOptions = options.as(TestDataflowPipelineOptions.class);
-
- return new TestDataflowPipelineRunner(dataflowOptions);
- }
-
- @Override
- public DataflowPipelineJob run(Pipeline pipeline) {
- return run(pipeline, runner);
- }
-
- DataflowPipelineJob run(Pipeline pipeline, DataflowPipelineRunner runner) {
-
- final JobMessagesHandler messageHandler =
- new MonitoringUtil.PrintHandler(options.getJobMessageOutput());
- final DataflowPipelineJob job;
- try {
- job = runner.run(pipeline);
- } catch (DataflowJobExecutionException ex) {
- throw new IllegalStateException("The dataflow failed.");
- }
-
- LOG.info("Running Dataflow job {} with {} expected assertions.",
- job.getJobId(), expectedNumberOfAssertions);
-
- try {
- final Optional<Boolean> result;
- if (options.isStreaming()) {
- Future<Optional<Boolean>> resultFuture = options.getExecutorService().submit(
- new Callable<Optional<Boolean>>() {
- @Override
- public Optional<Boolean> call() throws Exception {
- try {
- for (;;) {
- Optional<Boolean> result = checkForSuccess(job);
- if (result.isPresent()) {
- return result;
- }
- Thread.sleep(10000L);
- }
- } finally {
- LOG.info("Cancelling Dataflow job {}", job.getJobId());
- job.cancel();
- }
- }
- });
- State finalState = job.waitToFinish(10L, TimeUnit.MINUTES, new JobMessagesHandler() {
- @Override
- public void process(List<JobMessage> messages) {
- messageHandler.process(messages);
- for (JobMessage message : messages) {
- if (message.getMessageImportance() != null
- && message.getMessageImportance().equals("JOB_MESSAGE_ERROR")) {
- LOG.info("Dataflow job {} threw exception, cancelling. Exception was: {}",
- job.getJobId(), message.getMessageText());
- try {
- job.cancel();
- } catch (Exception e) {
- throw Throwables.propagate(e);
- }
- }
- }
- }
- });
- if (finalState == null || finalState == State.RUNNING) {
- LOG.info("Dataflow job {} took longer than 10 minutes to complete, cancelling.",
- job.getJobId());
- job.cancel();
- }
- result = resultFuture.get();
- } else {
- job.waitToFinish(-1, TimeUnit.SECONDS, messageHandler);
- result = checkForSuccess(job);
- }
- if (!result.isPresent()) {
- throw new IllegalStateException(
- "The dataflow did not output a success or failure metric.");
- } else if (!result.get()) {
- throw new IllegalStateException("The dataflow failed.");
- }
- } catch (Exception e) {
- Throwables.propagateIfPossible(e);
- throw Throwables.propagate(e);
- }
- return job;
- }
-
- @Override
- public <OutputT extends POutput, InputT extends PInput> OutputT apply(
- PTransform<InputT, OutputT> transform, InputT input) {
- if (transform instanceof DataflowAssert.OneSideInputAssert
- || transform instanceof DataflowAssert.TwoSideInputAssert) {
- expectedNumberOfAssertions += 1;
- }
-
- return runner.apply(transform, input);
- }
-
- Optional<Boolean> checkForSuccess(DataflowPipelineJob job)
- throws IOException {
- State state = job.getState();
- if (state == State.FAILED || state == State.CANCELLED) {
- LOG.info("The pipeline failed");
- return Optional.of(false);
- }
-
- JobMetrics metrics = job.getDataflowClient().projects().jobs()
- .getMetrics(job.getProjectId(), job.getJobId()).execute();
-
- if (metrics == null || metrics.getMetrics() == null) {
- LOG.warn("Metrics not present for Dataflow job {}.", job.getJobId());
- } else {
- int successes = 0;
- int failures = 0;
- for (MetricUpdate metric : metrics.getMetrics()) {
- if (metric.getName() == null || metric.getName().getContext() == null
- || !metric.getName().getContext().containsKey(TENTATIVE_COUNTER)) {
- // Don't double count using the non-tentative version of the metric.
- continue;
- }
- if (DataflowAssert.SUCCESS_COUNTER.equals(metric.getName().getName())) {
- successes += ((BigDecimal) metric.getScalar()).intValue();
- } else if (DataflowAssert.FAILURE_COUNTER.equals(metric.getName().getName())) {
- failures += ((BigDecimal) metric.getScalar()).intValue();
- }
- }
-
- if (failures > 0) {
- LOG.info("Found result while running Dataflow job {}. Found {} success, {} failures out of "
- + "{} expected assertions.", job.getJobId(), successes, failures,
- expectedNumberOfAssertions);
- return Optional.of(false);
- } else if (successes >= expectedNumberOfAssertions) {
- LOG.info("Found result while running Dataflow job {}. Found {} success, {} failures out of "
- + "{} expected assertions.", job.getJobId(), successes, failures,
- expectedNumberOfAssertions);
- return Optional.of(true);
- }
-
- LOG.info("Running Dataflow job {}. Found {} success, {} failures out of {} expected "
- + "assertions.", job.getJobId(), successes, failures, expectedNumberOfAssertions);
- }
-
- return Optional.<Boolean>absent();
- }
-
- @Override
- public String toString() {
- return "TestDataflowPipelineRunner#" + options.getAppName();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/TestPipeline.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/TestPipeline.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/TestPipeline.java
deleted file mode 100644
index a05a778..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/TestPipeline.java
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.testing;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.PipelineResult;
-import com.google.cloud.dataflow.sdk.options.ApplicationNameOptions;
-import com.google.cloud.dataflow.sdk.options.GcpOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions.CheckEnabled;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
-import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
-import com.google.cloud.dataflow.sdk.util.TestCredential;
-import com.google.common.base.Optional;
-import com.google.common.collect.Iterators;
-
-import com.fasterxml.jackson.databind.ObjectMapper;
-
-import java.io.IOException;
-import java.util.Iterator;
-
-import javax.annotation.Nullable;
-
-/**
- * A creator of test pipelines that can be used inside of tests that can be
- * configured to run locally or against the live service.
- *
- * <p>It is recommended to tag hand-selected tests for this purpose using the
- * RunnableOnService Category annotation, as each test run against the service
- * will spin up and tear down a single VM.
- *
- * <p>In order to run tests on the dataflow pipeline service, the following
- * conditions must be met:
- * <ul>
- * <li> runIntegrationTestOnService System property must be set to true.
- * <li> System property "projectName" must be set to your Cloud project.
- * <li> System property "temp_gcs_directory" must be set to a valid GCS bucket.
- * <li> Jars containing the SDK and test classes must be added to the test classpath.
- * </ul>
- *
- * <p>Use {@link DataflowAssert} for tests, as it integrates with this test
- * harness in both direct and remote execution modes. For example:
- *
- * <pre>{@code
- * Pipeline p = TestPipeline.create();
- * PCollection<Integer> output = ...
- *
- * DataflowAssert.that(output)
- * .containsInAnyOrder(1, 2, 3, 4);
- * p.run();
- * }</pre>
- *
- */
-public class TestPipeline extends Pipeline {
- private static final String PROPERTY_DATAFLOW_OPTIONS = "dataflowOptions";
- private static final ObjectMapper MAPPER = new ObjectMapper();
-
- /**
- * Creates and returns a new test pipeline.
- *
- * <p>Use {@link DataflowAssert} to add tests, then call
- * {@link Pipeline#run} to execute the pipeline and check the tests.
- */
- public static TestPipeline create() {
- return fromOptions(testingPipelineOptions());
- }
-
- public static TestPipeline fromOptions(PipelineOptions options) {
- return new TestPipeline(PipelineRunner.fromOptions(options), options);
- }
-
- /**
- * Returns whether a {@link TestPipeline} supports dynamic work rebalancing, and thus tests
- * of dynamic work rebalancing are expected to pass.
- */
- public boolean supportsDynamicWorkRebalancing() {
- return getRunner() instanceof DataflowPipelineRunner;
- }
-
- private TestPipeline(PipelineRunner<? extends PipelineResult> runner, PipelineOptions options) {
- super(runner, options);
- }
-
- /**
- * Runs this {@link TestPipeline}, unwrapping any {@code AssertionError}
- * that is raised during testing.
- */
- @Override
- public PipelineResult run() {
- try {
- return super.run();
- } catch (RuntimeException exc) {
- Throwable cause = exc.getCause();
- if (cause instanceof AssertionError) {
- throw (AssertionError) cause;
- } else {
- throw exc;
- }
- }
- }
-
- @Override
- public String toString() {
- return "TestPipeline#" + getOptions().as(ApplicationNameOptions.class).getAppName();
- }
-
- /**
- * Creates {@link PipelineOptions} for testing.
- */
- public static PipelineOptions testingPipelineOptions() {
- try {
- @Nullable String systemDataflowOptions = System.getProperty(PROPERTY_DATAFLOW_OPTIONS);
- PipelineOptions options =
- systemDataflowOptions == null
- ? PipelineOptionsFactory.create()
- : PipelineOptionsFactory.fromArgs(
- MAPPER.readValue(
- System.getProperty(PROPERTY_DATAFLOW_OPTIONS), String[].class))
- .as(PipelineOptions.class);
-
- options.as(ApplicationNameOptions.class).setAppName(getAppName());
- if (isIntegrationTest()) {
- // TODO: adjust everyone's integration test frameworks to set the runner class via the
- // pipeline options via PROPERTY_DATAFLOW_OPTIONS
- options.setRunner(TestDataflowPipelineRunner.class);
- } else {
- options.as(GcpOptions.class).setGcpCredential(new TestCredential());
- }
- options.setStableUniqueNames(CheckEnabled.ERROR);
- return options;
- } catch (IOException e) {
- throw new RuntimeException("Unable to instantiate test options from system property "
- + PROPERTY_DATAFLOW_OPTIONS + ":" + System.getProperty(PROPERTY_DATAFLOW_OPTIONS), e);
- }
- }
-
- /**
- * Returns whether a {@link TestPipeline} should be treated as an integration test.
- */
- private static boolean isIntegrationTest() {
- return Boolean.parseBoolean(System.getProperty("runIntegrationTestOnService"));
- }
-
- /** Returns the class + method name of the test, or a default name. */
- private static String getAppName() {
- Optional<StackTraceElement> stackTraceElement = findCallersStackTrace();
- if (stackTraceElement.isPresent()) {
- String methodName = stackTraceElement.get().getMethodName();
- String className = stackTraceElement.get().getClassName();
- if (className.contains(".")) {
- className = className.substring(className.lastIndexOf(".") + 1);
- }
- return className + "-" + methodName;
- }
- return "UnitTest";
- }
-
- /** Returns the {@link StackTraceElement} of the calling class. */
- private static Optional<StackTraceElement> findCallersStackTrace() {
- Iterator<StackTraceElement> elements =
- Iterators.forArray(Thread.currentThread().getStackTrace());
- // First find the TestPipeline class in the stack trace.
- while (elements.hasNext()) {
- StackTraceElement next = elements.next();
- if (TestPipeline.class.getName().equals(next.getClassName())) {
- break;
- }
- }
- // Then find the first instance after that is not the TestPipeline
- while (elements.hasNext()) {
- StackTraceElement next = elements.next();
- if (!TestPipeline.class.getName().equals(next.getClassName())) {
- return Optional.of(next);
- }
- }
- return Optional.absent();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/WindowFnTestUtils.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/WindowFnTestUtils.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/WindowFnTestUtils.java
deleted file mode 100644
index dc0baf5..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/WindowFnTestUtils.java
+++ /dev/null
@@ -1,325 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.testing;
-
-import static org.hamcrest.Matchers.greaterThan;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertThat;
-import static org.junit.Assert.assertTrue;
-
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.IntervalWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.OutputTimeFn;
-import com.google.cloud.dataflow.sdk.transforms.windowing.OutputTimeFns;
-import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
-
-import org.joda.time.Instant;
-import org.joda.time.ReadableInstant;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import javax.annotation.Nullable;
-
-/**
- * A utility class for testing {@link WindowFn}s.
- */
-public class WindowFnTestUtils {
-
- /**
- * Creates a Set of elements to be used as expected output in
- * {@link #runWindowFn}.
- */
- public static Set<String> set(long... timestamps) {
- Set<String> result = new HashSet<>();
- for (long timestamp : timestamps) {
- result.add(timestampValue(timestamp));
- }
- return result;
- }
-
- /**
- * Runs the {@link WindowFn} over the provided input, returning a map
- * of windows to the timestamps in those windows.
- */
- public static <T, W extends BoundedWindow> Map<W, Set<String>> runWindowFn(
- WindowFn<T, W> windowFn,
- List<Long> timestamps) throws Exception {
-
- final TestWindowSet<W, String> windowSet = new TestWindowSet<W, String>();
- for (final Long timestamp : timestamps) {
- for (W window : windowFn.assignWindows(
- new TestAssignContext<T, W>(new Instant(timestamp), windowFn))) {
- windowSet.put(window, timestampValue(timestamp));
- }
- windowFn.mergeWindows(new TestMergeContext<T, W>(windowSet, windowFn));
- }
- Map<W, Set<String>> actual = new HashMap<>();
- for (W window : windowSet.windows()) {
- actual.put(window, windowSet.get(window));
- }
- return actual;
- }
-
- public static <T, W extends BoundedWindow> Collection<W> assignedWindows(
- WindowFn<T, W> windowFn, long timestamp) throws Exception {
- return windowFn.assignWindows(new TestAssignContext<T, W>(new Instant(timestamp), windowFn));
- }
-
- private static String timestampValue(long timestamp) {
- return "T" + new Instant(timestamp);
- }
-
- /**
- * Test implementation of AssignContext.
- */
- private static class TestAssignContext<T, W extends BoundedWindow>
- extends WindowFn<T, W>.AssignContext {
- private Instant timestamp;
-
- public TestAssignContext(Instant timestamp, WindowFn<T, W> windowFn) {
- windowFn.super();
- this.timestamp = timestamp;
- }
-
- @Override
- public T element() {
- return null;
- }
-
- @Override
- public Instant timestamp() {
- return timestamp;
- }
-
- @Override
- public Collection<? extends BoundedWindow> windows() {
- return null;
- }
- }
-
- /**
- * Test implementation of MergeContext.
- */
- private static class TestMergeContext<T, W extends BoundedWindow>
- extends WindowFn<T, W>.MergeContext {
- private TestWindowSet<W, ?> windowSet;
-
- public TestMergeContext(
- TestWindowSet<W, ?> windowSet, WindowFn<T, W> windowFn) {
- windowFn.super();
- this.windowSet = windowSet;
- }
-
- @Override
- public Collection<W> windows() {
- return windowSet.windows();
- }
-
- @Override
- public void merge(Collection<W> toBeMerged, W mergeResult) {
- windowSet.merge(toBeMerged, mergeResult);
- }
- }
-
- /**
- * A WindowSet useful for testing WindowFns that simply
- * collects the placed elements into multisets.
- */
- private static class TestWindowSet<W extends BoundedWindow, V> {
-
- private Map<W, Set<V>> elements = new HashMap<>();
-
- public void put(W window, V value) {
- Set<V> all = elements.get(window);
- if (all == null) {
- all = new HashSet<>();
- elements.put(window, all);
- }
- all.add(value);
- }
-
- public void merge(Collection<W> otherWindows, W window) {
- if (otherWindows.isEmpty()) {
- return;
- }
- Set<V> merged = new HashSet<>();
- if (elements.containsKey(window) && !otherWindows.contains(window)) {
- merged.addAll(elements.get(window));
- }
- for (W w : otherWindows) {
- if (!elements.containsKey(w)) {
- throw new IllegalArgumentException("Tried to merge a non-existent window:" + w);
- }
- merged.addAll(elements.get(w));
- elements.remove(w);
- }
- elements.put(window, merged);
- }
-
- public Collection<W> windows() {
- return elements.keySet();
- }
-
- // For testing.
-
- public Set<V> get(W window) {
- return elements.get(window);
- }
- }
-
- /**
- * Assigns the given {@code timestamp} to windows using the specified {@code windowFn}, and
- * verifies that result of {@code windowFn.getOutputTimestamp} for each window is within the
- * proper bound.
- */
- public static <T, W extends BoundedWindow> void validateNonInterferingOutputTimes(
- WindowFn<T, W> windowFn, long timestamp) throws Exception {
- Collection<W> windows = WindowFnTestUtils.<T, W>assignedWindows(windowFn, timestamp);
-
- Instant instant = new Instant(timestamp);
- for (W window : windows) {
- Instant outputTimestamp = windowFn.getOutputTimeFn().assignOutputTime(instant, window);
- assertFalse("getOutputTime must be greater than or equal to input timestamp",
- outputTimestamp.isBefore(instant));
- assertFalse("getOutputTime must be less than or equal to the max timestamp",
- outputTimestamp.isAfter(window.maxTimestamp()));
- }
- }
-
- /**
- * Assigns the given {@code timestamp} to windows using the specified {@code windowFn}, and
- * verifies that result of {@link WindowFn#getOutputTime windowFn.getOutputTime} for later windows
- * (as defined by {@code maxTimestamp} won't prevent the watermark from passing the end of earlier
- * windows.
- *
- * <p>This verifies that overlapping windows don't interfere at all. Depending on the
- * {@code windowFn} this may be stricter than desired.
- */
- public static <T, W extends BoundedWindow> void validateGetOutputTimestamp(
- WindowFn<T, W> windowFn, long timestamp) throws Exception {
- Collection<W> windows = WindowFnTestUtils.<T, W>assignedWindows(windowFn, timestamp);
- List<W> sortedWindows = new ArrayList<>(windows);
- Collections.sort(sortedWindows, new Comparator<BoundedWindow>() {
- @Override
- public int compare(BoundedWindow o1, BoundedWindow o2) {
- return o1.maxTimestamp().compareTo(o2.maxTimestamp());
- }
- });
-
- Instant instant = new Instant(timestamp);
- Instant endOfPrevious = null;
- for (W window : sortedWindows) {
- Instant outputTimestamp = windowFn.getOutputTimeFn().assignOutputTime(instant, window);
- if (endOfPrevious == null) {
- // If this is the first window, the output timestamp can be anything, as long as it is in
- // the valid range.
- assertFalse("getOutputTime must be greater than or equal to input timestamp",
- outputTimestamp.isBefore(instant));
- assertFalse("getOutputTime must be less than or equal to the max timestamp",
- outputTimestamp.isAfter(window.maxTimestamp()));
- } else {
- // If this is a later window, the output timestamp must be after the end of the previous
- // window
- assertTrue("getOutputTime must be greater than the end of the previous window",
- outputTimestamp.isAfter(endOfPrevious));
- assertFalse("getOutputTime must be less than or equal to the max timestamp",
- outputTimestamp.isAfter(window.maxTimestamp()));
- }
- endOfPrevious = window.maxTimestamp();
- }
- }
-
- /**
- * Verifies that later-ending merged windows from any of the timestamps hold up output of
- * earlier-ending windows, using the provided {@link WindowFn} and {@link OutputTimeFn}.
- *
- * <p>Given a list of lists of timestamps, where each list is expected to merge into a single
- * window with end times in ascending order, assigns and merges windows for each list (as though
- * each were a separate key/user session). Then maps each timestamp in the list according to
- * {@link OutputTimeFn#assignOutputTime outputTimeFn.assignOutputTime()} and
- * {@link OutputTimeFn#combine outputTimeFn.combine()}.
- *
- * <p>Verifies that a overlapping windows do not hold each other up via the watermark.
- */
- public static <T, W extends IntervalWindow>
- void validateGetOutputTimestamps(
- WindowFn<T, W> windowFn,
- OutputTimeFn<? super W> outputTimeFn,
- List<List<Long>> timestampsPerWindow) throws Exception {
-
- // Assign windows to each timestamp, then merge them, storing the merged windows in
- // a list in corresponding order to timestampsPerWindow
- final List<W> windows = new ArrayList<>();
- for (List<Long> timestampsForWindow : timestampsPerWindow) {
- final Set<W> windowsToMerge = new HashSet<>();
-
- for (long timestamp : timestampsForWindow) {
- windowsToMerge.addAll(
- WindowFnTestUtils.<T, W>assignedWindows(windowFn, timestamp));
- }
-
- windowFn.mergeWindows(windowFn.new MergeContext() {
- @Override
- public Collection<W> windows() {
- return windowsToMerge;
- }
-
- @Override
- public void merge(Collection<W> toBeMerged, W mergeResult) throws Exception {
- windows.add(mergeResult);
- }
- });
- }
-
- // Map every list of input timestamps to an output timestamp
- final List<Instant> combinedOutputTimestamps = new ArrayList<>();
- for (int i = 0; i < timestampsPerWindow.size(); ++i) {
- List<Long> timestampsForWindow = timestampsPerWindow.get(i);
- W window = windows.get(i);
-
- List<Instant> outputInstants = new ArrayList<>();
- for (long inputTimestamp : timestampsForWindow) {
- outputInstants.add(outputTimeFn.assignOutputTime(new Instant(inputTimestamp), window));
- }
-
- combinedOutputTimestamps.add(OutputTimeFns.combineOutputTimes(outputTimeFn, outputInstants));
- }
-
- // Consider windows in increasing order of max timestamp; ensure the output timestamp is after
- // the max timestamp of the previous
- @Nullable W earlierEndingWindow = null;
- for (int i = 0; i < windows.size(); ++i) {
- W window = windows.get(i);
- ReadableInstant outputTimestamp = combinedOutputTimestamps.get(i);
-
- if (earlierEndingWindow != null) {
- assertThat(outputTimestamp,
- greaterThan((ReadableInstant) earlierEndingWindow.maxTimestamp()));
- }
-
- earlierEndingWindow = window;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/package-info.java
deleted file mode 100644
index d6f075d..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/package-info.java
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/**
- * Defines utilities for unit testing Dataflow pipelines. The tests for the {@code PTransform}s and
- * examples included in the Dataflow SDK provide examples of using these utilities.
- */
-package com.google.cloud.dataflow.sdk.testing;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Aggregator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Aggregator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Aggregator.java
deleted file mode 100644
index 7e56dda..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Aggregator.java
+++ /dev/null
@@ -1,78 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-
-/**
- * An {@code Aggregator<InputT, OutputT>} enables monitoring of values of type {@code InputT},
- * to be combined across all bundles.
- *
- * <p>Aggregators are created by calling {@link DoFn#createAggregator},
- * typically from the {@link DoFn} constructor. Elements can be added to the
- * {@code Aggregator} by calling {@link Aggregator#addValue}.
- *
- * <p>Aggregators are visible in the monitoring UI, when the pipeline is run
- * using DataflowPipelineRunner or BlockingDataflowPipelineRunner, along with
- * their current value. Aggregators may not become visible until the system
- * begins executing the ParDo transform that created them and/or their initial
- * value is changed.
- *
- * <p>Example:
- * <pre> {@code
- * class MyDoFn extends DoFn<String, String> {
- * private Aggregator<Integer, Integer> myAggregator;
- *
- * public MyDoFn() {
- * myAggregator = createAggregator("myAggregator", new Sum.SumIntegerFn());
- * }
- *
- * @Override
- * public void processElement(ProcessContext c) {
- * myAggregator.addValue(1);
- * }
- * }
- * } </pre>
- *
- * @param <InputT> the type of input values
- * @param <OutputT> the type of output values
- */
-public interface Aggregator<InputT, OutputT> {
-
- /**
- * Adds a new value into the Aggregator.
- */
- void addValue(InputT value);
-
- /**
- * Returns the name of the Aggregator.
- */
- String getName();
-
- /**
- * Returns the {@link CombineFn}, which combines input elements in the
- * aggregator.
- */
- CombineFn<InputT, ?, OutputT> getCombineFn();
-
- // TODO: Consider the following additional API conveniences:
- // - In addition to createAggregator(), consider adding getAggregator() to
- // avoid the need to store the aggregator locally in a DoFn, i.e., create
- // if not already present.
- // - Add a shortcut for the most common aggregator:
- // c.createAggregator("name", new Sum.SumIntegerFn()).
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/AggregatorRetriever.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/AggregatorRetriever.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/AggregatorRetriever.java
deleted file mode 100644
index 4bbea85..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/AggregatorRetriever.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import java.util.Collection;
-
-/**
- * An internal class for extracting {@link Aggregator Aggregators} from {@link DoFn DoFns}.
- */
-public final class AggregatorRetriever {
- private AggregatorRetriever() {
- // do not instantiate
- }
-
- /**
- * Returns the {@link Aggregator Aggregators} created by the provided {@link DoFn}.
- */
- public static Collection<Aggregator<?, ?>> getAggregators(DoFn<?, ?> fn) {
- return fn.getAggregators();
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/AppliedPTransform.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/AppliedPTransform.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/AppliedPTransform.java
deleted file mode 100644
index 7b3d87d..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/AppliedPTransform.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.POutput;
-import com.google.common.base.MoreObjects;
-import com.google.common.base.Objects;
-
-/**
- * Represents the application of a {@link PTransform} to a specific input to produce
- * a specific output.
- *
- * <p>For internal use.
- *
- * @param <InputT> transform input type
- * @param <OutputT> transform output type
- * @param <TransformT> transform type
- */
-public class AppliedPTransform
- <InputT extends PInput, OutputT extends POutput,
- TransformT extends PTransform<? super InputT, OutputT>> {
-
- private final String fullName;
- private final InputT input;
- private final OutputT output;
- private final TransformT transform;
-
- private AppliedPTransform(String fullName, InputT input, OutputT output, TransformT transform) {
- this.input = input;
- this.output = output;
- this.transform = transform;
- this.fullName = fullName;
- }
-
- public static <InputT extends PInput, OutputT extends POutput,
- TransformT extends PTransform<? super InputT, OutputT>>
- AppliedPTransform<InputT, OutputT, TransformT> of(
- String fullName, InputT input, OutputT output, TransformT transform) {
- return new AppliedPTransform<InputT, OutputT, TransformT>(fullName, input, output, transform);
- }
-
- public String getFullName() {
- return fullName;
- }
-
- public InputT getInput() {
- return input;
- }
-
- public OutputT getOutput() {
- return output;
- }
-
- public TransformT getTransform() {
- return transform;
- }
-
- @Override
- public int hashCode() {
- return Objects.hashCode(getFullName(), getInput(), getOutput(), getTransform());
- }
-
- @Override
- public boolean equals(Object other) {
- if (other instanceof AppliedPTransform) {
- AppliedPTransform<?, ?, ?> that = (AppliedPTransform<?, ?, ?>) other;
- return Objects.equal(this.getFullName(), that.getFullName())
- && Objects.equal(this.getInput(), that.getInput())
- && Objects.equal(this.getOutput(), that.getOutput())
- && Objects.equal(this.getTransform(), that.getTransform());
- } else {
- return false;
- }
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(getClass())
- .add("fullName", getFullName())
- .add("input", getInput())
- .add("output", getOutput())
- .add("transform", getTransform())
- .toString();
- }
-}
[34/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineRunnerHooks.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineRunnerHooks.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineRunnerHooks.java
deleted file mode 100644
index b9a0293..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineRunnerHooks.java
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import com.google.api.services.dataflow.model.Environment;
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-
-/**
- * An instance of this class can be passed to the
- * {@link DataflowPipelineRunner} to add user defined hooks to be
- * invoked at various times during pipeline execution.
- */
-@Experimental
-public class DataflowPipelineRunnerHooks {
- /**
- * Allows the user to modify the environment of their job before their job is submitted
- * to the service for execution.
- *
- * @param environment The environment of the job. Users can make changes to this instance in order
- * to change the environment with which their job executes on the service.
- */
- public void modifyEnvironmentBeforeSubmission(Environment environment) {}
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineTranslator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineTranslator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineTranslator.java
deleted file mode 100644
index 155c454..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineTranslator.java
+++ /dev/null
@@ -1,1104 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import static com.google.cloud.dataflow.sdk.util.CoderUtils.encodeToByteArray;
-import static com.google.cloud.dataflow.sdk.util.SerializableUtils.serializeToByteArray;
-import static com.google.cloud.dataflow.sdk.util.StringUtils.byteArrayToJsonString;
-import static com.google.cloud.dataflow.sdk.util.StringUtils.jsonStringToByteArray;
-import static com.google.cloud.dataflow.sdk.util.Structs.addBoolean;
-import static com.google.cloud.dataflow.sdk.util.Structs.addDictionary;
-import static com.google.cloud.dataflow.sdk.util.Structs.addList;
-import static com.google.cloud.dataflow.sdk.util.Structs.addLong;
-import static com.google.cloud.dataflow.sdk.util.Structs.addObject;
-import static com.google.cloud.dataflow.sdk.util.Structs.addString;
-import static com.google.cloud.dataflow.sdk.util.Structs.getString;
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.google.api.services.dataflow.model.AutoscalingSettings;
-import com.google.api.services.dataflow.model.DataflowPackage;
-import com.google.api.services.dataflow.model.Disk;
-import com.google.api.services.dataflow.model.Environment;
-import com.google.api.services.dataflow.model.Job;
-import com.google.api.services.dataflow.model.Step;
-import com.google.api.services.dataflow.model.WorkerPool;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.Pipeline.PipelineVisitor;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.IterableCoder;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.io.PubsubIO;
-import com.google.cloud.dataflow.sdk.io.Read;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.StreamingOptions;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner.GroupByKeyAndSortValuesOnly;
-import com.google.cloud.dataflow.sdk.runners.dataflow.BigQueryIOTranslator;
-import com.google.cloud.dataflow.sdk.runners.dataflow.PubsubIOTranslator;
-import com.google.cloud.dataflow.sdk.runners.dataflow.ReadTranslator;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.Combine;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.Flatten;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.View;
-import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
-import com.google.cloud.dataflow.sdk.transforms.windowing.DefaultTrigger;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.util.AppliedCombineFn;
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.cloud.dataflow.sdk.util.DoFnInfo;
-import com.google.cloud.dataflow.sdk.util.OutputReference;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.util.SerializableUtils;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionTuple;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.POutput;
-import com.google.cloud.dataflow.sdk.values.PValue;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.cloud.dataflow.sdk.values.TypedPValue;
-import com.google.common.base.Preconditions;
-import com.google.common.base.Strings;
-import com.google.common.collect.Lists;
-
-import com.fasterxml.jackson.core.JsonProcessingException;
-import com.fasterxml.jackson.databind.ObjectMapper;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-
-import javax.annotation.Nullable;
-
-/**
- * {@link DataflowPipelineTranslator} knows how to translate {@link Pipeline} objects
- * into Cloud Dataflow Service API {@link Job}s.
- */
-@SuppressWarnings({"rawtypes", "unchecked"})
-public class DataflowPipelineTranslator {
- // Must be kept in sync with their internal counterparts.
- private static final Logger LOG = LoggerFactory.getLogger(DataflowPipelineTranslator.class);
- private static final ObjectMapper MAPPER = new ObjectMapper();
-
- /**
- * A map from {@link PTransform} subclass to the corresponding
- * {@link TransformTranslator} to use to translate that transform.
- *
- * <p>A static map that contains system-wide defaults.
- */
- private static Map<Class, TransformTranslator> transformTranslators =
- new HashMap<>();
-
- /** Provided configuration options. */
- private final DataflowPipelineOptions options;
-
- /**
- * Constructs a translator from the provided options.
- *
- * @param options Properties that configure the translator.
- *
- * @return The newly created translator.
- */
- public static DataflowPipelineTranslator fromOptions(
- DataflowPipelineOptions options) {
- return new DataflowPipelineTranslator(options);
- }
-
- private DataflowPipelineTranslator(DataflowPipelineOptions options) {
- this.options = options;
- }
-
- /**
- * Translates a {@link Pipeline} into a {@code JobSpecification}.
- */
- public JobSpecification translate(
- Pipeline pipeline,
- DataflowPipelineRunner runner,
- List<DataflowPackage> packages) {
-
- Translator translator = new Translator(pipeline, runner);
- Job result = translator.translate(packages);
- return new JobSpecification(result, Collections.unmodifiableMap(translator.stepNames));
- }
-
- /**
- * The result of a job translation.
- *
- * <p>Used to pass the result {@link Job} and any state that was used to construct the job that
- * may be of use to other classes (e.g. the {@link PTransform} to StepName mapping).
- */
- public static class JobSpecification {
- private final Job job;
- private final Map<AppliedPTransform<?, ?, ?>, String> stepNames;
-
- public JobSpecification(Job job, Map<AppliedPTransform<?, ?, ?>, String> stepNames) {
- this.job = job;
- this.stepNames = stepNames;
- }
-
- public Job getJob() {
- return job;
- }
-
- /**
- * Returns the mapping of {@link AppliedPTransform AppliedPTransforms} to the internal step
- * name for that {@code AppliedPTransform}.
- */
- public Map<AppliedPTransform<?, ?, ?>, String> getStepNames() {
- return stepNames;
- }
- }
-
- /**
- * Renders a {@link Job} as a string.
- */
- public static String jobToString(Job job) {
- try {
- return MAPPER.writerWithDefaultPrettyPrinter().writeValueAsString(job);
- } catch (JsonProcessingException exc) {
- throw new IllegalStateException("Failed to render Job as String.", exc);
- }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * Records that instances of the specified PTransform class
- * should be translated by default by the corresponding
- * {@link TransformTranslator}.
- */
- public static <TransformT extends PTransform> void registerTransformTranslator(
- Class<TransformT> transformClass,
- TransformTranslator<? extends TransformT> transformTranslator) {
- if (transformTranslators.put(transformClass, transformTranslator) != null) {
- throw new IllegalArgumentException(
- "defining multiple translators for " + transformClass);
- }
- }
-
- /**
- * Returns the {@link TransformTranslator} to use for instances of the
- * specified PTransform class, or null if none registered.
- */
- public <TransformT extends PTransform>
- TransformTranslator<TransformT> getTransformTranslator(Class<TransformT> transformClass) {
- return transformTranslators.get(transformClass);
- }
-
- /**
- * A {@link TransformTranslator} knows how to translate
- * a particular subclass of {@link PTransform} for the
- * Cloud Dataflow service. It does so by
- * mutating the {@link TranslationContext}.
- */
- public interface TransformTranslator<TransformT extends PTransform> {
- public void translate(TransformT transform,
- TranslationContext context);
- }
-
- /**
- * The interface provided to registered callbacks for interacting
- * with the {@link DataflowPipelineRunner}, including reading and writing the
- * values of {@link PCollection}s and side inputs ({@link PCollectionView}s).
- */
- public interface TranslationContext {
- /**
- * Returns the configured pipeline options.
- */
- DataflowPipelineOptions getPipelineOptions();
-
- /**
- * Returns the input of the transform currently being translated.
- */
- <InputT extends PInput> InputT getInput(PTransform<InputT, ?> transform);
-
- /**
- * Returns the output of the transform currently being translated.
- */
- <OutputT extends POutput> OutputT getOutput(PTransform<?, OutputT> transform);
-
- /**
- * Returns the full name of the transform currently being translated.
- */
- String getFullName(PTransform<?, ?> transform);
-
- /**
- * Adds a step to the Dataflow workflow for the given transform, with
- * the given Dataflow step type.
- * This step becomes "current" for the purpose of {@link #addInput} and
- * {@link #addOutput}.
- */
- public void addStep(PTransform<?, ?> transform, String type);
-
- /**
- * Adds a pre-defined step to the Dataflow workflow. The given PTransform should be
- * consistent with the Step, in terms of input, output and coder types.
- *
- * <p>This is a low-level operation, when using this method it is up to
- * the caller to ensure that names do not collide.
- */
- public void addStep(PTransform<?, ? extends PValue> transform, Step step);
-
- /**
- * Sets the encoding for the current Dataflow step.
- */
- public void addEncodingInput(Coder<?> value);
-
- /**
- * Adds an input with the given name and value to the current
- * Dataflow step.
- */
- public void addInput(String name, Boolean value);
-
- /**
- * Adds an input with the given name and value to the current
- * Dataflow step.
- */
- public void addInput(String name, String value);
-
- /**
- * Adds an input with the given name and value to the current
- * Dataflow step.
- */
- public void addInput(String name, Long value);
-
- /**
- * Adds an input with the given name to the previously added Dataflow
- * step, coming from the specified input PValue.
- */
- public void addInput(String name, PInput value);
-
- /**
- * Adds an input that is a dictionary of strings to objects.
- */
- public void addInput(String name, Map<String, Object> elements);
-
- /**
- * Adds an input that is a list of objects.
- */
- public void addInput(String name, List<? extends Map<String, Object>> elements);
-
- /**
- * Adds an output with the given name to the previously added
- * Dataflow step, producing the specified output {@code PValue},
- * including its {@code Coder} if a {@code TypedPValue}. If the
- * {@code PValue} is a {@code PCollection}, wraps its coder inside
- * a {@code WindowedValueCoder}.
- */
- public void addOutput(String name, PValue value);
-
- /**
- * Adds an output with the given name to the previously added
- * Dataflow step, producing the specified output {@code PValue},
- * including its {@code Coder} if a {@code TypedPValue}. If the
- * {@code PValue} is a {@code PCollection}, wraps its coder inside
- * a {@code ValueOnlyCoder}.
- */
- public void addValueOnlyOutput(String name, PValue value);
-
- /**
- * Adds an output with the given name to the previously added
- * CollectionToSingleton Dataflow step, consuming the specified
- * input {@code PValue} and producing the specified output
- * {@code PValue}. This step requires special treatment for its
- * output encoding.
- */
- public void addCollectionToSingletonOutput(String name,
- PValue inputValue,
- PValue outputValue);
-
- /**
- * Encode a PValue reference as an output reference.
- */
- public OutputReference asOutputReference(PValue value);
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * Translates a Pipeline into the Dataflow representation.
- */
- class Translator implements PipelineVisitor, TranslationContext {
- /** The Pipeline to translate. */
- private final Pipeline pipeline;
-
- /** The runner which will execute the pipeline. */
- private final DataflowPipelineRunner runner;
-
- /** The Cloud Dataflow Job representation. */
- private final Job job = new Job();
-
- /**
- * Translator is stateful, as addProperty calls refer to the current step.
- */
- private Step currentStep;
-
- /**
- * A Map from AppliedPTransform to their unique Dataflow step names.
- */
- private final Map<AppliedPTransform<?, ?, ?>, String> stepNames = new HashMap<>();
-
- /**
- * A Map from PValues to their output names used by their producer
- * Dataflow steps.
- */
- private final Map<POutput, String> outputNames = new HashMap<>();
-
- /**
- * A Map from PValues to the Coders used for them.
- */
- private final Map<POutput, Coder<?>> outputCoders = new HashMap<>();
-
- /**
- * The transform currently being applied.
- */
- private AppliedPTransform<?, ?, ?> currentTransform;
-
- /**
- * Constructs a Translator that will translate the specified
- * Pipeline into Dataflow objects.
- */
- public Translator(Pipeline pipeline, DataflowPipelineRunner runner) {
- this.pipeline = pipeline;
- this.runner = runner;
- }
-
- /**
- * Translates this Translator's pipeline into a Cloud Dataflow {@link Job}.
- * @return a Job definition filled in with the type of job, the environment,
- * and the job steps.
- */
- public Job translate(List<DataflowPackage> packages) {
- job.setName(options.getJobName().toLowerCase());
-
- Environment environment = new Environment();
- job.setEnvironment(environment);
-
- try {
- environment.setSdkPipelineOptions(
- MAPPER.readValue(MAPPER.writeValueAsBytes(options), Map.class));
- } catch (IOException e) {
- throw new IllegalArgumentException(
- "PipelineOptions specified failed to serialize to JSON.", e);
- }
-
- WorkerPool workerPool = new WorkerPool();
-
- if (options.getTeardownPolicy() != null) {
- workerPool.setTeardownPolicy(options.getTeardownPolicy().getTeardownPolicyName());
- }
-
- if (options.isStreaming()) {
- job.setType("JOB_TYPE_STREAMING");
- } else {
- job.setType("JOB_TYPE_BATCH");
- workerPool.setDiskType(options.getWorkerDiskType());
- }
-
- if (options.getWorkerMachineType() != null) {
- workerPool.setMachineType(options.getWorkerMachineType());
- }
-
- workerPool.setPackages(packages);
- workerPool.setNumWorkers(options.getNumWorkers());
-
- if (options.isStreaming()) {
- // Use separate data disk for streaming.
- Disk disk = new Disk();
- disk.setDiskType(options.getWorkerDiskType());
- workerPool.setDataDisks(Collections.singletonList(disk));
- }
- if (!Strings.isNullOrEmpty(options.getZone())) {
- workerPool.setZone(options.getZone());
- }
- if (!Strings.isNullOrEmpty(options.getNetwork())) {
- workerPool.setNetwork(options.getNetwork());
- }
- if (!Strings.isNullOrEmpty(options.getSubnetwork())) {
- workerPool.setSubnetwork(options.getSubnetwork());
- }
- if (options.getDiskSizeGb() > 0) {
- workerPool.setDiskSizeGb(options.getDiskSizeGb());
- }
- AutoscalingSettings settings = new AutoscalingSettings();
- if (options.getAutoscalingAlgorithm() != null) {
- settings.setAlgorithm(options.getAutoscalingAlgorithm().getAlgorithm());
- }
- settings.setMaxNumWorkers(options.getMaxNumWorkers());
- workerPool.setAutoscalingSettings(settings);
-
- List<WorkerPool> workerPools = new LinkedList<>();
-
- workerPools.add(workerPool);
- environment.setWorkerPools(workerPools);
-
- pipeline.traverseTopologically(this);
- return job;
- }
-
- @Override
- public DataflowPipelineOptions getPipelineOptions() {
- return options;
- }
-
- @Override
- public <InputT extends PInput> InputT getInput(PTransform<InputT, ?> transform) {
- return (InputT) getCurrentTransform(transform).getInput();
- }
-
- @Override
- public <OutputT extends POutput> OutputT getOutput(PTransform<?, OutputT> transform) {
- return (OutputT) getCurrentTransform(transform).getOutput();
- }
-
- @Override
- public String getFullName(PTransform<?, ?> transform) {
- return getCurrentTransform(transform).getFullName();
- }
-
- private AppliedPTransform<?, ?, ?> getCurrentTransform(PTransform<?, ?> transform) {
- checkArgument(
- currentTransform != null && currentTransform.getTransform() == transform,
- "can only be called with current transform");
- return currentTransform;
- }
-
- @Override
- public void enterCompositeTransform(TransformTreeNode node) {
- }
-
- @Override
- public void leaveCompositeTransform(TransformTreeNode node) {
- }
-
- @Override
- public void visitTransform(TransformTreeNode node) {
- PTransform<?, ?> transform = node.getTransform();
- TransformTranslator translator =
- getTransformTranslator(transform.getClass());
- if (translator == null) {
- throw new IllegalStateException(
- "no translator registered for " + transform);
- }
- LOG.debug("Translating {}", transform);
- currentTransform = AppliedPTransform.of(
- node.getFullName(), node.getInput(), node.getOutput(), (PTransform) transform);
- translator.translate(transform, this);
- currentTransform = null;
- }
-
- @Override
- public void visitValue(PValue value, TransformTreeNode producer) {
- LOG.debug("Checking translation of {}", value);
- if (value.getProducingTransformInternal() == null) {
- throw new RuntimeException(
- "internal error: expecting a PValue "
- + "to have a producingTransform");
- }
- if (!producer.isCompositeNode()) {
- // Primitive transforms are the only ones assigned step names.
- asOutputReference(value);
- }
- }
-
- @Override
- public void addStep(PTransform<?, ?> transform, String type) {
- String stepName = genStepName();
- if (stepNames.put(getCurrentTransform(transform), stepName) != null) {
- throw new IllegalArgumentException(
- transform + " already has a name specified");
- }
- // Start the next "steps" list item.
- List<Step> steps = job.getSteps();
- if (steps == null) {
- steps = new LinkedList<>();
- job.setSteps(steps);
- }
-
- currentStep = new Step();
- currentStep.setName(stepName);
- currentStep.setKind(type);
- steps.add(currentStep);
- addInput(PropertyNames.USER_NAME, getFullName(transform));
- addDisplayData(PropertyNames.DISPLAY_DATA, DisplayData.from(transform));
- }
-
- @Override
- public void addStep(PTransform<?, ? extends PValue> transform, Step original) {
- Step step = original.clone();
- String stepName = step.getName();
- if (stepNames.put(getCurrentTransform(transform), stepName) != null) {
- throw new IllegalArgumentException(transform + " already has a name specified");
- }
-
- Map<String, Object> properties = step.getProperties();
- if (properties != null) {
- @Nullable List<Map<String, Object>> outputInfoList = null;
- try {
- // TODO: This should be done via a Structs accessor.
- @Nullable List<Map<String, Object>> list =
- (List<Map<String, Object>>) properties.get(PropertyNames.OUTPUT_INFO);
- outputInfoList = list;
- } catch (Exception e) {
- throw new RuntimeException("Inconsistent dataflow pipeline translation", e);
- }
- if (outputInfoList != null && outputInfoList.size() > 0) {
- Map<String, Object> firstOutputPort = outputInfoList.get(0);
- @Nullable String name;
- try {
- name = getString(firstOutputPort, PropertyNames.OUTPUT_NAME);
- } catch (Exception e) {
- name = null;
- }
- if (name != null) {
- registerOutputName(getOutput(transform), name);
- }
- }
- }
-
- List<Step> steps = job.getSteps();
- if (steps == null) {
- steps = new LinkedList<>();
- job.setSteps(steps);
- }
- currentStep = step;
- steps.add(step);
- }
-
- @Override
- public void addEncodingInput(Coder<?> coder) {
- CloudObject encoding = SerializableUtils.ensureSerializable(coder);
- addObject(getProperties(), PropertyNames.ENCODING, encoding);
- }
-
- @Override
- public void addInput(String name, Boolean value) {
- addBoolean(getProperties(), name, value);
- }
-
- @Override
- public void addInput(String name, String value) {
- addString(getProperties(), name, value);
- }
-
- @Override
- public void addInput(String name, Long value) {
- addLong(getProperties(), name, value);
- }
-
- @Override
- public void addInput(String name, Map<String, Object> elements) {
- addDictionary(getProperties(), name, elements);
- }
-
- @Override
- public void addInput(String name, List<? extends Map<String, Object>> elements) {
- addList(getProperties(), name, elements);
- }
-
- @Override
- public void addInput(String name, PInput value) {
- if (value instanceof PValue) {
- addInput(name, asOutputReference((PValue) value));
- } else {
- throw new IllegalStateException("Input must be a PValue");
- }
- }
-
- @Override
- public void addOutput(String name, PValue value) {
- Coder<?> coder;
- if (value instanceof TypedPValue) {
- coder = ((TypedPValue<?>) value).getCoder();
- if (value instanceof PCollection) {
- // Wrap the PCollection element Coder inside a WindowedValueCoder.
- coder = WindowedValue.getFullCoder(
- coder,
- ((PCollection<?>) value).getWindowingStrategy().getWindowFn().windowCoder());
- }
- } else {
- // No output coder to encode.
- coder = null;
- }
- addOutput(name, value, coder);
- }
-
- @Override
- public void addValueOnlyOutput(String name, PValue value) {
- Coder<?> coder;
- if (value instanceof TypedPValue) {
- coder = ((TypedPValue<?>) value).getCoder();
- if (value instanceof PCollection) {
- // Wrap the PCollection element Coder inside a ValueOnly
- // WindowedValueCoder.
- coder = WindowedValue.getValueOnlyCoder(coder);
- }
- } else {
- // No output coder to encode.
- coder = null;
- }
- addOutput(name, value, coder);
- }
-
- @Override
- public void addCollectionToSingletonOutput(String name,
- PValue inputValue,
- PValue outputValue) {
- Coder<?> inputValueCoder =
- Preconditions.checkNotNull(outputCoders.get(inputValue));
- // The inputValueCoder for the input PCollection should be some
- // WindowedValueCoder of the input PCollection's element
- // coder.
- Preconditions.checkState(
- inputValueCoder instanceof WindowedValue.WindowedValueCoder);
- // The outputValueCoder for the output should be an
- // IterableCoder of the inputValueCoder. This is a property
- // of the backend "CollectionToSingleton" step.
- Coder<?> outputValueCoder = IterableCoder.of(inputValueCoder);
- addOutput(name, outputValue, outputValueCoder);
- }
-
- /**
- * Adds an output with the given name to the previously added
- * Dataflow step, producing the specified output {@code PValue}
- * with the given {@code Coder} (if not {@code null}).
- */
- private void addOutput(String name, PValue value, Coder<?> valueCoder) {
- registerOutputName(value, name);
-
- Map<String, Object> properties = getProperties();
- @Nullable List<Map<String, Object>> outputInfoList = null;
- try {
- // TODO: This should be done via a Structs accessor.
- outputInfoList = (List<Map<String, Object>>) properties.get(PropertyNames.OUTPUT_INFO);
- } catch (Exception e) {
- throw new RuntimeException("Inconsistent dataflow pipeline translation", e);
- }
- if (outputInfoList == null) {
- outputInfoList = new ArrayList<>();
- // TODO: This should be done via a Structs accessor.
- properties.put(PropertyNames.OUTPUT_INFO, outputInfoList);
- }
-
- Map<String, Object> outputInfo = new HashMap<>();
- addString(outputInfo, PropertyNames.OUTPUT_NAME, name);
- addString(outputInfo, PropertyNames.USER_NAME, value.getName());
- if (value instanceof PCollection
- && runner.doesPCollectionRequireIndexedFormat((PCollection<?>) value)) {
- addBoolean(outputInfo, PropertyNames.USE_INDEXED_FORMAT, true);
- }
- if (valueCoder != null) {
- // Verify that encoding can be decoded, in order to catch serialization
- // failures as early as possible.
- CloudObject encoding = SerializableUtils.ensureSerializable(valueCoder);
- addObject(outputInfo, PropertyNames.ENCODING, encoding);
- outputCoders.put(value, valueCoder);
- }
-
- outputInfoList.add(outputInfo);
- }
-
- private void addDisplayData(String name, DisplayData displayData) {
- List<Map<String, Object>> serializedItems = Lists.newArrayList();
- for (DisplayData.Item item : displayData.items()) {
- serializedItems.add(MAPPER.convertValue(item, Map.class));
- }
-
- addList(getProperties(), name, serializedItems);
- }
-
- @Override
- public OutputReference asOutputReference(PValue value) {
- AppliedPTransform<?, ?, ?> transform =
- value.getProducingTransformInternal();
- String stepName = stepNames.get(transform);
- if (stepName == null) {
- throw new IllegalArgumentException(transform + " doesn't have a name specified");
- }
-
- String outputName = outputNames.get(value);
- if (outputName == null) {
- throw new IllegalArgumentException(
- "output " + value + " doesn't have a name specified");
- }
-
- return new OutputReference(stepName, outputName);
- }
-
- private Map<String, Object> getProperties() {
- Map<String, Object> properties = currentStep.getProperties();
- if (properties == null) {
- properties = new HashMap<>();
- currentStep.setProperties(properties);
- }
- return properties;
- }
-
- /**
- * Returns a fresh Dataflow step name.
- */
- private String genStepName() {
- return "s" + (stepNames.size() + 1);
- }
-
- /**
- * Records the name of the given output PValue,
- * within its producing transform.
- */
- private void registerOutputName(POutput value, String name) {
- if (outputNames.put(value, name) != null) {
- throw new IllegalArgumentException(
- "output " + value + " already has a name specified");
- }
- }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- @Override
- public String toString() {
- return "DataflowPipelineTranslator#" + hashCode();
- }
-
-
- ///////////////////////////////////////////////////////////////////////////
-
- static {
- registerTransformTranslator(
- View.CreatePCollectionView.class,
- new TransformTranslator<View.CreatePCollectionView>() {
- @Override
- public void translate(
- View.CreatePCollectionView transform,
- TranslationContext context) {
- translateTyped(transform, context);
- }
-
- private <ElemT, ViewT> void translateTyped(
- View.CreatePCollectionView<ElemT, ViewT> transform,
- TranslationContext context) {
- context.addStep(transform, "CollectionToSingleton");
- context.addInput(PropertyNames.PARALLEL_INPUT, context.getInput(transform));
- context.addCollectionToSingletonOutput(
- PropertyNames.OUTPUT,
- context.getInput(transform),
- context.getOutput(transform));
- }
- });
-
- DataflowPipelineTranslator.registerTransformTranslator(
- Combine.GroupedValues.class,
- new DataflowPipelineTranslator.TransformTranslator<Combine.GroupedValues>() {
- @Override
- public void translate(
- Combine.GroupedValues transform,
- DataflowPipelineTranslator.TranslationContext context) {
- translateHelper(transform, context);
- }
-
- private <K, InputT, OutputT> void translateHelper(
- final Combine.GroupedValues<K, InputT, OutputT> transform,
- DataflowPipelineTranslator.TranslationContext context) {
- context.addStep(transform, "CombineValues");
- translateInputs(context.getInput(transform), transform.getSideInputs(), context);
-
- AppliedCombineFn<? super K, ? super InputT, ?, OutputT> fn =
- transform.getAppliedFn(
- context.getInput(transform).getPipeline().getCoderRegistry(),
- context.getInput(transform).getCoder(),
- context.getInput(transform).getWindowingStrategy());
-
- context.addEncodingInput(fn.getAccumulatorCoder());
- context.addInput(
- PropertyNames.SERIALIZED_FN,
- byteArrayToJsonString(serializeToByteArray(fn)));
- context.addOutput(PropertyNames.OUTPUT, context.getOutput(transform));
- }
- });
-
- registerTransformTranslator(
- Create.Values.class,
- new TransformTranslator<Create.Values>() {
- @Override
- public void translate(
- Create.Values transform,
- TranslationContext context) {
- createHelper(transform, context);
- }
-
- private <T> void createHelper(
- Create.Values<T> transform,
- TranslationContext context) {
- context.addStep(transform, "CreateCollection");
-
- Coder<T> coder = context.getOutput(transform).getCoder();
- List<CloudObject> elements = new LinkedList<>();
- for (T elem : transform.getElements()) {
- byte[] encodedBytes;
- try {
- encodedBytes = encodeToByteArray(coder, elem);
- } catch (CoderException exn) {
- // TODO: Put in better element printing:
- // truncate if too long.
- throw new IllegalArgumentException(
- "Unable to encode element '" + elem + "' of transform '" + transform
- + "' using coder '" + coder + "'.",
- exn);
- }
- String encodedJson = byteArrayToJsonString(encodedBytes);
- assert Arrays.equals(encodedBytes,
- jsonStringToByteArray(encodedJson));
- elements.add(CloudObject.forString(encodedJson));
- }
- context.addInput(PropertyNames.ELEMENT, elements);
- context.addValueOnlyOutput(PropertyNames.OUTPUT, context.getOutput(transform));
- }
- });
-
- registerTransformTranslator(
- Flatten.FlattenPCollectionList.class,
- new TransformTranslator<Flatten.FlattenPCollectionList>() {
- @Override
- public void translate(
- Flatten.FlattenPCollectionList transform,
- TranslationContext context) {
- flattenHelper(transform, context);
- }
-
- private <T> void flattenHelper(
- Flatten.FlattenPCollectionList<T> transform,
- TranslationContext context) {
- context.addStep(transform, "Flatten");
-
- List<OutputReference> inputs = new LinkedList<>();
- for (PCollection<T> input : context.getInput(transform).getAll()) {
- inputs.add(context.asOutputReference(input));
- }
- context.addInput(PropertyNames.INPUTS, inputs);
- context.addOutput(PropertyNames.OUTPUT, context.getOutput(transform));
- }
- });
-
- registerTransformTranslator(
- GroupByKeyAndSortValuesOnly.class,
- new TransformTranslator<GroupByKeyAndSortValuesOnly>() {
- @Override
- public void translate(
- GroupByKeyAndSortValuesOnly transform,
- TranslationContext context) {
- groupByKeyAndSortValuesHelper(transform, context);
- }
-
- private <K1, K2, V> void groupByKeyAndSortValuesHelper(
- GroupByKeyAndSortValuesOnly<K1, K2, V> transform,
- TranslationContext context) {
- context.addStep(transform, "GroupByKey");
- context.addInput(PropertyNames.PARALLEL_INPUT, context.getInput(transform));
- context.addOutput(PropertyNames.OUTPUT, context.getOutput(transform));
- context.addInput(PropertyNames.SORT_VALUES, true);
-
- // TODO: Add support for combiner lifting once the need arises.
- context.addInput(
- PropertyNames.DISALLOW_COMBINER_LIFTING, true);
- }
- });
-
- registerTransformTranslator(
- GroupByKey.class,
- new TransformTranslator<GroupByKey>() {
- @Override
- public void translate(
- GroupByKey transform,
- TranslationContext context) {
- groupByKeyHelper(transform, context);
- }
-
- private <K, V> void groupByKeyHelper(
- GroupByKey<K, V> transform,
- TranslationContext context) {
- context.addStep(transform, "GroupByKey");
- context.addInput(PropertyNames.PARALLEL_INPUT, context.getInput(transform));
- context.addOutput(PropertyNames.OUTPUT, context.getOutput(transform));
-
- WindowingStrategy<?, ?> windowingStrategy =
- context.getInput(transform).getWindowingStrategy();
- boolean isStreaming =
- context.getPipelineOptions().as(StreamingOptions.class).isStreaming();
- boolean disallowCombinerLifting =
- !windowingStrategy.getWindowFn().isNonMerging()
- || (isStreaming && !transform.fewKeys())
- // TODO: Allow combiner lifting on the non-default trigger, as appropriate.
- || !(windowingStrategy.getTrigger().getSpec() instanceof DefaultTrigger);
- context.addInput(
- PropertyNames.DISALLOW_COMBINER_LIFTING, disallowCombinerLifting);
- context.addInput(
- PropertyNames.SERIALIZED_FN,
- byteArrayToJsonString(serializeToByteArray(windowingStrategy)));
- context.addInput(
- PropertyNames.IS_MERGING_WINDOW_FN,
- !windowingStrategy.getWindowFn().isNonMerging());
- }
- });
-
- registerTransformTranslator(
- ParDo.BoundMulti.class,
- new TransformTranslator<ParDo.BoundMulti>() {
- @Override
- public void translate(
- ParDo.BoundMulti transform,
- TranslationContext context) {
- translateMultiHelper(transform, context);
- }
-
- private <InputT, OutputT> void translateMultiHelper(
- ParDo.BoundMulti<InputT, OutputT> transform,
- TranslationContext context) {
- context.addStep(transform, "ParallelDo");
- translateInputs(context.getInput(transform), transform.getSideInputs(), context);
- translateFn(transform.getFn(), context.getInput(transform).getWindowingStrategy(),
- transform.getSideInputs(), context.getInput(transform).getCoder(), context);
- translateOutputs(context.getOutput(transform), context);
- }
- });
-
- registerTransformTranslator(
- ParDo.Bound.class,
- new TransformTranslator<ParDo.Bound>() {
- @Override
- public void translate(
- ParDo.Bound transform,
- TranslationContext context) {
- translateSingleHelper(transform, context);
- }
-
- private <InputT, OutputT> void translateSingleHelper(
- ParDo.Bound<InputT, OutputT> transform,
- TranslationContext context) {
- context.addStep(transform, "ParallelDo");
- translateInputs(context.getInput(transform), transform.getSideInputs(), context);
- translateFn(
- transform.getFn(),
- context.getInput(transform).getWindowingStrategy(),
- transform.getSideInputs(), context.getInput(transform).getCoder(), context);
- context.addOutput(PropertyNames.OUTPUT, context.getOutput(transform));
- }
- });
-
- registerTransformTranslator(
- Window.Bound.class,
- new DataflowPipelineTranslator.TransformTranslator<Window.Bound>() {
- @Override
- public void translate(
- Window.Bound transform, TranslationContext context) {
- translateHelper(transform, context);
- }
-
- private <T> void translateHelper(
- Window.Bound<T> transform, TranslationContext context) {
- context.addStep(transform, "Bucket");
- context.addInput(PropertyNames.PARALLEL_INPUT, context.getInput(transform));
- context.addOutput(PropertyNames.OUTPUT, context.getOutput(transform));
-
- WindowingStrategy<?, ?> strategy = context.getOutput(transform).getWindowingStrategy();
- byte[] serializedBytes = serializeToByteArray(strategy);
- String serializedJson = byteArrayToJsonString(serializedBytes);
- assert Arrays.equals(serializedBytes,
- jsonStringToByteArray(serializedJson));
- context.addInput(PropertyNames.SERIALIZED_FN, serializedJson);
- }
- });
-
- ///////////////////////////////////////////////////////////////////////////
- // IO Translation.
-
- registerTransformTranslator(
- BigQueryIO.Read.Bound.class, new BigQueryIOTranslator.ReadTranslator());
- registerTransformTranslator(
- BigQueryIO.Write.Bound.class, new BigQueryIOTranslator.WriteTranslator());
-
- registerTransformTranslator(
- PubsubIO.Read.Bound.class, new PubsubIOTranslator.ReadTranslator());
- registerTransformTranslator(
- DataflowPipelineRunner.StreamingPubsubIOWrite.class,
- new PubsubIOTranslator.WriteTranslator());
-
- registerTransformTranslator(Read.Bounded.class, new ReadTranslator());
- }
-
- private static void translateInputs(
- PCollection<?> input,
- List<PCollectionView<?>> sideInputs,
- TranslationContext context) {
- context.addInput(PropertyNames.PARALLEL_INPUT, input);
- translateSideInputs(sideInputs, context);
- }
-
- // Used for ParDo
- private static void translateSideInputs(
- List<PCollectionView<?>> sideInputs,
- TranslationContext context) {
- Map<String, Object> nonParInputs = new HashMap<>();
-
- for (PCollectionView<?> view : sideInputs) {
- nonParInputs.put(
- view.getTagInternal().getId(),
- context.asOutputReference(view));
- }
-
- context.addInput(PropertyNames.NON_PARALLEL_INPUTS, nonParInputs);
- }
-
- private static void translateFn(
- DoFn fn,
- WindowingStrategy windowingStrategy,
- Iterable<PCollectionView<?>> sideInputs,
- Coder inputCoder,
- TranslationContext context) {
- context.addInput(PropertyNames.USER_FN, fn.getClass().getName());
- context.addInput(
- PropertyNames.SERIALIZED_FN,
- byteArrayToJsonString(serializeToByteArray(
- new DoFnInfo(fn, windowingStrategy, sideInputs, inputCoder))));
- }
-
- private static void translateOutputs(
- PCollectionTuple outputs,
- TranslationContext context) {
- for (Map.Entry<TupleTag<?>, PCollection<?>> entry
- : outputs.getAll().entrySet()) {
- TupleTag<?> tag = entry.getKey();
- PCollection<?> output = entry.getValue();
- context.addOutput(tag.getId(), output);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowServiceException.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowServiceException.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowServiceException.java
deleted file mode 100644
index 6e8301b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowServiceException.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import javax.annotation.Nullable;
-
-/**
- * Signals there was an error retrieving information about a job from the Cloud Dataflow Service.
- */
-public class DataflowServiceException extends DataflowJobException {
- DataflowServiceException(DataflowPipelineJob job, String message) {
- this(job, message, null);
- }
-
- DataflowServiceException(DataflowPipelineJob job, String message, @Nullable Throwable cause) {
- super(job, message, cause);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DirectPipeline.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DirectPipeline.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DirectPipeline.java
deleted file mode 100644
index 5217a90..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DirectPipeline.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.options.DirectPipelineOptions;
-
-/**
- * A {@link DirectPipeline} is a {@link Pipeline} that returns
- * {@link DirectPipelineRunner.EvaluationResults} when it is
- * {@link com.google.cloud.dataflow.sdk.Pipeline#run()}.
- */
-public class DirectPipeline extends Pipeline {
-
- /**
- * Creates and returns a new DirectPipeline instance for tests.
- */
- public static DirectPipeline createForTest() {
- DirectPipelineRunner runner = DirectPipelineRunner.createForTest();
- return new DirectPipeline(runner, runner.getPipelineOptions());
- }
-
- private DirectPipeline(DirectPipelineRunner runner, DirectPipelineOptions options) {
- super(runner, options);
- }
-
- @Override
- public DirectPipelineRunner.EvaluationResults run() {
- return (DirectPipelineRunner.EvaluationResults) super.run();
- }
-
- @Override
- public DirectPipelineRunner getRunner() {
- return (DirectPipelineRunner) super.getRunner();
- }
-
- @Override
- public String toString() {
- return "DirectPipeline#" + hashCode();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DirectPipelineRegistrar.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DirectPipelineRegistrar.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DirectPipelineRegistrar.java
deleted file mode 100644
index f2dd40c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DirectPipelineRegistrar.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import com.google.auto.service.AutoService;
-import com.google.cloud.dataflow.sdk.options.DirectPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsRegistrar;
-import com.google.common.collect.ImmutableList;
-
-/**
- * Contains the {@link PipelineOptionsRegistrar} and {@link PipelineRunnerRegistrar} for
- * the {@link DirectPipeline}.
- */
-public class DirectPipelineRegistrar {
- private DirectPipelineRegistrar() { }
-
- /**
- * Register the {@link DirectPipelineRunner}.
- */
- @AutoService(PipelineRunnerRegistrar.class)
- public static class Runner implements PipelineRunnerRegistrar {
- @Override
- public Iterable<Class<? extends PipelineRunner<?>>> getPipelineRunners() {
- return ImmutableList.<Class<? extends PipelineRunner<?>>>of(DirectPipelineRunner.class);
- }
- }
-
- /**
- * Register the {@link DirectPipelineOptions}.
- */
- @AutoService(PipelineOptionsRegistrar.class)
- public static class Options implements PipelineOptionsRegistrar {
- @Override
- public Iterable<Class<? extends PipelineOptions>> getPipelineOptions() {
- return ImmutableList.<Class<? extends PipelineOptions>>of(DirectPipelineOptions.class);
- }
- }
-}
[60/67] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
Directory reorganization
Move Java examples from "examples/" into "examples/java".
Project: http://git-wip-us.apache.org/repos/asf/incubator-beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-beam/commit/2eaa709c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-beam/tree/2eaa709c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-beam/diff/2eaa709c
Branch: refs/heads/master
Commit: 2eaa709c7c2bb7e101a684025c357b25141fcf1f
Parents: 75cfa4a
Author: Davor Bonaci <da...@google.com>
Authored: Wed Mar 23 17:05:40 2016 -0700
Committer: Davor Bonaci <da...@google.com>
Committed: Wed Mar 23 18:33:27 2016 -0700
----------------------------------------------------------------------
examples/README.md | 95 ----
examples/java/README.md | 95 ++++
examples/java/pom.xml | 394 +++++++++++++
.../dataflow/examples/DebuggingWordCount.java | 182 ++++++
.../dataflow/examples/MinimalWordCount.java | 117 ++++
.../dataflow/examples/WindowedWordCount.java | 269 +++++++++
.../cloud/dataflow/examples/WordCount.java | 206 +++++++
.../examples/common/DataflowExampleOptions.java | 34 ++
.../examples/common/DataflowExampleUtils.java | 485 ++++++++++++++++
.../common/ExampleBigQueryTableOptions.java | 53 ++
...xamplePubsubTopicAndSubscriptionOptions.java | 44 ++
.../common/ExamplePubsubTopicOptions.java | 44 ++
.../examples/common/PubsubFileInjector.java | 153 +++++
.../examples/complete/AutoComplete.java | 516 +++++++++++++++++
.../cloud/dataflow/examples/complete/README.md | 44 ++
.../examples/complete/StreamingWordExtract.java | 163 ++++++
.../cloud/dataflow/examples/complete/TfIdf.java | 431 ++++++++++++++
.../examples/complete/TopWikipediaSessions.java | 223 ++++++++
.../examples/complete/TrafficMaxLaneFlow.java | 425 ++++++++++++++
.../examples/complete/TrafficRoutes.java | 459 +++++++++++++++
.../examples/cookbook/BigQueryTornadoes.java | 179 ++++++
.../cookbook/CombinePerKeyExamples.java | 223 ++++++++
.../examples/cookbook/DatastoreWordCount.java | 269 +++++++++
.../examples/cookbook/DeDupExample.java | 100 ++++
.../examples/cookbook/FilterExamples.java | 266 +++++++++
.../examples/cookbook/JoinExamples.java | 185 ++++++
.../examples/cookbook/MaxPerKeyExamples.java | 173 ++++++
.../cloud/dataflow/examples/cookbook/README.md | 55 ++
.../examples/cookbook/TriggerExample.java | 564 +++++++++++++++++++
.../examples/DebuggingWordCountTest.java | 45 ++
.../cloud/dataflow/examples/WordCountTest.java | 85 +++
.../examples/complete/AutoCompleteTest.java | 181 ++++++
.../dataflow/examples/complete/TfIdfTest.java | 67 +++
.../complete/TopWikipediaSessionsTest.java | 62 ++
.../cookbook/BigQueryTornadoesTest.java | 80 +++
.../cookbook/CombinePerKeyExamplesTest.java | 90 +++
.../examples/cookbook/DeDupExampleTest.java | 83 +++
.../examples/cookbook/FilterExamplesTest.java | 85 +++
.../examples/cookbook/JoinExamplesTest.java | 114 ++++
.../cookbook/MaxPerKeyExamplesTest.java | 85 +++
.../examples/cookbook/TriggerExampleTest.java | 139 +++++
examples/pom.xml | 394 -------------
.../dataflow/examples/DebuggingWordCount.java | 182 ------
.../dataflow/examples/MinimalWordCount.java | 117 ----
.../dataflow/examples/WindowedWordCount.java | 269 ---------
.../cloud/dataflow/examples/WordCount.java | 206 -------
.../examples/common/DataflowExampleOptions.java | 34 --
.../examples/common/DataflowExampleUtils.java | 485 ----------------
.../common/ExampleBigQueryTableOptions.java | 53 --
...xamplePubsubTopicAndSubscriptionOptions.java | 44 --
.../common/ExamplePubsubTopicOptions.java | 44 --
.../examples/common/PubsubFileInjector.java | 153 -----
.../examples/complete/AutoComplete.java | 516 -----------------
.../cloud/dataflow/examples/complete/README.md | 44 --
.../examples/complete/StreamingWordExtract.java | 163 ------
.../cloud/dataflow/examples/complete/TfIdf.java | 431 --------------
.../examples/complete/TopWikipediaSessions.java | 223 --------
.../examples/complete/TrafficMaxLaneFlow.java | 425 --------------
.../examples/complete/TrafficRoutes.java | 459 ---------------
.../examples/cookbook/BigQueryTornadoes.java | 179 ------
.../cookbook/CombinePerKeyExamples.java | 223 --------
.../examples/cookbook/DatastoreWordCount.java | 269 ---------
.../examples/cookbook/DeDupExample.java | 100 ----
.../examples/cookbook/FilterExamples.java | 266 ---------
.../examples/cookbook/JoinExamples.java | 185 ------
.../examples/cookbook/MaxPerKeyExamples.java | 173 ------
.../cloud/dataflow/examples/cookbook/README.md | 55 --
.../examples/cookbook/TriggerExample.java | 564 -------------------
.../examples/DebuggingWordCountTest.java | 45 --
.../cloud/dataflow/examples/WordCountTest.java | 85 ---
.../examples/complete/AutoCompleteTest.java | 181 ------
.../dataflow/examples/complete/TfIdfTest.java | 67 ---
.../complete/TopWikipediaSessionsTest.java | 62 --
.../cookbook/BigQueryTornadoesTest.java | 80 ---
.../cookbook/CombinePerKeyExamplesTest.java | 90 ---
.../examples/cookbook/DeDupExampleTest.java | 83 ---
.../examples/cookbook/FilterExamplesTest.java | 85 ---
.../examples/cookbook/JoinExamplesTest.java | 114 ----
.../cookbook/MaxPerKeyExamplesTest.java | 85 ---
.../examples/cookbook/TriggerExampleTest.java | 139 -----
pom.xml | 2 +-
travis/test_wordcount.sh | 4 +-
82 files changed, 7470 insertions(+), 7470 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/README.md
----------------------------------------------------------------------
diff --git a/examples/README.md b/examples/README.md
deleted file mode 100644
index cbcd01f..0000000
--- a/examples/README.md
+++ /dev/null
@@ -1,95 +0,0 @@
-# Example Pipelines
-
-The examples included in this module serve to demonstrate the basic
-functionality of Google Cloud Dataflow, and act as starting points for
-the development of more complex pipelines.
-
-## Word Count
-
-A good starting point for new users is our set of
-[word count](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples) examples, which computes word frequencies. This series of four successively more detailed pipelines is described in detail in the accompanying [walkthrough](https://cloud.google.com/dataflow/examples/wordcount-example).
-
-1. [`MinimalWordCount`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCount.java) is the simplest word count pipeline and introduces basic concepts like [Pipelines](https://cloud.google.com/dataflow/model/pipelines),
-[PCollections](https://cloud.google.com/dataflow/model/pcollection),
-[ParDo](https://cloud.google.com/dataflow/model/par-do),
-and [reading and writing data](https://cloud.google.com/dataflow/model/reading-and-writing-data) from external storage.
-
-1. [`WordCount`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/WordCount.java) introduces Dataflow best practices like [PipelineOptions](https://cloud.google.com/dataflow/pipelines/constructing-your-pipeline#Creating) and custom [PTransforms](https://cloud.google.com/dataflow/model/composite-transforms).
-
-1. [`DebuggingWordCount`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/DebuggingWordCount.java)
-shows how to view live aggregators in the [Dataflow Monitoring Interface](https://cloud.google.com/dataflow/pipelines/dataflow-monitoring-intf), get the most out of
-[Cloud Logging](https://cloud.google.com/dataflow/pipelines/logging) integration, and start writing
-[good tests](https://cloud.google.com/dataflow/pipelines/testing-your-pipeline).
-
-1. [`WindowedWordCount`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/WindowedWordCount.java) shows how to run the same pipeline over either unbounded PCollections in streaming mode or bounded PCollections in batch mode.
-
-## Building and Running
-
-The examples in this repository can be built and executed from the root directory by running:
-
- mvn compile exec:java -pl examples \
- -Dexec.mainClass=<MAIN CLASS> \
- -Dexec.args="<EXAMPLE-SPECIFIC ARGUMENTS>"
-
-For example, you can execute the `WordCount` pipeline on your local machine as follows:
-
- mvn compile exec:java -pl examples \
- -Dexec.mainClass=com.google.cloud.dataflow.examples.WordCount \
- -Dexec.args="--inputFile=<LOCAL INPUT FILE> --output=<LOCAL OUTPUT FILE>"
-
-Once you have followed the general Cloud Dataflow
-[Getting Started](https://cloud.google.com/dataflow/getting-started) instructions, you can execute
-the same pipeline on fully managed resources in Google Cloud Platform:
-
- mvn compile exec:java -pl examples \
- -Dexec.mainClass=com.google.cloud.dataflow.examples.WordCount \
- -Dexec.args="--project=<YOUR CLOUD PLATFORM PROJECT ID> \
- --stagingLocation=<YOUR CLOUD STORAGE LOCATION> \
- --runner=BlockingDataflowPipelineRunner"
-
-Make sure to use your project id, not the project number or the descriptive name.
-The Cloud Storage location should be entered in the form of
-`gs://bucket/path/to/staging/directory`.
-
-Alternatively, you may choose to bundle all dependencies into a single JAR and
-execute it outside of the Maven environment. For example, you can execute the
-following commands to create the
-bundled JAR of the examples and execute it both locally and in Cloud
-Platform:
-
- mvn package
-
- java -cp examples/target/google-cloud-dataflow-java-examples-all-bundled-<VERSION>.jar \
- com.google.cloud.dataflow.examples.WordCount \
- --inputFile=<INPUT FILE PATTERN> --output=<OUTPUT FILE>
-
- java -cp examples/target/google-cloud-dataflow-java-examples-all-bundled-<VERSION>.jar \
- com.google.cloud.dataflow.examples.WordCount \
- --project=<YOUR CLOUD PLATFORM PROJECT ID> \
- --stagingLocation=<YOUR CLOUD STORAGE LOCATION> \
- --runner=BlockingDataflowPipelineRunner
-
-Other examples can be run similarly by replacing the `WordCount` class path with the example classpath, e.g.
-`com.google.cloud.dataflow.examples.cookbook.BigQueryTornadoes`,
-and adjusting runtime options under the `Dexec.args` parameter, as specified in
-the example itself.
-
-Note that when running Maven on Microsoft Windows platform, backslashes (`\`)
-under the `Dexec.args` parameter should be escaped with another backslash. For
-example, input file pattern of `c:\*.txt` should be entered as `c:\\*.txt`.
-
-## Beyond Word Count
-
-After you've finished running your first few word count pipelines, take a look at the [`cookbook`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook)
-directory for some common and useful patterns like joining, filtering, and combining.
-
-The [`complete`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/complete)
-directory contains a few realistic end-to-end pipelines.
-
-See the
-[Java 8](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/tree/master/examples/src/main/java8/com/google/cloud/dataflow/examples)
-examples as well. This directory includes a Java 8 version of the
-MinimalWordCount example, as well as series of examples in a simple 'mobile
-gaming' domain. This series introduces some advanced concepts and provides
-additional examples of using Java 8 syntax. Other than usage of Java 8 lambda
-expressions, the concepts that are used apply equally well in Java 7.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/README.md
----------------------------------------------------------------------
diff --git a/examples/java/README.md b/examples/java/README.md
new file mode 100644
index 0000000..cbcd01f
--- /dev/null
+++ b/examples/java/README.md
@@ -0,0 +1,95 @@
+# Example Pipelines
+
+The examples included in this module serve to demonstrate the basic
+functionality of Google Cloud Dataflow, and act as starting points for
+the development of more complex pipelines.
+
+## Word Count
+
+A good starting point for new users is our set of
+[word count](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples) examples, which computes word frequencies. This series of four successively more detailed pipelines is described in detail in the accompanying [walkthrough](https://cloud.google.com/dataflow/examples/wordcount-example).
+
+1. [`MinimalWordCount`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCount.java) is the simplest word count pipeline and introduces basic concepts like [Pipelines](https://cloud.google.com/dataflow/model/pipelines),
+[PCollections](https://cloud.google.com/dataflow/model/pcollection),
+[ParDo](https://cloud.google.com/dataflow/model/par-do),
+and [reading and writing data](https://cloud.google.com/dataflow/model/reading-and-writing-data) from external storage.
+
+1. [`WordCount`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/WordCount.java) introduces Dataflow best practices like [PipelineOptions](https://cloud.google.com/dataflow/pipelines/constructing-your-pipeline#Creating) and custom [PTransforms](https://cloud.google.com/dataflow/model/composite-transforms).
+
+1. [`DebuggingWordCount`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/DebuggingWordCount.java)
+shows how to view live aggregators in the [Dataflow Monitoring Interface](https://cloud.google.com/dataflow/pipelines/dataflow-monitoring-intf), get the most out of
+[Cloud Logging](https://cloud.google.com/dataflow/pipelines/logging) integration, and start writing
+[good tests](https://cloud.google.com/dataflow/pipelines/testing-your-pipeline).
+
+1. [`WindowedWordCount`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/WindowedWordCount.java) shows how to run the same pipeline over either unbounded PCollections in streaming mode or bounded PCollections in batch mode.
+
+## Building and Running
+
+The examples in this repository can be built and executed from the root directory by running:
+
+ mvn compile exec:java -pl examples \
+ -Dexec.mainClass=<MAIN CLASS> \
+ -Dexec.args="<EXAMPLE-SPECIFIC ARGUMENTS>"
+
+For example, you can execute the `WordCount` pipeline on your local machine as follows:
+
+ mvn compile exec:java -pl examples \
+ -Dexec.mainClass=com.google.cloud.dataflow.examples.WordCount \
+ -Dexec.args="--inputFile=<LOCAL INPUT FILE> --output=<LOCAL OUTPUT FILE>"
+
+Once you have followed the general Cloud Dataflow
+[Getting Started](https://cloud.google.com/dataflow/getting-started) instructions, you can execute
+the same pipeline on fully managed resources in Google Cloud Platform:
+
+ mvn compile exec:java -pl examples \
+ -Dexec.mainClass=com.google.cloud.dataflow.examples.WordCount \
+ -Dexec.args="--project=<YOUR CLOUD PLATFORM PROJECT ID> \
+ --stagingLocation=<YOUR CLOUD STORAGE LOCATION> \
+ --runner=BlockingDataflowPipelineRunner"
+
+Make sure to use your project id, not the project number or the descriptive name.
+The Cloud Storage location should be entered in the form of
+`gs://bucket/path/to/staging/directory`.
+
+Alternatively, you may choose to bundle all dependencies into a single JAR and
+execute it outside of the Maven environment. For example, you can execute the
+following commands to create the
+bundled JAR of the examples and execute it both locally and in Cloud
+Platform:
+
+ mvn package
+
+ java -cp examples/target/google-cloud-dataflow-java-examples-all-bundled-<VERSION>.jar \
+ com.google.cloud.dataflow.examples.WordCount \
+ --inputFile=<INPUT FILE PATTERN> --output=<OUTPUT FILE>
+
+ java -cp examples/target/google-cloud-dataflow-java-examples-all-bundled-<VERSION>.jar \
+ com.google.cloud.dataflow.examples.WordCount \
+ --project=<YOUR CLOUD PLATFORM PROJECT ID> \
+ --stagingLocation=<YOUR CLOUD STORAGE LOCATION> \
+ --runner=BlockingDataflowPipelineRunner
+
+Other examples can be run similarly by replacing the `WordCount` class path with the example classpath, e.g.
+`com.google.cloud.dataflow.examples.cookbook.BigQueryTornadoes`,
+and adjusting runtime options under the `Dexec.args` parameter, as specified in
+the example itself.
+
+Note that when running Maven on the Microsoft Windows platform, backslashes (`\`)
+under the `Dexec.args` parameter should be escaped with another backslash. For
+example, an input file pattern of `c:\*.txt` should be entered as `c:\\*.txt`.
+
+## Beyond Word Count
+
+After you've finished running your first few word count pipelines, take a look at the [`cookbook`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook)
+directory for some common and useful patterns like joining, filtering, and combining.
+
+The [`complete`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/complete)
+directory contains a few realistic end-to-end pipelines.
+
+See the
+[Java 8](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/tree/master/examples/src/main/java8/com/google/cloud/dataflow/examples)
+examples as well. This directory includes a Java 8 version of the
+MinimalWordCount example, as well as a series of examples in a simple 'mobile
+gaming' domain. This series introduces some advanced concepts and provides
+additional examples of using Java 8 syntax. Other than usage of Java 8 lambda
+expressions, the concepts that are used apply equally well in Java 7.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/pom.xml
----------------------------------------------------------------------
diff --git a/examples/java/pom.xml b/examples/java/pom.xml
new file mode 100644
index 0000000..b762c84
--- /dev/null
+++ b/examples/java/pom.xml
@@ -0,0 +1,394 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.beam</groupId>
+ <artifactId>parent</artifactId>
+ <version>0.1.0-incubating-SNAPSHOT</version>
+ <relativePath>../../pom.xml</relativePath>
+ </parent>
+
+ <artifactId>java-examples-all</artifactId>
+ <name>Apache Beam :: Examples :: Java All</name>
+ <description>Apache Beam SDK provides a simple, Java-based
+ interface for processing virtually any size data. This
+ artifact includes all Apache Beam Java SDK examples.</description>
+
+ <packaging>jar</packaging>
+
+ <profiles>
+ <profile>
+ <id>DataflowPipelineTests</id>
+ <properties>
+ <runIntegrationTestOnService>true</runIntegrationTestOnService>
+ <testGroups>com.google.cloud.dataflow.sdk.testing.RunnableOnService</testGroups>
+ <testParallelValue>both</testParallelValue>
+ </properties>
+ </profile>
+ </profiles>
+
+ <build>
+ <plugins>
+ <plugin>
+ <artifactId>maven-compiler-plugin</artifactId>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-checkstyle-plugin</artifactId>
+ <version>2.12</version>
+ <dependencies>
+ <dependency>
+ <groupId>com.puppycrawl.tools</groupId>
+ <artifactId>checkstyle</artifactId>
+ <version>6.6</version>
+ </dependency>
+ </dependencies>
+ <configuration>
+ <configLocation>../../checkstyle.xml</configLocation>
+ <consoleOutput>true</consoleOutput>
+ <failOnViolation>true</failOnViolation>
+ <includeTestSourceDirectory>true</includeTestSourceDirectory>
+ <includeResources>false</includeResources>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>check</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+
+ <!-- Source plugin for generating source and test-source JARs. -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-source-plugin</artifactId>
+ <version>2.4</version>
+ <executions>
+ <execution>
+ <id>attach-sources</id>
+ <phase>compile</phase>
+ <goals>
+ <goal>jar</goal>
+ </goals>
+ </execution>
+ <execution>
+ <id>attach-test-sources</id>
+ <phase>test-compile</phase>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-javadoc-plugin</artifactId>
+ <configuration>
+ <windowtitle>Apache Beam Examples</windowtitle>
+ <doctitle>Apache Beam Examples</doctitle>
+
+ <subpackages>com.google.cloud.dataflow.examples</subpackages>
+ <additionalparam>-exclude com.google.cloud.dataflow.sdk.runners.worker:com.google.cloud.dataflow.sdk.runners.dataflow:com.google.cloud.dataflow.sdk.util ${dataflow.javadoc_opts}</additionalparam>
+ <use>false</use>
+ <quiet>true</quiet>
+ <bottom><![CDATA[<br>]]></bottom>
+
+ <offlineLinks>
+ <!-- The Dataflow SDK docs -->
+ <offlineLink>
+ <url>https://cloud.google.com/dataflow/java-sdk/JavaDoc/</url>
+ <location>${basedir}/../../sdks/java/javadoc/dataflow-sdk-docs</location>
+ </offlineLink>
+ <!-- Other dependencies -->
+ <offlineLink>
+ <url>https://developers.google.com/api-client-library/java/google-api-java-client/reference/1.20.0/</url>
+ <location>${basedir}/../../sdks/java/javadoc/apiclient-docs</location>
+ </offlineLink>
+ <offlineLink>
+ <url>http://avro.apache.org/docs/1.7.7/api/java/</url>
+ <location>${basedir}/../../sdks/java/javadoc/avro-docs</location>
+ </offlineLink>
+ <offlineLink>
+ <url>https://developers.google.com/resources/api-libraries/documentation/bigquery/v2/java/latest/</url>
+ <location>${basedir}/../../sdks/java/javadoc/bq-docs</location>
+ </offlineLink>
+ <offlineLink>
+ <url>https://cloud.google.com/datastore/docs/apis/javadoc/</url>
+ <location>${basedir}/../../sdks/java/javadoc/datastore-docs</location>
+ </offlineLink>
+ <offlineLink>
+ <url>http://docs.guava-libraries.googlecode.com/git-history/release18/javadoc/</url>
+ <location>${basedir}/../../sdks/java/javadoc/guava-docs</location>
+ </offlineLink>
+ <offlineLink>
+ <url>http://fasterxml.github.io/jackson-annotations/javadoc/2.7/</url>
+ <location>${basedir}/../../sdks/java/javadoc/jackson-annotations-docs</location>
+ </offlineLink>
+ <offlineLink>
+ <url>http://fasterxml.github.io/jackson-databind/javadoc/2.7/</url>
+ <location>${basedir}/../../sdks/java/javadoc/jackson-databind-docs</location>
+ </offlineLink>
+ <offlineLink>
+ <url>http://www.joda.org/joda-time/apidocs</url>
+ <location>${basedir}/../../sdks/java/javadoc/joda-docs</location>
+ </offlineLink>
+ <offlineLink>
+ <url>https://developers.google.com/api-client-library/java/google-oauth-java-client/reference/1.20.0/</url>
+ <location>${basedir}/../../sdks/java/javadoc/oauth-docs</location>
+ </offlineLink>
+ </offlineLinks>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>jar</goal>
+ </goals>
+ <phase>package</phase>
+ </execution>
+ </executions>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-shade-plugin</artifactId>
+ <version>2.4.1</version>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>shade</goal>
+ </goals>
+ <configuration>
+ <finalName>${project.artifactId}-bundled-${project.version}</finalName>
+ <artifactSet>
+ <includes>
+ <include>*:*</include>
+ </includes>
+ </artifactSet>
+ <filters>
+ <filter>
+ <artifact>*:*</artifact>
+ <excludes>
+ <exclude>META-INF/*.SF</exclude>
+ <exclude>META-INF/*.DSA</exclude>
+ <exclude>META-INF/*.RSA</exclude>
+ </excludes>
+ </filter>
+ </filters>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>default-jar</id>
+ <goals>
+ <goal>jar</goal>
+ </goals>
+ </execution>
+ <execution>
+ <id>default-test-jar</id>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+
+ <!-- Coverage analysis for unit tests. -->
+ <plugin>
+ <groupId>org.jacoco</groupId>
+ <artifactId>jacoco-maven-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.beam</groupId>
+ <artifactId>java-sdk-all</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.api-client</groupId>
+ <artifactId>google-api-client</artifactId>
+ <version>${google-clients.version}</version>
+ <exclusions>
+ <!-- Exclude an old version of guava that is being pulled
+ in by a transitive dependency of google-api-client -->
+ <exclusion>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava-jdk5</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.apis</groupId>
+ <artifactId>google-api-services-dataflow</artifactId>
+ <version>${dataflow.version}</version>
+ <exclusions>
+ <!-- Exclude an old version of guava that is being pulled
+ in by a transitive dependency of google-api-client -->
+ <exclusion>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava-jdk5</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.apis</groupId>
+ <artifactId>google-api-services-bigquery</artifactId>
+ <version>${bigquery.version}</version>
+ <exclusions>
+ <!-- Exclude an old version of guava that is being pulled
+ in by a transitive dependency of google-api-client -->
+ <exclusion>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava-jdk5</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.http-client</groupId>
+ <artifactId>google-http-client</artifactId>
+ <version>${google-clients.version}</version>
+ <exclusions>
+ <!-- Exclude an old version of guava that is being pulled
+ in by a transitive dependency of google-api-client -->
+ <exclusion>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava-jdk5</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.avro</groupId>
+ <artifactId>avro</artifactId>
+ <version>${avro.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.apis</groupId>
+ <artifactId>google-api-services-datastore-protobuf</artifactId>
+ <version>${datastore.version}</version>
+ <exclusions>
+ <!-- Exclude an old version of guava that is being pulled
+ in by a transitive dependency of google-api-client -->
+ <exclusion>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava-jdk5</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.apis</groupId>
+ <artifactId>google-api-services-pubsub</artifactId>
+ <version>${pubsub.version}</version>
+ <exclusions>
+ <!-- Exclude an old version of guava that is being pulled
+ in by a transitive dependency of google-api-client -->
+ <exclusion>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava-jdk5</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ <version>${guava.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.code.findbugs</groupId>
+ <artifactId>jsr305</artifactId>
+ <version>${jsr305.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>joda-time</groupId>
+ <artifactId>joda-time</artifactId>
+ <version>${joda.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-jdk14</artifactId>
+ <version>${slf4j.version}</version>
+ <scope>runtime</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>javax.servlet</groupId>
+ <artifactId>javax.servlet-api</artifactId>
+ <version>3.1.0</version>
+ </dependency>
+
+ <!-- Hamcrest and JUnit are required dependencies of DataflowAssert,
+ which is used in the main code of DebuggingWordCount example. -->
+
+ <dependency>
+ <groupId>org.hamcrest</groupId>
+ <artifactId>hamcrest-all</artifactId>
+ <version>${hamcrest.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>${junit.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.mockito</groupId>
+ <artifactId>mockito-all</artifactId>
+ <version>1.10.19</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/DebuggingWordCount.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/DebuggingWordCount.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/DebuggingWordCount.java
new file mode 100644
index 0000000..8823dbc
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/DebuggingWordCount.java
@@ -0,0 +1,182 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples;
+
+import com.google.cloud.dataflow.examples.WordCount.WordCountOptions;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.transforms.Aggregator;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.Sum;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Pattern;
+
+
+/**
+ * An example that verifies word counts in Shakespeare and includes Dataflow best practices.
+ *
+ * <p>This class, {@link DebuggingWordCount}, is the third in a series of four successively more
+ * detailed 'word count' examples. You may first want to take a look at {@link MinimalWordCount}
+ * and {@link WordCount}. After you've looked at this example, then see the
+ * {@link WindowedWordCount} pipeline, for introduction of additional concepts.
+ *
+ * <p>Basic concepts, also in the MinimalWordCount and WordCount examples:
+ * Reading text files; counting a PCollection; executing a Pipeline both locally
+ * and using the Dataflow service; defining DoFns.
+ *
+ * <p>New Concepts:
+ * <pre>
+ * 1. Logging to Cloud Logging
+ * 2. Controlling Dataflow worker log levels
+ * 3. Creating a custom aggregator
+ * 4. Testing your Pipeline via DataflowAssert
+ * </pre>
+ *
+ * <p>To execute this pipeline locally, specify general pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * }
+ * </pre>
+ *
+ * <p>To execute this pipeline using the Dataflow service and the additional logging discussed
+ * below, specify pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=BlockingDataflowPipelineRunner
+ * --workerLogLevelOverrides={"com.google.cloud.dataflow.examples":"DEBUG"}
+ * }
+ * </pre>
+ *
+ * <p>Note that when you run via <code>mvn exec</code>, you may need to escape
+ * the quotations as appropriate for your shell. For example, in <code>bash</code>:
+ * <pre>
+ * mvn compile exec:java ... \
+ * -Dexec.args="... \
+ * --workerLogLevelOverrides={\\\"com.google.cloud.dataflow.examples\\\":\\\"DEBUG\\\"}"
+ * </pre>
+ *
+ * <p>Concept #2: Dataflow workers which execute user code are configured to log to Cloud
+ * Logging by default at "INFO" log level and higher. One may override log levels for specific
+ * logging namespaces by specifying:
+ * <pre><code>
+ * --workerLogLevelOverrides={"Name1":"Level1","Name2":"Level2",...}
+ * </code></pre>
+ * For example, by specifying:
+ * <pre><code>
+ * --workerLogLevelOverrides={"com.google.cloud.dataflow.examples":"DEBUG"}
+ * </code></pre>
+ * when executing this pipeline using the Dataflow service, Cloud Logging would contain only
+ * "DEBUG" or higher level logs for the {@code com.google.cloud.dataflow.examples} package in
+ * addition to the default "INFO" or higher level logs. In addition, the default Dataflow worker
+ * logging configuration can be overridden by specifying
+ * {@code --defaultWorkerLogLevel=<one of TRACE, DEBUG, INFO, WARN, ERROR>}. For example,
+ * by specifying {@code --defaultWorkerLogLevel=DEBUG} when executing this pipeline with
+ * the Dataflow service, Cloud Logging would contain all "DEBUG" or higher level logs. Note
+ * that changing the default worker log level to TRACE or DEBUG will significantly increase
+ * the amount of logs output.
+ *
+ * <p>The input file defaults to {@code gs://dataflow-samples/shakespeare/kinglear.txt} and can be
+ * overridden with {@code --inputFile}.
+ */
+public class DebuggingWordCount {
+ /**
+ * A DoFn that filters for a specific key based upon a regular expression.
+ *
+ * <p>Elements whose key matches the pattern in full are output unchanged; all other
+ * elements are dropped. Each outcome is counted by its own aggregator.
+ */
+ public static class FilterTextFn extends DoFn<KV<String, Long>, KV<String, Long>> {
+ /**
+ * Concept #1: The logger below uses the fully qualified class name of FilterTextFn
+ * as the logger. All log statements emitted by this logger will be referenced by this name
+ * and will be visible in the Cloud Logging UI. Learn more at https://cloud.google.com/logging
+ * about the Cloud Logging UI.
+ */
+ private static final Logger LOG = LoggerFactory.getLogger(FilterTextFn.class);
+
+ private final Pattern filter;
+
+ /**
+ * Creates a filter for the given regular expression.
+ *
+ * @param pattern the regular expression that an element's key must match in full
+ */
+ public FilterTextFn(String pattern) {
+ filter = Pattern.compile(pattern);
+ }
+
+ /**
+ * Concept #3: A custom aggregator can track values in your pipeline as it runs. Those
+ * values will be displayed in the Dataflow Monitoring UI when this pipeline is run using the
+ * Dataflow service. These aggregators below track the number of matched and unmatched words.
+ * Learn more at https://cloud.google.com/dataflow/pipelines/dataflow-monitoring-intf about
+ * the Dataflow Monitoring UI.
+ */
+ private final Aggregator<Long, Long> matchedWords =
+ createAggregator("matchedWords", new Sum.SumLongFn());
+ // The aggregator name mirrors the field name so both counters are easy to find side by side.
+ private final Aggregator<Long, Long> unmatchedWords =
+ createAggregator("unmatchedWords", new Sum.SumLongFn());
+
+ /**
+ * Outputs the current element if its key matches the filter, and counts the element as
+ * matched or unmatched.
+ *
+ * @param c the context providing the current element and the output receiver
+ */
+ @Override
+ public void processElement(ProcessContext c) {
+ if (filter.matcher(c.element().getKey()).matches()) {
+ // Log at the "DEBUG" level each element that we match. When executing this pipeline
+ // using the Dataflow service, these log lines will appear in the Cloud Logging UI
+ // only if the log level is set to "DEBUG" or lower.
+ LOG.debug("Matched: {}", c.element().getKey());
+ matchedWords.addValue(1L);
+ c.output(c.element());
+ } else {
+ // Log at the "TRACE" level each element that is not matched. Different log levels
+ // can be used to control the verbosity of logging providing an effective mechanism
+ // to filter less important information.
+ LOG.trace("Did not match: {}", c.element().getKey());
+ unmatchedWords.addValue(1L);
+ }
+ }
+ }
+
+ /**
+ * Builds and runs the pipeline: reads lines from the configured input file, counts words,
+ * keeps only the words "Flourish" and "stomach", and asserts their expected counts.
+ *
+ * @param args command-line arguments, parsed and validated as {@link WordCountOptions}
+ */
+ public static void main(String[] args) {
+ WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
+ .as(WordCountOptions.class);
+ Pipeline p = Pipeline.create(options);
+
+ PCollection<KV<String, Long>> filteredWords =
+ p.apply(TextIO.Read.named("ReadLines").from(options.getInputFile()))
+ .apply(new WordCount.CountWords())
+ .apply(ParDo.of(new FilterTextFn("Flourish|stomach")));
+
+ /**
+ * Concept #4: DataflowAssert is a set of convenient PTransforms in the style of
+ * Hamcrest's collection matchers that can be used when writing Pipeline level tests
+ * to validate the contents of PCollections. DataflowAssert is best used in unit tests
+ * with small data sets but is demonstrated here as a teaching tool.
+ *
+ * <p>Below we verify that the set of filtered words matches our expected counts. Note
+ * that DataflowAssert does not provide any output and that successful completion of the
+ * Pipeline implies that the expectations were met. Learn more at
+ * https://cloud.google.com/dataflow/pipelines/testing-your-pipeline on how to test
+ * your Pipeline and see {@link DebuggingWordCountTest} for an example unit test.
+ */
+ List<KV<String, Long>> expectedResults = Arrays.asList(
+ KV.of("Flourish", 3L),
+ KV.of("stomach", 1L));
+ DataflowAssert.that(filteredWords).containsInAnyOrder(expectedResults);
+
+ p.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCount.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCount.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCount.java
new file mode 100644
index 0000000..4ed0520
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCount.java
@@ -0,0 +1,117 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples;
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.runners.BlockingDataflowPipelineRunner;
+import com.google.cloud.dataflow.sdk.transforms.Count;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.MapElements;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.SimpleFunction;
+import com.google.cloud.dataflow.sdk.values.KV;
+
+
+/**
+ * An example that counts words in Shakespeare.
+ *
+ * <p>This class, {@link MinimalWordCount}, is the first in a series of four successively more
+ * detailed 'word count' examples. Here, for simplicity, we don't show any error-checking or
+ * argument processing, and focus on construction of the pipeline, which chains together the
+ * application of core transforms.
+ *
+ * <p>Next, see the {@link WordCount} pipeline, then the {@link DebuggingWordCount}, and finally
+ * the {@link WindowedWordCount} pipeline, for more detailed examples that introduce additional
+ * concepts.
+ *
+ * <p>Concepts:
+ * <pre>
+ * 1. Reading data from text files
+ * 2. Specifying 'inline' transforms
+ * 3. Counting a PCollection
+ * 4. Writing data to Cloud Storage as text files
+ * </pre>
+ *
+ * <p>To execute this pipeline, first edit the code to set your project ID, the staging
+ * location, and the output location. The specified GCS bucket(s) must already exist.
+ *
+ * <p>Then, run the pipeline as described in the README. It will be deployed and run using the
+ * Dataflow service. No args are required to run the pipeline. You can see the results in your
+ * output bucket in the GCS browser.
+ */
+public class MinimalWordCount {
+
+ public static void main(String[] args) {
+ // Create a DataflowPipelineOptions object. This object lets us set various execution
+ // options for our pipeline, such as the associated Cloud Platform project and the location
+ // in Google Cloud Storage to stage files.
+ DataflowPipelineOptions options = PipelineOptionsFactory.create()
+ .as(DataflowPipelineOptions.class);
+ options.setRunner(BlockingDataflowPipelineRunner.class);
+ // CHANGE 1/3: Your project ID is required in order to run your pipeline on the Google Cloud.
+ options.setProject("SET_YOUR_PROJECT_ID_HERE");
+ // CHANGE 2/3: Your Google Cloud Storage path is required for staging local files.
+ options.setStagingLocation("gs://SET_YOUR_BUCKET_NAME_HERE/AND_STAGING_DIRECTORY");
+
+ // Create the Pipeline object with the options we defined above.
+ Pipeline p = Pipeline.create(options);
+
+ // Apply the pipeline's transforms.
+
+ // Concept #1: Apply a root transform to the pipeline; in this case, TextIO.Read to read a set
+ // of input text files. TextIO.Read returns a PCollection where each element is one line from
+ // the input text (a set of Shakespeare's texts).
+ p.apply(TextIO.Read.from("gs://dataflow-samples/shakespeare/*"))
+ // Concept #2: Apply a ParDo transform to our PCollection of text lines. This ParDo invokes a
+ // DoFn (defined in-line) on each element that tokenizes the text line into individual words.
+ // The ParDo returns a PCollection<String>, where each element is an individual word in
+ // Shakespeare's collected texts.
+ .apply(ParDo.named("ExtractWords").of(new DoFn<String, String>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ for (String word : c.element().split("[^a-zA-Z']+")) { // split on runs of non-letter, non-apostrophe characters
+ if (!word.isEmpty()) { // split() can yield an empty leading token
+ c.output(word);
+ }
+ }
+ }
+ }))
+ // Concept #3: Apply the Count transform to our PCollection of individual words. The Count
+ // transform returns a new PCollection of key/value pairs, where each key represents a unique
+ // word in the text. The associated value is the occurrence count for that word.
+ .apply(Count.<String>perElement())
+ // Apply a MapElements transform that formats our PCollection of word counts into a printable
+ // string, suitable for writing to an output file.
+ .apply("FormatResults", MapElements.via(new SimpleFunction<KV<String, Long>, String>() {
+ @Override
+ public String apply(KV<String, Long> input) {
+ return input.getKey() + ": " + input.getValue();
+ }
+ }))
+ // Concept #4: Apply a write transform, TextIO.Write, at the end of the pipeline.
+ // TextIO.Write writes the contents of a PCollection (in this case, our PCollection of
+ // formatted strings) to a series of text files in Google Cloud Storage.
+ // CHANGE 3/3: A Google Cloud Storage path is required for writing the output results.
+ .apply(TextIO.Write.to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX"));
+
+ // Run the pipeline.
+ p.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/WindowedWordCount.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/WindowedWordCount.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/WindowedWordCount.java
new file mode 100644
index 0000000..2adac55
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/WindowedWordCount.java
@@ -0,0 +1,269 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples;
+
+import com.google.api.services.bigquery.model.TableFieldSchema;
+import com.google.api.services.bigquery.model.TableReference;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.TableSchema;
+import com.google.cloud.dataflow.examples.common.DataflowExampleOptions;
+import com.google.cloud.dataflow.examples.common.DataflowExampleUtils;
+import com.google.cloud.dataflow.examples.common.ExampleBigQueryTableOptions;
+import com.google.cloud.dataflow.examples.common.ExamplePubsubTopicOptions;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.PipelineResult;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO;
+import com.google.cloud.dataflow.sdk.io.PubsubIO;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
+import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+
+/**
+ * An example that counts words in text, and can run over either unbounded or bounded input
+ * collections.
+ *
+ * <p>This class, {@link WindowedWordCount}, is the last in a series of four successively more
+ * detailed 'word count' examples. First take a look at {@link MinimalWordCount},
+ * {@link WordCount}, and {@link DebuggingWordCount}.
+ *
+ * <p>Basic concepts, also in the MinimalWordCount, WordCount, and DebuggingWordCount examples:
+ * Reading text files; counting a PCollection; writing to GCS; executing a Pipeline both locally
+ * and using the Dataflow service; defining DoFns; creating a custom aggregator;
+ * user-defined PTransforms; defining PipelineOptions.
+ *
+ * <p>New Concepts:
+ * <pre>
+ * 1. Unbounded and bounded pipeline input modes
+ * 2. Adding timestamps to data
+ * 3. PubSub topics as sources
+ * 4. Windowing
+ * 5. Re-using PTransforms over windowed PCollections
+ * 6. Writing to BigQuery
+ * </pre>
+ *
+ * <p>To execute this pipeline locally, specify general pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * }
+ * </pre>
+ *
+ * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=BlockingDataflowPipelineRunner
+ * }
+ * </pre>
+ *
+ * <p>Optionally specify the input file path via:
+ * {@code --inputFile=gs://INPUT_PATH},
+ * which defaults to {@code gs://dataflow-samples/shakespeare/kinglear.txt}.
+ *
+ * <p>Specify an output BigQuery dataset and optionally, a table for the output. If you don't
+ * specify the table, one will be created for you using the job name. If you don't specify the
+ * dataset, a dataset called {@code dataflow-examples} must already exist in your project.
+ * {@code --bigQueryDataset=YOUR-DATASET --bigQueryTable=YOUR-NEW-TABLE-NAME}.
+ *
+ * <p>Decide whether you want your pipeline to run with 'bounded' (such as files in GCS) or
+ * 'unbounded' input (such as a PubSub topic). To run with unbounded input, set
+ * {@code --unbounded=true}. Then, optionally specify the Google Cloud PubSub topic to read from
+ * via {@code --pubsubTopic=projects/PROJECT_ID/topics/YOUR_TOPIC_NAME}. If the topic does not
+ * exist, the pipeline will create one for you. It will delete this topic when it terminates.
+ * The pipeline will automatically launch an auxiliary batch pipeline to populate the given PubSub
+ * topic with the contents of the {@code --inputFile}, in order to make the example easy to run.
+ * If you want to use an independently-populated PubSub topic, indicate this by setting
+ * {@code --inputFile=""}. In that case, the auxiliary pipeline will not be started.
+ *
+ * <p>By default, the pipeline will do fixed windowing, on 1-minute windows. You can
+ * change this interval by setting the {@code --windowSize} parameter, e.g. {@code --windowSize=10}
+ * for 10-minute windows.
+ */
+public class WindowedWordCount {
+ private static final Logger LOG = LoggerFactory.getLogger(WindowedWordCount.class);
+ static final int WINDOW_SIZE = 1; // Default window duration in minutes
+
+ /**
+ * Concept #2: A DoFn that sets the data element timestamp. This is a silly method, just for
+ * this example, for the bounded data case.
+ *
+ * <p>Imagine that many ghosts of Shakespeare are all typing madly at the same time to recreate
+ * his masterworks. Each line of the corpus will get a random associated timestamp somewhere in a
+ * 2-hour period.
+ */
+ static class AddTimestampFn extends DoFn<String, String> {
+ private static final long RAND_RANGE = 7200000; // 2 hours in ms
+
+ @Override
+ public void processElement(ProcessContext c) {
+ // Generate a timestamp that falls somewhere in the past two hours.
+ long randomTimestamp = System.currentTimeMillis()
+ - (int) (Math.random() * RAND_RANGE); // offset in [0, RAND_RANGE) ms, which fits in an int
+ /**
+ * Concept #2: Set the data element with that timestamp.
+ */
+ c.outputWithTimestamp(c.element(), new Instant(randomTimestamp));
+ }
+ }
+
+ /** A DoFn that converts a Word and Count into a BigQuery table row. */
+ static class FormatAsTableRowFn extends DoFn<KV<String, Long>, TableRow> {
+ @Override
+ public void processElement(ProcessContext c) {
+ TableRow row = new TableRow()
+ .set("word", c.element().getKey())
+ .set("count", c.element().getValue())
+ // include a field for the window timestamp (read from this element's timestamp)
+ .set("window_timestamp", c.timestamp().toString());
+ c.output(row);
+ }
+ }
+
+ /**
+ * Helper method that defines the BigQuery output schema: word, count, window_timestamp.
+ */
+ private static TableSchema getSchema() {
+ List<TableFieldSchema> fields = new ArrayList<>();
+ fields.add(new TableFieldSchema().setName("word").setType("STRING"));
+ fields.add(new TableFieldSchema().setName("count").setType("INTEGER"));
+ fields.add(new TableFieldSchema().setName("window_timestamp").setType("TIMESTAMP"));
+ TableSchema schema = new TableSchema().setFields(fields);
+ return schema;
+ }
+
+ /**
+ * Concept #6: We'll stream the results to a BigQuery table. The BigQuery sink supports both
+ * bounded and unbounded data. This helper builds a TableReference (project, dataset, table)
+ * from the input options, to tell the pipeline where to write its BigQuery results.
+ */
+ private static TableReference getTableReference(Options options) {
+ TableReference tableRef = new TableReference();
+ tableRef.setProjectId(options.getProject());
+ tableRef.setDatasetId(options.getBigQueryDataset());
+ tableRef.setTableId(options.getBigQueryTable());
+ return tableRef;
+ }
+
+ /**
+ * Options supported by {@link WindowedWordCount}.
+ *
+ * <p>Inherits standard example configuration options, which allow specification of the BigQuery
+ * table and the PubSub topic, as well as the {@link WordCount.WordCountOptions} support for
+ * specification of the input file.
+ */
+ public static interface Options extends WordCount.WordCountOptions,
+ DataflowExampleOptions, ExamplePubsubTopicOptions, ExampleBigQueryTableOptions {
+ @Description("Fixed window duration, in minutes")
+ @Default.Integer(WINDOW_SIZE)
+ Integer getWindowSize();
+ void setWindowSize(Integer value);
+
+ @Description("Whether to run the pipeline with unbounded input")
+ boolean isUnbounded();
+ void setUnbounded(boolean value);
+ }
+
+ public static void main(String[] args) throws IOException {
+ Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
+ options.setBigQuerySchema(getSchema());
+ // DataflowExampleUtils creates the necessary input sources to simplify execution of this
+ // Pipeline.
+ DataflowExampleUtils exampleDataflowUtils = new DataflowExampleUtils(options,
+ options.isUnbounded());
+
+ Pipeline pipeline = Pipeline.create(options);
+
+ /**
+ * Concept #1: the Dataflow SDK lets us run the same pipeline with either a bounded or
+ * unbounded input source.
+ */
+ PCollection<String> input;
+ if (options.isUnbounded()) {
+ LOG.info("Reading from PubSub.");
+ /**
+ * Concept #3: Read from the PubSub topic. A topic will be created if it wasn't
+ * specified as an argument. The data elements' timestamps will come from the pubsub
+ * injection.
+ */
+ input = pipeline
+ .apply(PubsubIO.Read.topic(options.getPubsubTopic()));
+ } else {
+ /** Else, this is a bounded pipeline. Read from the GCS file. */
+ input = pipeline
+ .apply(TextIO.Read.from(options.getInputFile()))
+ // Concept #2: Add an element timestamp, using an artificial time just to show windowing.
+ // See AddTimestampFn for more detail on this.
+ .apply(ParDo.of(new AddTimestampFn()));
+ }
+
+ /**
+ * Concept #4: Window into fixed windows. The fixed window size for this example defaults to 1
+ * minute (you can change this with a command-line option). See the documentation for more
+ * information on how fixed windows work, and for information on the other types of windowing
+ * available (e.g., sliding windows).
+ */
+ PCollection<String> windowedWords = input
+ .apply(Window.<String>into(
+ FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))));
+
+ /**
+ * Concept #5: Re-use our existing CountWords transform that does not have knowledge of
+ * windows over a PCollection containing windowed values.
+ */
+ PCollection<KV<String, Long>> wordCounts = windowedWords.apply(new WordCount.CountWords());
+
+ /**
+ * Concept #6: Format the results for a BigQuery table, then write to BigQuery.
+ * The BigQuery sink supports both bounded and unbounded data.
+ */
+ wordCounts.apply(ParDo.of(new FormatAsTableRowFn()))
+ .apply(BigQueryIO.Write
+ .to(getTableReference(options))
+ .withSchema(getSchema()) // same schema that was set on the options above
+ .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
+ .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND));
+
+ PipelineResult result = pipeline.run();
+
+ /**
+ * To mock unbounded input from PubSub, we'll now start an auxiliary 'injector' pipeline that
+ * runs for a limited time, and publishes to the input PubSub topic.
+ *
+ * With an unbounded input source, you will need to explicitly shut down this pipeline when you
+ * are done with it, so that you do not continue to be charged for the instances. You can do
+ * this via a ctrl-C from the command line, or from the developer's console UI for Dataflow
+ * pipelines. The PubSub topic will also be deleted at this time.
+ */
+ exampleDataflowUtils.mockUnboundedSource(options.getInputFile(), result);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/WordCount.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/WordCount.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/WordCount.java
new file mode 100644
index 0000000..1086106
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/WordCount.java
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples;
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.transforms.Aggregator;
+import com.google.cloud.dataflow.sdk.transforms.Count;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.MapElements;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.SimpleFunction;
+import com.google.cloud.dataflow.sdk.transforms.Sum;
+import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+
+/**
+ * An example that counts words in Shakespeare and includes Dataflow best practices.
+ *
+ * <p>This class, {@link WordCount}, is the second in a series of four successively more detailed
+ * 'word count' examples. You may first want to take a look at {@link MinimalWordCount}.
+ * After you've looked at this example, then see the {@link DebuggingWordCount}
+ * pipeline, for introduction of additional concepts.
+ *
+ * <p>For a detailed walkthrough of this example, see
+ * <a href="https://cloud.google.com/dataflow/java-sdk/wordcount-example">
+ * https://cloud.google.com/dataflow/java-sdk/wordcount-example
+ * </a>
+ *
+ * <p>Basic concepts, also in the MinimalWordCount example:
+ * Reading text files; counting a PCollection; writing to GCS.
+ *
+ * <p>New Concepts:
+ * <pre>
+ * 1. Executing a Pipeline both locally and using the Dataflow service
+ * 2. Using ParDo with static DoFns defined out-of-line
+ * 3. Building a composite transform
+ * 4. Defining your own pipeline options
+ * </pre>
+ *
+ * <p>Concept #1: you can execute this pipeline either locally or using the Dataflow service.
+ * These are now command-line options and not hard-coded as they were in the MinimalWordCount
+ * example.
+ * To execute this pipeline locally, specify general pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * }
+ * </pre>
+ * and a local output file or output prefix on GCS:
+ * <pre>{@code
+ * --output=[YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PREFIX]
+ * }</pre>
+ *
+ * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=BlockingDataflowPipelineRunner
+ * }
+ * </pre>
+ * and an output prefix on GCS:
+ * <pre>{@code
+ * --output=gs://YOUR_OUTPUT_PREFIX
+ * }</pre>
+ *
+ * <p>The input file defaults to {@code gs://dataflow-samples/shakespeare/kinglear.txt} and can be
+ * overridden with {@code --inputFile}.
+ */
+public class WordCount {
+
+ /**
+ * Concept #2: You can make your pipeline code less verbose by defining your DoFns statically out-
+ * of-line. This DoFn tokenizes lines of text into individual words; we pass it to a ParDo in the
+ * pipeline.
+ */
+ static class ExtractWordsFn extends DoFn<String, String> {
+ private final Aggregator<Long, Long> emptyLines =
+ createAggregator("emptyLines", new Sum.SumLongFn());
+
+ @Override
+ public void processElement(ProcessContext c) {
+ if (c.element().trim().isEmpty()) {
+ emptyLines.addValue(1L); // counted only; the split below then emits no words for this line
+ }
+
+ // Split the line into words.
+ String[] words = c.element().split("[^a-zA-Z']+");
+
+ // Output each word encountered into the output PCollection.
+ for (String word : words) {
+ if (!word.isEmpty()) { // split() can yield an empty leading token
+ c.output(word);
+ }
+ }
+ }
+ }
+
+ /** A SimpleFunction that converts a Word and Count into a printable string. */
+ public static class FormatAsTextFn extends SimpleFunction<KV<String, Long>, String> {
+ @Override
+ public String apply(KV<String, Long> input) {
+ return input.getKey() + ": " + input.getValue();
+ }
+ }
+
+ /**
+ * A PTransform that converts a PCollection containing lines of text into a PCollection of
+ * word/count pairs (formatting is left to {@link FormatAsTextFn}).
+ *
+ * <p>Concept #3: This is a custom composite transform that bundles two transforms (ParDo and
+ * Count) as a reusable PTransform subclass. Using composite transforms allows for easy reuse,
+ * modular testing, and an improved monitoring experience.
+ */
+ public static class CountWords extends PTransform<PCollection<String>,
+ PCollection<KV<String, Long>>> {
+ @Override
+ public PCollection<KV<String, Long>> apply(PCollection<String> lines) {
+
+ // Convert lines of text into individual words.
+ PCollection<String> words = lines.apply(
+ ParDo.of(new ExtractWordsFn()));
+
+ // Count the number of times each word occurs.
+ PCollection<KV<String, Long>> wordCounts =
+ words.apply(Count.<String>perElement());
+
+ return wordCounts;
+ }
+ }
+
+ /**
+ * Options supported by {@link WordCount}.
+ *
+ * <p>Concept #4: Defining your own configuration options. Here, you can add your own arguments
+ * to be processed by the command-line parser, and specify default values for them. You can then
+ * access the options values in your pipeline code.
+ *
+ * <p>Inherits standard configuration options.
+ */
+ public static interface WordCountOptions extends PipelineOptions {
+ @Description("Path of the file to read from")
+ @Default.String("gs://dataflow-samples/shakespeare/kinglear.txt")
+ String getInputFile();
+ void setInputFile(String value);
+
+ @Description("Path of the file to write to")
+ @Default.InstanceFactory(OutputFactory.class)
+ String getOutput();
+ void setOutput(String value);
+
+ /**
+ * Defaults to "gs://${YOUR_STAGING_DIRECTORY}/counts.txt"; throws if no staging location is set.
+ */
+ public static class OutputFactory implements DefaultValueFactory<String> {
+ @Override
+ public String create(PipelineOptions options) {
+ DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
+ if (dataflowOptions.getStagingLocation() != null) {
+ return GcsPath.fromUri(dataflowOptions.getStagingLocation())
+ .resolve("counts.txt").toString();
+ } else {
+ throw new IllegalArgumentException("Must specify --output or --stagingLocation");
+ }
+ }
+ }
+
+ }
+
+ public static void main(String[] args) {
+ WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
+ .as(WordCountOptions.class);
+ Pipeline p = Pipeline.create(options);
+
+ // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
+ // static FormatAsTextFn() to the MapElements transform.
+ p.apply(TextIO.Read.named("ReadLines").from(options.getInputFile()))
+ .apply(new CountWords())
+ .apply(MapElements.via(new FormatAsTextFn()))
+ .apply(TextIO.Write.named("WriteCounts").to(options.getOutput()));
+
+ p.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/DataflowExampleOptions.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/DataflowExampleOptions.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/DataflowExampleOptions.java
new file mode 100644
index 0000000..606bfb4
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/common/DataflowExampleOptions.java
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.common;
+
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+
+/**
+ * Options shared by the Dataflow examples: job keep-alive after exit and injector worker count.
+ */
+public interface DataflowExampleOptions extends DataflowPipelineOptions {
+ @Description("Whether to keep jobs running on the Dataflow service after local process exit")
+ @Default.Boolean(false)
+ boolean getKeepJobsRunning();
+ void setKeepJobsRunning(boolean keepJobsRunning);
+
+ @Description("Number of workers to use when executing the injector pipeline")
+ @Default.Integer(1)
+ int getInjectorNumWorkers();
+ void setInjectorNumWorkers(int numWorkers);
+}
[10/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PackageUtil.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PackageUtil.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PackageUtil.java
deleted file mode 100644
index 8b2d56f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PackageUtil.java
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.client.util.BackOffUtils;
-import com.google.api.client.util.Sleeper;
-import com.google.api.services.dataflow.model.DataflowPackage;
-import com.google.cloud.hadoop.util.ApiErrorExtractor;
-import com.google.common.hash.Funnels;
-import com.google.common.hash.Hasher;
-import com.google.common.hash.Hashing;
-import com.google.common.io.CountingOutputStream;
-import com.google.common.io.Files;
-
-import com.fasterxml.jackson.core.Base64Variants;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.nio.channels.Channels;
-import java.nio.channels.WritableByteChannel;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.Objects;
-
-/** Helper routines for packages. */
-public class PackageUtil {
- private static final Logger LOG = LoggerFactory.getLogger(PackageUtil.class);
- /**
- * A reasonable upper bound on the number of jars required to launch a Dataflow job.
- */
- public static final int SANE_CLASSPATH_SIZE = 1000;
- /**
- * The initial interval to use between package staging attempts.
- */
- private static final long INITIAL_BACKOFF_INTERVAL_MS = 5000L;
- /**
- * The maximum number of attempts when staging a file.
- */
- private static final int MAX_ATTEMPTS = 5;
-
- /**
- * Translates exceptions from API calls.
- */
- private static final ApiErrorExtractor ERROR_EXTRACTOR = new ApiErrorExtractor();
-
- /**
- * Creates a DataflowPackage containing information about how a classpath element should be
- * staged, including the staging destination as well as its size and hash.
- *
- * @param classpathElement The local path for the classpath element.
- * @param stagingPath The base location for staged classpath elements.
- * @param overridePackageName If non-null, use the given value as the package name
- * instead of generating one automatically.
- * @return The package.
- */
- @Deprecated
- public static DataflowPackage createPackage(File classpathElement,
- String stagingPath, String overridePackageName) {
- return createPackageAttributes(classpathElement, stagingPath, overridePackageName)
- .getDataflowPackage();
- }
-
- /**
- * Compute and cache the attributes of a classpath element that we will need to stage it.
- *
- * @param classpathElement the file or directory to be staged.
- * @param stagingPath The base location for staged classpath elements.
- * @param overridePackageName If non-null, use the given value as the package name
- * instead of generating one automatically.
- * @return a {@link PackageAttributes} containing metadata about the object to be staged.
- */
- static PackageAttributes createPackageAttributes(File classpathElement,
- String stagingPath, String overridePackageName) {
- try {
- boolean directory = classpathElement.isDirectory();
-
- // Compute size and hash in one pass over file or directory.
- Hasher hasher = Hashing.md5().newHasher();
- OutputStream hashStream = Funnels.asOutputStream(hasher);
- CountingOutputStream countingOutputStream = new CountingOutputStream(hashStream);
-
- if (!directory) {
- // Files are staged as-is.
- Files.asByteSource(classpathElement).copyTo(countingOutputStream);
- } else {
- // Directories are recursively zipped.
- ZipFiles.zipDirectory(classpathElement, countingOutputStream);
- }
-
- long size = countingOutputStream.getCount();
- String hash = Base64Variants.MODIFIED_FOR_URL.encode(hasher.hash().asBytes());
-
- // Create the DataflowPackage with staging name and location.
- String uniqueName = getUniqueContentName(classpathElement, hash);
- String resourcePath = IOChannelUtils.resolve(stagingPath, uniqueName);
- DataflowPackage target = new DataflowPackage();
- target.setName(overridePackageName != null ? overridePackageName : uniqueName);
- target.setLocation(resourcePath);
-
- return new PackageAttributes(size, hash, directory, target);
- } catch (IOException e) {
- throw new RuntimeException("Package setup failure for " + classpathElement, e);
- }
- }
-
- /**
- * Transfers the classpath elements to the staging location.
- *
- * @param classpathElements The elements to stage.
- * @param stagingPath The base location to stage the elements to.
- * @return A list of cloud workflow packages, each representing a classpath element.
- */
- public static List<DataflowPackage> stageClasspathElements(
- Collection<String> classpathElements, String stagingPath) {
- return stageClasspathElements(classpathElements, stagingPath, Sleeper.DEFAULT);
- }
-
- // Visible for testing.
- static List<DataflowPackage> stageClasspathElements(
- Collection<String> classpathElements, String stagingPath,
- Sleeper retrySleeper) {
- LOG.info("Uploading {} files from PipelineOptions.filesToStage to staging location to "
- + "prepare for execution.", classpathElements.size());
-
- if (classpathElements.size() > SANE_CLASSPATH_SIZE) {
- LOG.warn("Your classpath contains {} elements, which Google Cloud Dataflow automatically "
- + "copies to all workers. Having this many entries on your classpath may be indicative "
- + "of an issue in your pipeline. You may want to consider trimming the classpath to "
- + "necessary dependencies only, using --filesToStage pipeline option to override "
- + "what files are being staged, or bundling several dependencies into one.",
- classpathElements.size());
- }
-
- ArrayList<DataflowPackage> packages = new ArrayList<>();
-
- if (stagingPath == null) {
- throw new IllegalArgumentException(
- "Can't stage classpath elements on because no staging location has been provided");
- }
-
- int numUploaded = 0;
- int numCached = 0;
- for (String classpathElement : classpathElements) {
- String packageName = null;
- if (classpathElement.contains("=")) {
- String[] components = classpathElement.split("=", 2);
- packageName = components[0];
- classpathElement = components[1];
- }
-
- File file = new File(classpathElement);
- if (!file.exists()) {
- LOG.warn("Skipping non-existent classpath element {} that was specified.",
- classpathElement);
- continue;
- }
-
- PackageAttributes attributes = createPackageAttributes(file, stagingPath, packageName);
-
- DataflowPackage workflowPackage = attributes.getDataflowPackage();
- packages.add(workflowPackage);
- String target = workflowPackage.getLocation();
-
- // TODO: Should we attempt to detect the Mime type rather than
- // always using MimeTypes.BINARY?
- try {
- try {
- long remoteLength = IOChannelUtils.getSizeBytes(target);
- if (remoteLength == attributes.getSize()) {
- LOG.debug("Skipping classpath element already staged: {} at {}",
- classpathElement, target);
- numCached++;
- continue;
- }
- } catch (FileNotFoundException expected) {
- // If the file doesn't exist, it means we need to upload it.
- }
-
- // Upload file, retrying on failure.
- AttemptBoundedExponentialBackOff backoff = new AttemptBoundedExponentialBackOff(
- MAX_ATTEMPTS,
- INITIAL_BACKOFF_INTERVAL_MS);
- while (true) {
- try {
- LOG.debug("Uploading classpath element {} to {}", classpathElement, target);
- try (WritableByteChannel writer = IOChannelUtils.create(target, MimeTypes.BINARY)) {
- copyContent(classpathElement, writer);
- }
- numUploaded++;
- break;
- } catch (IOException e) {
- if (ERROR_EXTRACTOR.accessDenied(e)) {
- String errorMessage = String.format(
- "Uploaded failed due to permissions error, will NOT retry staging "
- + "of classpath %s. Please verify credentials are valid and that you have "
- + "write access to %s. Stale credentials can be resolved by executing "
- + "'gcloud auth login'.", classpathElement, target);
- LOG.error(errorMessage);
- throw new IOException(errorMessage, e);
- } else if (!backoff.atMaxAttempts()) {
- LOG.warn("Upload attempt failed, sleeping before retrying staging of classpath: {}",
- classpathElement, e);
- BackOffUtils.next(retrySleeper, backoff);
- } else {
- // Rethrow last error, to be included as a cause in the catch below.
- LOG.error("Upload failed, will NOT retry staging of classpath: {}",
- classpathElement, e);
- throw e;
- }
- }
- }
- } catch (Exception e) {
- throw new RuntimeException("Could not stage classpath element: " + classpathElement, e);
- }
- }
-
- LOG.info("Uploading PipelineOptions.filesToStage complete: {} files newly uploaded, "
- + "{} files cached",
- numUploaded, numCached);
-
- return packages;
- }
-
- /**
- * Returns a unique name for a file with a given content hash.
- *
- * <p>Directory paths are removed. Example:
- * <pre>
- * dir="a/b/c/d", contentHash="f000" => d-f000.jar
- * file="a/b/c/d.txt", contentHash="f000" => d-f000.txt
- * file="a/b/c/d", contentHash="f000" => d-f000
- * </pre>
- */
- static String getUniqueContentName(File classpathElement, String contentHash) {
- String fileName = Files.getNameWithoutExtension(classpathElement.getAbsolutePath());
- String fileExtension = Files.getFileExtension(classpathElement.getAbsolutePath());
- if (classpathElement.isDirectory()) {
- return fileName + "-" + contentHash + ".jar";
- } else if (fileExtension.isEmpty()) {
- return fileName + "-" + contentHash;
- }
- return fileName + "-" + contentHash + "." + fileExtension;
- }
-
- /**
- * Copies the contents of the classpathElement to the output channel.
- *
- * <p>If the classpathElement is a directory, a Zip stream is constructed on the fly,
- * otherwise the file contents are copied as-is.
- *
- * <p>The output channel is not closed.
- */
- private static void copyContent(String classpathElement, WritableByteChannel outputChannel)
- throws IOException {
- final File classpathElementFile = new File(classpathElement);
- if (classpathElementFile.isDirectory()) {
- ZipFiles.zipDirectory(classpathElementFile, Channels.newOutputStream(outputChannel));
- } else {
- Files.asByteSource(classpathElementFile).copyTo(Channels.newOutputStream(outputChannel));
- }
- }
- /**
- * Holds the metadata necessary to stage a file or confirm that a staged file has not changed.
- */
- static class PackageAttributes {
- private final boolean directory;
- private final long size;
- private final String hash;
- private DataflowPackage dataflowPackage;
-
- public PackageAttributes(long size, String hash, boolean directory,
- DataflowPackage dataflowPackage) {
- this.size = size;
- this.hash = Objects.requireNonNull(hash, "hash");
- this.directory = directory;
- this.dataflowPackage = Objects.requireNonNull(dataflowPackage, "dataflowPackage");
- }
-
- /**
- * @return the dataflowPackage
- */
- public DataflowPackage getDataflowPackage() {
- return dataflowPackage;
- }
-
- /**
- * @return the directory
- */
- public boolean isDirectory() {
- return directory;
- }
-
- /**
- * @return the size
- */
- public long getSize() {
- return size;
- }
-
- /**
- * @return the hash
- */
- public String getHash() {
- return hash;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PaneInfoTracker.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PaneInfoTracker.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PaneInfoTracker.java
deleted file mode 100644
index a7818a3..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PaneInfoTracker.java
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.windowing.AfterWatermark;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo.PaneInfoCoder;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo.Timing;
-import com.google.cloud.dataflow.sdk.util.state.ReadableState;
-import com.google.cloud.dataflow.sdk.util.state.StateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.StateTag;
-import com.google.cloud.dataflow.sdk.util.state.StateTags;
-import com.google.cloud.dataflow.sdk.util.state.ValueState;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Preconditions;
-
-import org.joda.time.Instant;
-
-/**
- * Determine the timing and other properties of a new pane for a given computation, key and window.
- * Incorporates any previous pane, whether the pane has been produced because of an
- * on-time {@link AfterWatermark} trigger firing, and the relation between the element's timestamp
- * and the current output watermark.
- */
-public class PaneInfoTracker {
- private TimerInternals timerInternals;
-
- public PaneInfoTracker(TimerInternals timerInternals) {
- this.timerInternals = timerInternals;
- }
-
- @VisibleForTesting
- static final StateTag<Object, ValueState<PaneInfo>> PANE_INFO_TAG =
- StateTags.makeSystemTagInternal(StateTags.value("pane", PaneInfoCoder.INSTANCE));
-
- public void clear(StateAccessor<?> state) {
- state.access(PANE_INFO_TAG).clear();
- }
-
- /**
- * Return a ({@link ReadableState} for) the pane info appropriate for {@code context}. The pane
- * info includes the timing for the pane, whose calculation is quite subtle.
- *
- * @param isFinal should be {@code true} only if the triggering machinery can guarantee
- * no further firings for the window.
- */
- public ReadableState<PaneInfo> getNextPaneInfo(
- ReduceFn<?, ?, ?, ?>.Context context, final boolean isFinal) {
- final Object key = context.key();
- final ReadableState<PaneInfo> previousPaneFuture =
- context.state().access(PaneInfoTracker.PANE_INFO_TAG);
- final Instant windowMaxTimestamp = context.window().maxTimestamp();
-
- return new ReadableState<PaneInfo>() {
- @Override
- public ReadableState<PaneInfo> readLater() {
- previousPaneFuture.readLater();
- return this;
- }
-
- @Override
- public PaneInfo read() {
- PaneInfo previousPane = previousPaneFuture.read();
- return describePane(key, windowMaxTimestamp, previousPane, isFinal);
- }
- };
- }
-
- public void storeCurrentPaneInfo(ReduceFn<?, ?, ?, ?>.Context context, PaneInfo currentPane) {
- context.state().access(PANE_INFO_TAG).write(currentPane);
- }
-
- private <W> PaneInfo describePane(
- Object key, Instant windowMaxTimestamp, PaneInfo previousPane, boolean isFinal) {
- boolean isFirst = previousPane == null;
- Timing previousTiming = isFirst ? null : previousPane.getTiming();
- long index = isFirst ? 0 : previousPane.getIndex() + 1;
- long nonSpeculativeIndex = isFirst ? 0 : previousPane.getNonSpeculativeIndex() + 1;
- Instant outputWM = timerInternals.currentOutputWatermarkTime();
- Instant inputWM = timerInternals.currentInputWatermarkTime();
-
- // True if it is not possible to assign the element representing this pane a timestamp
- // which will make an ON_TIME pane for any following computation.
- // I.e., true if the element's latest possible timestamp is before the current output watermark.
- boolean isLateForOutput = outputWM != null && windowMaxTimestamp.isBefore(outputWM);
-
- // True if all emitted panes (if any) were EARLY panes.
- // Once the ON_TIME pane has fired, all following panes must be considered LATE even
- // if the output watermark is behind the end of the window.
- boolean onlyEarlyPanesSoFar = previousTiming == null || previousTiming == Timing.EARLY;
-
- // True if the input watermark hasn't passed the window's max timestamp.
- boolean isEarlyForInput = inputWM == null || !inputWM.isAfter(windowMaxTimestamp);
-
- Timing timing;
- if (isLateForOutput || !onlyEarlyPanesSoFar) {
- // The output watermark has already passed the end of this window, or we have already
- // emitted a non-EARLY pane. Irrespective of how this pane was triggered we must
- // consider this pane LATE.
- timing = Timing.LATE;
- } else if (isEarlyForInput) {
- // This is an EARLY firing.
- timing = Timing.EARLY;
- nonSpeculativeIndex = -1;
- } else {
- // This is the unique ON_TIME firing for the window.
- timing = Timing.ON_TIME;
- }
-
- WindowTracing.debug(
- "describePane: {} pane (prev was {}) for key:{}; windowMaxTimestamp:{}; "
- + "inputWatermark:{}; outputWatermark:{}; isLateForOutput:{}",
- timing, previousTiming, key, windowMaxTimestamp, inputWM, outputWM, isLateForOutput);
-
- if (previousPane != null) {
- // Timing transitions should follow EARLY* ON_TIME? LATE*
- switch (previousTiming) {
- case EARLY:
- Preconditions.checkState(
- timing == Timing.EARLY || timing == Timing.ON_TIME || timing == Timing.LATE,
- "EARLY cannot transition to %s", timing);
- break;
- case ON_TIME:
- Preconditions.checkState(
- timing == Timing.LATE, "ON_TIME cannot transition to %s", timing);
- break;
- case LATE:
- Preconditions.checkState(timing == Timing.LATE, "LATE cannot transtion to %s", timing);
- break;
- case UNKNOWN:
- break;
- }
- Preconditions.checkState(!previousPane.isLast(), "Last pane was not last after all.");
- }
-
- return PaneInfo.createPane(isFirst, isFinal, timing, index, nonSpeculativeIndex);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PathValidator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PathValidator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PathValidator.java
deleted file mode 100644
index 658de2a..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PathValidator.java
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-/**
- * Interface for controlling validation of paths.
- */
-public interface PathValidator {
- /**
- * Validate that a file pattern is conforming.
- *
- * @param filepattern The file pattern to verify.
- * @return The post-validation filepattern.
- */
- public String validateInputFilePatternSupported(String filepattern);
-
- /**
- * Validate that an output file prefix is conforming.
- *
- * @param filePrefix the file prefix to verify.
- * @return The post-validation filePrefix.
- */
- public String validateOutputFilePrefixSupported(String filePrefix);
-
- /**
- * Validate that a path is a valid path and that the path
- * is accessible.
- *
- * @param path The path to verify.
- * @return The post-validation path.
- */
- public String verifyPath(String path);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PerKeyCombineFnRunner.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PerKeyCombineFnRunner.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PerKeyCombineFnRunner.java
deleted file mode 100644
index b5f328f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PerKeyCombineFnRunner.java
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.CombineFnBase.PerKeyCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-
-import java.io.Serializable;
-import java.util.Collection;
-
-/**
- * An interface that runs a {@link PerKeyCombineFn} with unified APIs.
- *
- * <p>Different keyed combine functions have their own implementations.
- * For example, the implementation can skip allocating {@code Combine.Context},
- * if the keyed combine function doesn't use it.
- */
-public interface PerKeyCombineFnRunner<K, InputT, AccumT, OutputT> extends Serializable {
- /**
- * Returns the {@link PerKeyCombineFn} it holds.
- *
- * <p>It can be a {@code KeyedCombineFn} or a {@code KeyedCombineFnWithContext}.
- */
- public PerKeyCombineFn<K, InputT, AccumT, OutputT> fn();
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * Forwards the call to a {@link PerKeyCombineFn} to create the accumulator in a {@link DoFn}.
- *
- * <p>It constructs a {@code CombineWithContext.Context} from {@code DoFn.ProcessContext}
- * if it is required.
- */
- public AccumT createAccumulator(K key, DoFn<?, ?>.ProcessContext c);
-
- /**
- * Forwards the call to a {@link PerKeyCombineFn} to add the input in a {@link DoFn}.
- *
- * <p>It constructs a {@code CombineWithContext.Context} from {@code DoFn.ProcessContext}
- * if it is required.
- */
- public AccumT addInput(K key, AccumT accumulator, InputT input, DoFn<?, ?>.ProcessContext c);
-
- /**
- * Forwards the call to a {@link PerKeyCombineFn} to merge accumulators in a {@link DoFn}.
- *
- * <p>It constructs a {@code CombineWithContext.Context} from {@code DoFn.ProcessContext}
- * if it is required.
- */
- public AccumT mergeAccumulators(
- K key, Iterable<AccumT> accumulators, DoFn<?, ?>.ProcessContext c);
-
- /**
- * Forwards the call to a {@link PerKeyCombineFn} to extract the output in a {@link DoFn}.
- *
- * <p>It constructs a {@code CombineWithContext.Context} from {@code DoFn.ProcessContext}
- * if it is required.
- */
- public OutputT extractOutput(K key, AccumT accumulator, DoFn<?, ?>.ProcessContext c);
-
- /**
- * Forwards the call to a {@link PerKeyCombineFn} to compact the accumulator in a {@link DoFn}.
- *
- * <p>It constructs a {@code CombineWithContext.Context} from {@code DoFn.ProcessContext}
- * if it is required.
- */
- public AccumT compact(K key, AccumT accumulator, DoFn<?, ?>.ProcessContext c);
-
- /**
- * Forwards the call to a {@link PerKeyCombineFn} to combine the inputs and extract output
- * in a {@link DoFn}.
- *
- * <p>It constructs a {@code CombineWithContext.Context} from {@code DoFn.ProcessContext}
- * if it is required.
- */
- public OutputT apply(K key, Iterable<? extends InputT> inputs, DoFn<?, ?>.ProcessContext c);
-
- /**
- * Forwards the call to a {@link PerKeyCombineFn} to add all inputs in a {@link DoFn}.
- *
- * <p>It constructs a {@code CombineWithContext.Context} from {@code DoFn.ProcessContext}
- * if it is required.
- */
- public AccumT addInputs(K key, Iterable<InputT> inputs, DoFn<?, ?>.ProcessContext c);
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * Forwards the call to a {@link PerKeyCombineFn} to create the accumulator.
- *
- * <p>It constructs a {@code CombineWithContext.Context} from
- * {@link PipelineOptions} and {@link SideInputReader} if it is required.
- */
- public AccumT createAccumulator(K key, PipelineOptions options,
- SideInputReader sideInputReader, Collection<? extends BoundedWindow> windows);
-
- /**
- * Forwards the call to a {@link PerKeyCombineFn} to add the input.
- *
- * <p>It constructs a {@code CombineWithContext.Context} from
- * {@link PipelineOptions} and {@link SideInputReader} if it is required.
- */
- public AccumT addInput(K key, AccumT accumulator, InputT value, PipelineOptions options,
- SideInputReader sideInputReader, Collection<? extends BoundedWindow> windows);
-
- /**
- * Forwards the call to a {@link PerKeyCombineFn} to merge accumulators.
- *
- * <p>It constructs a {@code CombineWithContext.Context} from
- * {@link PipelineOptions} and {@link SideInputReader} if it is required.
- */
- public AccumT mergeAccumulators(K key, Iterable<AccumT> accumulators, PipelineOptions options,
- SideInputReader sideInputReader, Collection<? extends BoundedWindow> windows);
-
- /**
- * Forwards the call to a {@link PerKeyCombineFn} to extract the output.
- *
- * <p>It constructs a {@code CombineWithContext.Context} from
- * {@link PipelineOptions} and {@link SideInputReader} if it is required.
- */
- public OutputT extractOutput(K key, AccumT accumulator, PipelineOptions options,
- SideInputReader sideInputReader, Collection<? extends BoundedWindow> windows);
-
- /**
- * Forwards the call to a {@link PerKeyCombineFn} to compact the accumulator.
- *
- * <p>It constructs a {@code CombineWithContext.Context} from
- * {@link PipelineOptions} and {@link SideInputReader} if it is required.
- */
- public AccumT compact(K key, AccumT accumulator, PipelineOptions options,
- SideInputReader sideInputReader, Collection<? extends BoundedWindow> windows);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PerKeyCombineFnRunners.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PerKeyCombineFnRunners.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PerKeyCombineFnRunners.java
deleted file mode 100644
index 6606c54..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PerKeyCombineFnRunners.java
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.Combine.KeyedCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineFnBase.PerKeyCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.KeyedCombineFnWithContext;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.RequiresContextInternal;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.common.collect.Iterables;
-
-import java.util.Collection;
-
-/**
- * Static utility methods that provide {@link PerKeyCombineFnRunner} implementations
- * for different keyed combine functions.
- */
-public class PerKeyCombineFnRunners {
- /**
- * Returns a {@link PerKeyCombineFnRunner} from a {@link PerKeyCombineFn}.
- */
- public static <K, InputT, AccumT, OutputT> PerKeyCombineFnRunner<K, InputT, AccumT, OutputT>
- create(PerKeyCombineFn<K, InputT, AccumT, OutputT> perKeyCombineFn) {
- if (perKeyCombineFn instanceof RequiresContextInternal) {
- return new KeyedCombineFnWithContextRunner<>(
- (KeyedCombineFnWithContext<K, InputT, AccumT, OutputT>) perKeyCombineFn);
- } else {
- return new KeyedCombineFnRunner<>(
- (KeyedCombineFn<K, InputT, AccumT, OutputT>) perKeyCombineFn);
- }
- }
-
- /**
- * An implementation of {@link PerKeyCombineFnRunner} with {@link KeyedCombineFn}.
- *
- * It forwards function calls to the {@link KeyedCombineFn}.
- */
- private static class KeyedCombineFnRunner<K, InputT, AccumT, OutputT>
- implements PerKeyCombineFnRunner<K, InputT, AccumT, OutputT> {
- private final KeyedCombineFn<K, InputT, AccumT, OutputT> keyedCombineFn;
-
- private KeyedCombineFnRunner(
- KeyedCombineFn<K, InputT, AccumT, OutputT> keyedCombineFn) {
- this.keyedCombineFn = keyedCombineFn;
- }
-
- @Override
- public KeyedCombineFn<K, InputT, AccumT, OutputT> fn() {
- return keyedCombineFn;
- }
-
- @Override
- public AccumT createAccumulator(K key, DoFn<?, ?>.ProcessContext c) {
- return keyedCombineFn.createAccumulator(key);
- }
-
- @Override
- public AccumT addInput(
- K key, AccumT accumulator, InputT input, DoFn<?, ?>.ProcessContext c) {
- return keyedCombineFn.addInput(key, accumulator, input);
- }
-
- @Override
- public AccumT mergeAccumulators(
- K key, Iterable<AccumT> accumulators, DoFn<?, ?>.ProcessContext c) {
- return keyedCombineFn.mergeAccumulators(key, accumulators);
- }
-
- @Override
- public OutputT extractOutput(K key, AccumT accumulator, DoFn<?, ?>.ProcessContext c) {
- return keyedCombineFn.extractOutput(key, accumulator);
- }
-
- @Override
- public AccumT compact(K key, AccumT accumulator, DoFn<?, ?>.ProcessContext c) {
- return keyedCombineFn.compact(key, accumulator);
- }
-
- @Override
- public OutputT apply(K key, Iterable<? extends InputT> inputs, DoFn<?, ?>.ProcessContext c) {
- return keyedCombineFn.apply(key, inputs);
- }
-
- @Override
- public AccumT addInputs(K key, Iterable<InputT> inputs, DoFn<?, ?>.ProcessContext c) {
- AccumT accum = keyedCombineFn.createAccumulator(key);
- for (InputT input : inputs) {
- accum = keyedCombineFn.addInput(key, accum, input);
- }
- return accum;
- }
-
- @Override
- public String toString() {
- return keyedCombineFn.toString();
- }
-
- @Override
- public AccumT createAccumulator(K key, PipelineOptions options,
- SideInputReader sideInputReader, Collection<? extends BoundedWindow> windows) {
- return keyedCombineFn.createAccumulator(key);
- }
-
- @Override
- public AccumT addInput(K key, AccumT accumulator, InputT input, PipelineOptions options,
- SideInputReader sideInputReader, Collection<? extends BoundedWindow> windows) {
- return keyedCombineFn.addInput(key, accumulator, input);
- }
-
- @Override
- public AccumT mergeAccumulators(K key, Iterable<AccumT> accumulators, PipelineOptions options,
- SideInputReader sideInputReader, Collection<? extends BoundedWindow> windows) {
- return keyedCombineFn.mergeAccumulators(key, accumulators);
- }
-
- @Override
- public OutputT extractOutput(K key, AccumT accumulator, PipelineOptions options,
- SideInputReader sideInputReader, Collection<? extends BoundedWindow> windows) {
- return keyedCombineFn.extractOutput(key, accumulator);
- }
-
- @Override
- public AccumT compact(K key, AccumT accumulator, PipelineOptions options,
- SideInputReader sideInputReader, Collection<? extends BoundedWindow> windows) {
- return keyedCombineFn.compact(key, accumulator);
- }
- }
-
- /**
- * An implementation of {@link PerKeyCombineFnRunner} with {@link KeyedCombineFnWithContext}.
- *
- * It forwards function calls to the {@link KeyedCombineFnWithContext}.
- */
- private static class KeyedCombineFnWithContextRunner<K, InputT, AccumT, OutputT>
- implements PerKeyCombineFnRunner<K, InputT, AccumT, OutputT> {
- private final KeyedCombineFnWithContext<K, InputT, AccumT, OutputT> keyedCombineFnWithContext;
-
- private KeyedCombineFnWithContextRunner(
- KeyedCombineFnWithContext<K, InputT, AccumT, OutputT> keyedCombineFnWithContext) {
- this.keyedCombineFnWithContext = keyedCombineFnWithContext;
- }
-
- @Override
- public KeyedCombineFnWithContext<K, InputT, AccumT, OutputT> fn() {
- return keyedCombineFnWithContext;
- }
-
- @Override
- public AccumT createAccumulator(K key, DoFn<?, ?>.ProcessContext c) {
- return keyedCombineFnWithContext.createAccumulator(key,
- CombineContextFactory.createFromProcessContext(c));
- }
-
- @Override
- public AccumT addInput(
- K key, AccumT accumulator, InputT value, DoFn<?, ?>.ProcessContext c) {
- return keyedCombineFnWithContext.addInput(key, accumulator, value,
- CombineContextFactory.createFromProcessContext(c));
- }
-
- @Override
- public AccumT mergeAccumulators(
- K key, Iterable<AccumT> accumulators, DoFn<?, ?>.ProcessContext c) {
- return keyedCombineFnWithContext.mergeAccumulators(
- key, accumulators, CombineContextFactory.createFromProcessContext(c));
- }
-
- @Override
- public OutputT extractOutput(K key, AccumT accumulator, DoFn<?, ?>.ProcessContext c) {
- return keyedCombineFnWithContext.extractOutput(key, accumulator,
- CombineContextFactory.createFromProcessContext(c));
- }
-
- @Override
- public AccumT compact(K key, AccumT accumulator, DoFn<?, ?>.ProcessContext c) {
- return keyedCombineFnWithContext.compact(key, accumulator,
- CombineContextFactory.createFromProcessContext(c));
- }
-
- @Override
- public OutputT apply(K key, Iterable<? extends InputT> inputs, DoFn<?, ?>.ProcessContext c) {
- return keyedCombineFnWithContext.apply(key, inputs,
- CombineContextFactory.createFromProcessContext(c));
- }
-
- @Override
- public AccumT addInputs(K key, Iterable<InputT> inputs, DoFn<?, ?>.ProcessContext c) {
- CombineWithContext.Context combineContext = CombineContextFactory.createFromProcessContext(c);
- AccumT accum = keyedCombineFnWithContext.createAccumulator(key, combineContext);
- for (InputT input : inputs) {
- accum = keyedCombineFnWithContext.addInput(key, accum, input, combineContext);
- }
- return accum;
- }
-
- @Override
- public String toString() {
- return keyedCombineFnWithContext.toString();
- }
-
- @Override
- public AccumT createAccumulator(K key, PipelineOptions options, SideInputReader sideInputReader,
- Collection<? extends BoundedWindow> windows) {
- return keyedCombineFnWithContext.createAccumulator(key,
- CombineContextFactory.createFromComponents(
- options, sideInputReader, Iterables.getOnlyElement(windows)));
- }
-
- @Override
- public AccumT addInput(K key, AccumT accumulator, InputT input, PipelineOptions options,
- SideInputReader sideInputReader, Collection<? extends BoundedWindow> windows) {
- return keyedCombineFnWithContext.addInput(key, accumulator, input,
- CombineContextFactory.createFromComponents(
- options, sideInputReader, Iterables.getOnlyElement(windows)));
- }
-
- @Override
- public AccumT mergeAccumulators(K key, Iterable<AccumT> accumulators, PipelineOptions options,
- SideInputReader sideInputReader, Collection<? extends BoundedWindow> windows) {
- return keyedCombineFnWithContext.mergeAccumulators(key, accumulators,
- CombineContextFactory.createFromComponents(
- options, sideInputReader, Iterables.getOnlyElement(windows)));
- }
-
- @Override
- public OutputT extractOutput(K key, AccumT accumulator, PipelineOptions options,
- SideInputReader sideInputReader, Collection<? extends BoundedWindow> windows) {
- return keyedCombineFnWithContext.extractOutput(key, accumulator,
- CombineContextFactory.createFromComponents(
- options, sideInputReader, Iterables.getOnlyElement(windows)));
- }
-
- @Override
- public AccumT compact(K key, AccumT accumulator, PipelineOptions options,
- SideInputReader sideInputReader, Collection<? extends BoundedWindow> windows) {
- return keyedCombineFnWithContext.compact(key, accumulator,
- CombineContextFactory.createFromComponents(
- options, sideInputReader, Iterables.getOnlyElement(windows)));
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PropertyNames.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PropertyNames.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PropertyNames.java
deleted file mode 100644
index 81572ea..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PropertyNames.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-/**
- * Constant property names used by the SDK in CloudWorkflow specifications.
- */
-public class PropertyNames {
- public static final String ALLOWED_ENCODINGS = "allowed_encodings";
- public static final String APPEND_TRAILING_NEWLINES = "append_trailing_newlines";
- public static final String BIGQUERY_CREATE_DISPOSITION = "create_disposition";
- public static final String BIGQUERY_DATASET = "dataset";
- public static final String BIGQUERY_PROJECT = "project";
- public static final String BIGQUERY_SCHEMA = "schema";
- public static final String BIGQUERY_TABLE = "table";
- public static final String BIGQUERY_QUERY = "bigquery_query";
- public static final String BIGQUERY_FLATTEN_RESULTS = "bigquery_flatten_results";
- public static final String BIGQUERY_WRITE_DISPOSITION = "write_disposition";
- public static final String BIGQUERY_EXPORT_FORMAT = "bigquery_export_format";
- public static final String BIGQUERY_EXPORT_SCHEMA = "bigquery_export_schema";
- public static final String CO_GBK_RESULT_SCHEMA = "co_gbk_result_schema";
- public static final String COMBINE_FN = "combine_fn";
- public static final String COMPONENT_ENCODINGS = "component_encodings";
- public static final String COMPRESSION_TYPE = "compression_type";
- public static final String CUSTOM_SOURCE_FORMAT = "custom_source";
- public static final String CONCAT_SOURCE_SOURCES = "sources";
- public static final String CONCAT_SOURCE_BASE_SPECS = "base_specs";
- public static final String SOURCE_STEP_INPUT = "custom_source_step_input";
- public static final String SOURCE_SPEC = "spec";
- public static final String SOURCE_METADATA = "metadata";
- public static final String SOURCE_DOES_NOT_NEED_SPLITTING = "does_not_need_splitting";
- public static final String SOURCE_PRODUCES_SORTED_KEYS = "produces_sorted_keys";
- public static final String SOURCE_IS_INFINITE = "is_infinite";
- public static final String SOURCE_ESTIMATED_SIZE_BYTES = "estimated_size_bytes";
- public static final String ELEMENT = "element";
- public static final String ELEMENTS = "elements";
- public static final String ENCODING = "encoding";
- public static final String ENCODING_ID = "encoding_id";
- public static final String END_INDEX = "end_index";
- public static final String END_OFFSET = "end_offset";
- public static final String END_SHUFFLE_POSITION = "end_shuffle_position";
- public static final String ENVIRONMENT_VERSION_JOB_TYPE_KEY = "job_type";
- public static final String ENVIRONMENT_VERSION_MAJOR_KEY = "major";
- public static final String FILENAME = "filename";
- public static final String FILENAME_PREFIX = "filename_prefix";
- public static final String FILENAME_SUFFIX = "filename_suffix";
- public static final String FILEPATTERN = "filepattern";
- public static final String FOOTER = "footer";
- public static final String FORMAT = "format";
- public static final String HEADER = "header";
- public static final String INPUTS = "inputs";
- public static final String INPUT_CODER = "input_coder";
- public static final String IS_GENERATED = "is_generated";
- public static final String IS_MERGING_WINDOW_FN = "is_merging_window_fn";
- public static final String IS_PAIR_LIKE = "is_pair_like";
- public static final String IS_STREAM_LIKE = "is_stream_like";
- public static final String IS_WRAPPER = "is_wrapper";
- public static final String DISALLOW_COMBINER_LIFTING = "disallow_combiner_lifting";
- public static final String NON_PARALLEL_INPUTS = "non_parallel_inputs";
- public static final String NUM_SHARD_CODERS = "num_shard_coders";
- public static final String NUM_METADATA_SHARD_CODERS = "num_metadata_shard_coders";
- public static final String NUM_SHARDS = "num_shards";
- public static final String OBJECT_TYPE_NAME = "@type";
- public static final String OUTPUT = "output";
- public static final String OUTPUT_INFO = "output_info";
- public static final String OUTPUT_NAME = "output_name";
- public static final String PARALLEL_INPUT = "parallel_input";
- public static final String PHASE = "phase";
- public static final String PUBSUB_ID_LABEL = "pubsub_id_label";
- public static final String PUBSUB_SUBSCRIPTION = "pubsub_subscription";
- public static final String PUBSUB_TIMESTAMP_LABEL = "pubsub_timestamp_label";
- public static final String PUBSUB_TOPIC = "pubsub_topic";
- public static final String SCALAR_FIELD_NAME = "value";
- public static final String SERIALIZED_FN = "serialized_fn";
- public static final String SHARD_NAME_TEMPLATE = "shard_template";
- public static final String SHUFFLE_KIND = "shuffle_kind";
- public static final String SHUFFLE_READER_CONFIG = "shuffle_reader_config";
- public static final String SHUFFLE_WRITER_CONFIG = "shuffle_writer_config";
- public static final String SORT_VALUES = "sort_values";
- public static final String START_INDEX = "start_index";
- public static final String START_OFFSET = "start_offset";
- public static final String START_SHUFFLE_POSITION = "start_shuffle_position";
- public static final String STRIP_TRAILING_NEWLINES = "strip_trailing_newlines";
- public static final String TUPLE_TAGS = "tuple_tags";
- public static final String USE_INDEXED_FORMAT = "use_indexed_format";
- public static final String USER_FN = "user_fn";
- public static final String USER_NAME = "user_name";
- public static final String USES_KEYED_STATE = "uses_keyed_state";
- public static final String VALIDATE_SINK = "validate_sink";
- public static final String VALIDATE_SOURCE = "validate_source";
- public static final String VALUE = "value";
- public static final String DISPLAY_DATA = "display_data";
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/RandomAccessData.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/RandomAccessData.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/RandomAccessData.java
deleted file mode 100644
index 6c96c8e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/RandomAccessData.java
+++ /dev/null
@@ -1,352 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.google.cloud.dataflow.sdk.coders.AtomicCoder;
-import com.google.cloud.dataflow.sdk.coders.ByteArrayCoder;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.common.base.MoreObjects;
-import com.google.common.io.ByteStreams;
-import com.google.common.primitives.UnsignedBytes;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Arrays;
-import java.util.Comparator;
-
-import javax.annotation.concurrent.NotThreadSafe;
-
-/**
- * An elastic-sized byte array which allows you to manipulate it as a stream, or access
- * it directly. This allows for a quick succession of moving bytes from an {@link InputStream}
- * to this wrapper to be used as an {@link OutputStream} and vice versa. This wrapper
- * also provides random access to bytes stored within. This wrapper allows users to finely
- * control the number of byte copies that occur.
- *
- * Anything stored within the in-memory buffer from offset {@link #size()} is considered temporary
- * unused storage.
- */
-@NotThreadSafe
-public class RandomAccessData {
- /**
- * A {@link Coder} which encodes the valid parts of this stream.
- * This follows the same encoding scheme as {@link ByteArrayCoder}.
- * This coder is deterministic and consistent with equals.
- *
- * This coder does not support encoding positive infinity.
- */
- public static class RandomAccessDataCoder extends AtomicCoder<RandomAccessData> {
- private static final RandomAccessDataCoder INSTANCE = new RandomAccessDataCoder();
-
- @JsonCreator
- public static RandomAccessDataCoder of() {
- return INSTANCE;
- }
-
- @Override
- public void encode(RandomAccessData value, OutputStream outStream, Coder.Context context)
- throws CoderException, IOException {
- if (value == POSITIVE_INFINITY) {
- throw new CoderException("Positive infinity can not be encoded.");
- }
- if (!context.isWholeStream) {
- VarInt.encode(value.size, outStream);
- }
- value.writeTo(outStream, 0, value.size);
- }
-
- @Override
- public RandomAccessData decode(InputStream inStream, Coder.Context context)
- throws CoderException, IOException {
- RandomAccessData rval = new RandomAccessData();
- if (!context.isWholeStream) {
- int length = VarInt.decodeInt(inStream);
- rval.readFrom(inStream, 0, length);
- } else {
- ByteStreams.copy(inStream, rval.asOutputStream());
- }
- return rval;
- }
-
- @Override
- public boolean consistentWithEquals() {
- return true;
- }
-
- @Override
- public boolean isRegisterByteSizeObserverCheap(
- RandomAccessData value, Coder.Context context) {
- return true;
- }
-
- @Override
- protected long getEncodedElementByteSize(RandomAccessData value, Coder.Context context)
- throws Exception {
- if (value == null) {
- throw new CoderException("cannot encode a null in memory stream");
- }
- long size = 0;
- if (!context.isWholeStream) {
- size += VarInt.getLength(value.size);
- }
- return size + value.size;
- }
- }
-
- public static final UnsignedLexicographicalComparator UNSIGNED_LEXICOGRAPHICAL_COMPARATOR =
- new UnsignedLexicographicalComparator();
-
- /**
- * A {@link Comparator} that compares two byte arrays lexicographically. It compares
- * values as a list of unsigned bytes. The first pair of values that follows any common prefix
- * determines the order; when one array is a prefix of the other, the shorter array is the lesser.
- * For example, [] < [0x01] < [0x01, 0x7F] < [0x01, 0x80] < [0x02] < POSITIVE INFINITY.
- *
- * <p>Note that a token type of positive infinity is supported and is greater than
- * all other {@link RandomAccessData}.
- */
- public static final class UnsignedLexicographicalComparator
- implements Comparator<RandomAccessData> {
- // Do not instantiate
- private UnsignedLexicographicalComparator() {
- }
-
- @Override
- public int compare(RandomAccessData o1, RandomAccessData o2) {
- return compare(o1, o2, 0 /* start from the beginning */);
- }
-
- /**
- * Compare the two sets of bytes starting at the given offset.
- */
- public int compare(RandomAccessData o1, RandomAccessData o2, int startOffset) {
- if (o1 == o2) {
- return 0;
- }
- if (o1 == POSITIVE_INFINITY) {
- return 1;
- }
- if (o2 == POSITIVE_INFINITY) {
- return -1;
- }
-
- int minBytesLen = Math.min(o1.size, o2.size);
- for (int i = startOffset; i < minBytesLen; i++) {
- // unsigned comparison
- int b1 = o1.buffer[i] & 0xFF;
- int b2 = o2.buffer[i] & 0xFF;
- if (b1 == b2) {
- continue;
- }
- // Return the stream with the smaller byte as the smaller value.
- return b1 - b2;
- }
- // If one is a prefix of the other, return the shorter one as the smaller one.
- // If both lengths are equal, then both streams are equal.
- return o1.size - o2.size;
- }
-
- /**
- * Compute the length of the common prefix of the two provided sets of bytes.
- */
- public int commonPrefixLength(RandomAccessData o1, RandomAccessData o2) {
- int minBytesLen = Math.min(o1.size, o2.size);
- for (int i = 0; i < minBytesLen; i++) {
- // unsigned comparison
- int b1 = o1.buffer[i] & 0xFF;
- int b2 = o2.buffer[i] & 0xFF;
- if (b1 != b2) {
- return i;
- }
- }
- return minBytesLen;
- }
- }
-
- /** A token type representing positive infinity. */
- static final RandomAccessData POSITIVE_INFINITY = new RandomAccessData(0);
-
- /**
- * Returns a RandomAccessData of the same length that is strictly greater than this, obtained by
- * incrementing the last byte that is not 0xFF. Note that if this is empty or is all 0xFF then
- * a token value of positive infinity is returned.
- *
- * The {@link UnsignedLexicographicalComparator} supports comparing {@link RandomAccessData}
- * with support for positive infinity.
- */
- public RandomAccessData increment() throws IOException {
- RandomAccessData copy = copy();
- for (int i = copy.size - 1; i >= 0; --i) {
- if (copy.buffer[i] != UnsignedBytes.MAX_VALUE) {
- copy.buffer[i] = UnsignedBytes.checkedCast(UnsignedBytes.toInt(copy.buffer[i]) + 1);
- return copy;
- }
- }
- return POSITIVE_INFINITY;
- }
-
- private static final int DEFAULT_INITIAL_BUFFER_SIZE = 128;
-
- /** Constructs a RandomAccessData with a default buffer size. */
- public RandomAccessData() {
- this(DEFAULT_INITIAL_BUFFER_SIZE);
- }
-
- /** Constructs a RandomAccessData with the initial buffer. */
- public RandomAccessData(byte[] initialBuffer) {
- checkNotNull(initialBuffer);
- this.buffer = initialBuffer;
- this.size = initialBuffer.length;
- }
-
- /** Constructs a RandomAccessData with the given buffer size. */
- public RandomAccessData(int initialBufferSize) {
- checkArgument(initialBufferSize >= 0, "Expected initial buffer size to be greater than zero.");
- this.buffer = new byte[initialBufferSize];
- }
-
- private byte[] buffer;
- private int size;
-
- /** Returns the backing array. */
- public byte[] array() {
- return buffer;
- }
-
- /** Returns the number of bytes in the backing array that are valid. */
- public int size() {
- return size;
- }
-
- /** Resets the end of the stream to the specified position. */
- public void resetTo(int position) {
- ensureCapacity(position);
- size = position;
- }
-
- private final OutputStream outputStream = new OutputStream() {
- @Override
- public void write(int b) throws IOException {
- ensureCapacity(size + 1);
- buffer[size] = (byte) b;
- size += 1;
- }
-
- @Override
- public void write(byte[] b, int offset, int length) throws IOException {
- ensureCapacity(size + length);
- System.arraycopy(b, offset, buffer, size, length);
- size += length;
- }
- };
-
- /**
- * Returns an output stream which writes to the backing buffer from the current position.
- * Note that the internal buffer will grow as required to accommodate all data written.
- */
- public OutputStream asOutputStream() {
- return outputStream;
- }
-
- /**
- * Returns an {@link InputStream} wrapper which supplies the portion of this backing byte buffer
- * starting at {@code offset} and up to {@code length} bytes. Note that the returned
- * {@link InputStream} is only a wrapper and any modifications to the underlying
- * {@link RandomAccessData} will be visible by the {@link InputStream}.
- */
- public InputStream asInputStream(final int offset, final int length) {
- return new ByteArrayInputStream(buffer, offset, length);
- }
-
- /**
- * Writes {@code length} bytes starting at {@code offset} from the backing data store to the
- * specified output stream.
- */
- public void writeTo(OutputStream out, int offset, int length) throws IOException {
- out.write(buffer, offset, length);
- }
-
- /**
- * Reads {@code length} bytes from the specified input stream writing them into the backing
- * data store starting at {@code offset}.
- *
- * <p>Note that the in memory stream will be grown to ensure there is enough capacity.
- */
- public void readFrom(InputStream inStream, int offset, int length) throws IOException {
- ensureCapacity(offset + length);
- ByteStreams.readFully(inStream, buffer, offset, length);
- size = offset + length;
- }
-
- /** Returns a copy of this RandomAccessData. */
- public RandomAccessData copy() throws IOException {
- RandomAccessData copy = new RandomAccessData(size);
- writeTo(copy.asOutputStream(), 0, size);
- return copy;
- }
-
- @Override
- public boolean equals(Object other) {
- if (other == this) {
- return true;
- }
- if (!(other instanceof RandomAccessData)) {
- return false;
- }
- return UNSIGNED_LEXICOGRAPHICAL_COMPARATOR.compare(this, (RandomAccessData) other) == 0;
- }
-
- @Override
- public int hashCode() {
- int result = 1;
- for (int i = 0; i < size; ++i) {
- result = 31 * result + buffer[i];
- }
-
- return result;
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(this)
- .add("buffer", Arrays.copyOf(buffer, size))
- .add("size", size)
- .toString();
- }
-
- private void ensureCapacity(int minCapacity) {
- // If we have enough space, don't grow the buffer.
- if (minCapacity <= buffer.length) {
- return;
- }
-
- // Try to double the size of the buffer; if that's not enough, just use the new capacity.
- // Note that we use Math.min(long, long) to not cause overflow on the multiplication.
- int newCapacity = (int) Math.min(Integer.MAX_VALUE, buffer.length * 2L);
- if (newCapacity < minCapacity) {
- newCapacity = minCapacity;
- }
- buffer = Arrays.copyOf(buffer, newCapacity);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReduceFn.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReduceFn.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReduceFn.java
deleted file mode 100644
index c5ef2ea..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReduceFn.java
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo;
-import com.google.cloud.dataflow.sdk.util.state.MergingStateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.ReadableState;
-import com.google.cloud.dataflow.sdk.util.state.StateAccessor;
-
-import org.joda.time.Instant;
-
-import java.io.Serializable;
-
-/**
- * Specification for processing to happen after elements have been grouped by key.
- *
- * @param <K> The type of key being processed.
- * @param <InputT> The type of input values associated with the key.
- * @param <OutputT> The output type that will be produced for each key.
- * @param <W> The type of windows this operates on.
- */
-public abstract class ReduceFn<K, InputT, OutputT, W extends BoundedWindow>
- implements Serializable {
-
- /** Information accessible to all the processing methods in this {@code ReduceFn}. */
- public abstract class Context {
- /** Return the key that is being processed. */
- public abstract K key();
-
- /** The window that is being processed. */
- public abstract W window();
-
- /** Access the current {@link WindowingStrategy}. */
- public abstract WindowingStrategy<?, W> windowingStrategy();
-
- /** Return the interface for accessing state. */
- public abstract StateAccessor<K> state();
-
- /** Return the interface for accessing timers. */
- public abstract Timers timers();
- }
-
- /** Information accessible within {@link #processValue}. */
- public abstract class ProcessValueContext extends Context {
- /** Return the actual value being processed. */
- public abstract InputT value();
-
- /** Return the timestamp associated with the value. */
- public abstract Instant timestamp();
- }
-
- /** Information accessible within {@link #onMerge}. */
- public abstract class OnMergeContext extends Context {
- /** Return the interface for accessing state. */
- @Override
- public abstract MergingStateAccessor<K, W> state();
- }
-
- /** Information accessible within {@link #onTrigger}. */
- public abstract class OnTriggerContext extends Context {
- /** Returns the {@link PaneInfo} for the trigger firing being processed. */
- public abstract PaneInfo paneInfo();
-
- /** Output the given value in the current window. */
- public abstract void output(OutputT value);
- }
-
- //////////////////////////////////////////////////////////////////////////////////////////////////
-
- /**
- * Called for each value of type {@code InputT} associated with the current key.
- */
- public abstract void processValue(ProcessValueContext c) throws Exception;
-
- /**
- * Called when windows are merged.
- */
- public abstract void onMerge(OnMergeContext context) throws Exception;
-
- /**
- * Called when triggers fire.
- *
- * <p>Implementations of {@link ReduceFn} should call {@link OnTriggerContext#output} to emit
- * any results that should be included in the pane produced by this trigger firing.
- */
- public abstract void onTrigger(OnTriggerContext context) throws Exception;
-
- /**
- * Called before {@link #onMerge} is invoked to provide an opportunity to prefetch any needed
- * state.
- *
- * @param c Context to prefetch from.
- */
- public void prefetchOnMerge(MergingStateAccessor<K, W> c) throws Exception {}
-
- /**
- * Called before {@link #onTrigger} is invoked to provide an opportunity to prefetch any needed
- * state.
- *
- * @param context Context to prefetch from.
- */
- public void prefetchOnTrigger(StateAccessor<K> context) {}
-
- /**
- * Called to clear any persisted state that the {@link ReduceFn} may be holding. This will be
- * called when the windowing is closing and will receive no future interactions.
- */
- public abstract void clearState(Context context) throws Exception;
-
- /**
- * Returns true if there is no buffered state.
- */
- public abstract ReadableState<Boolean> isEmpty(StateAccessor<K> context);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReduceFnContextFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReduceFnContextFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReduceFnContextFactory.java
deleted file mode 100644
index bdbaf10..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReduceFnContextFactory.java
+++ /dev/null
@@ -1,495 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo;
-import com.google.cloud.dataflow.sdk.util.TimerInternals.TimerData;
-import com.google.cloud.dataflow.sdk.util.state.MergingStateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.ReadableState;
-import com.google.cloud.dataflow.sdk.util.state.State;
-import com.google.cloud.dataflow.sdk.util.state.StateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.StateContext;
-import com.google.cloud.dataflow.sdk.util.state.StateContexts;
-import com.google.cloud.dataflow.sdk.util.state.StateInternals;
-import com.google.cloud.dataflow.sdk.util.state.StateNamespace;
-import com.google.cloud.dataflow.sdk.util.state.StateNamespaces;
-import com.google.cloud.dataflow.sdk.util.state.StateNamespaces.WindowNamespace;
-import com.google.cloud.dataflow.sdk.util.state.StateTag;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.ImmutableMap;
-
-import org.joda.time.Instant;
-
-import java.util.Collection;
-import java.util.Map;
-
-import javax.annotation.Nullable;
-
-/**
- * Factory for creating instances of the various {@link ReduceFn} contexts.
- */
-class ReduceFnContextFactory<K, InputT, OutputT, W extends BoundedWindow> {
- public interface OnTriggerCallbacks<OutputT> {
- void output(OutputT toOutput);
- }
-
- private final K key;
- private final ReduceFn<K, InputT, OutputT, W> reduceFn;
- private final WindowingStrategy<?, W> windowingStrategy;
- private final StateInternals<K> stateInternals;
- private final ActiveWindowSet<W> activeWindows;
- private final TimerInternals timerInternals;
- private final WindowingInternals<?, ?> windowingInternals;
- private final PipelineOptions options;
-
- ReduceFnContextFactory(K key, ReduceFn<K, InputT, OutputT, W> reduceFn,
- WindowingStrategy<?, W> windowingStrategy, StateInternals<K> stateInternals,
- ActiveWindowSet<W> activeWindows, TimerInternals timerInternals,
- WindowingInternals<?, ?> windowingInternals, PipelineOptions options) {
- this.key = key;
- this.reduceFn = reduceFn;
- this.windowingStrategy = windowingStrategy;
- this.stateInternals = stateInternals;
- this.activeWindows = activeWindows;
- this.timerInternals = timerInternals;
- this.windowingInternals = windowingInternals;
- this.options = options;
- }
-
- /** Where should we look for state associated with a given window? */
- public static enum StateStyle {
- /** All state is associated with the window itself. */
- DIRECT,
- /** State is associated with the 'state address' windows tracked by the active window set. */
- RENAMED
- }
-
- private StateAccessorImpl<K, W> stateAccessor(W window, StateStyle style) {
- return new StateAccessorImpl<K, W>(
- activeWindows, windowingStrategy.getWindowFn().windowCoder(),
- stateInternals, StateContexts.createFromComponents(options, windowingInternals, window),
- style);
- }
-
- public ReduceFn<K, InputT, OutputT, W>.Context base(W window, StateStyle style) {
- return new ContextImpl(stateAccessor(window, style));
- }
-
- public ReduceFn<K, InputT, OutputT, W>.ProcessValueContext forValue(
- W window, InputT value, Instant timestamp, StateStyle style) {
- return new ProcessValueContextImpl(stateAccessor(window, style), value, timestamp);
- }
-
- public ReduceFn<K, InputT, OutputT, W>.OnTriggerContext forTrigger(W window,
- ReadableState<PaneInfo> pane, StateStyle style, OnTriggerCallbacks<OutputT> callbacks) {
- return new OnTriggerContextImpl(stateAccessor(window, style), pane, callbacks);
- }
-
- public ReduceFn<K, InputT, OutputT, W>.OnMergeContext forMerge(
- Collection<W> activeToBeMerged, W mergeResult, StateStyle style) {
- return new OnMergeContextImpl(
- new MergingStateAccessorImpl<K, W>(activeWindows,
- windowingStrategy.getWindowFn().windowCoder(),
- stateInternals, style, activeToBeMerged, mergeResult));
- }
-
- public ReduceFn<K, InputT, OutputT, W>.OnMergeContext forPremerge(W window) {
- return new OnPremergeContextImpl(new PremergingStateAccessorImpl<K, W>(
- activeWindows, windowingStrategy.getWindowFn().windowCoder(), stateInternals, window));
- }
-
- private class TimersImpl implements Timers {
- private final StateNamespace namespace;
-
- public TimersImpl(StateNamespace namespace) {
- Preconditions.checkArgument(namespace instanceof WindowNamespace);
- this.namespace = namespace;
- }
-
- @Override
- public void setTimer(Instant timestamp, TimeDomain timeDomain) {
- timerInternals.setTimer(TimerData.of(namespace, timestamp, timeDomain));
- }
-
- @Override
- public void deleteTimer(Instant timestamp, TimeDomain timeDomain) {
- timerInternals.deleteTimer(TimerData.of(namespace, timestamp, timeDomain));
- }
-
- @Override
- public Instant currentProcessingTime() {
- return timerInternals.currentProcessingTime();
- }
-
- @Override
- @Nullable
- public Instant currentSynchronizedProcessingTime() {
- return timerInternals.currentSynchronizedProcessingTime();
- }
-
- @Override
- @Nullable
- public Instant currentEventTime() {
- return timerInternals.currentInputWatermarkTime();
- }
- }
-
- // ======================================================================
- // StateAccessors
- // ======================================================================
- static class StateAccessorImpl<K, W extends BoundedWindow> implements StateAccessor<K> {
-
-
- protected final ActiveWindowSet<W> activeWindows;
- protected final StateContext<W> context;
- protected final StateNamespace windowNamespace;
- protected final Coder<W> windowCoder;
- protected final StateInternals<K> stateInternals;
- protected final StateStyle style;
-
- public StateAccessorImpl(ActiveWindowSet<W> activeWindows, Coder<W> windowCoder,
- StateInternals<K> stateInternals, StateContext<W> context, StateStyle style) {
-
- this.activeWindows = activeWindows;
- this.windowCoder = windowCoder;
- this.stateInternals = stateInternals;
- this.context = checkNotNull(context);
- this.windowNamespace = namespaceFor(context.window());
- this.style = style;
- }
-
- protected StateNamespace namespaceFor(W window) {
- return StateNamespaces.window(windowCoder, window);
- }
-
- protected StateNamespace windowNamespace() {
- return windowNamespace;
- }
-
- W window() {
- return context.window();
- }
-
- StateNamespace namespace() {
- return windowNamespace();
- }
-
- @Override
- public <StateT extends State> StateT access(StateTag<? super K, StateT> address) {
- switch (style) {
- case DIRECT:
- return stateInternals.state(windowNamespace(), address, context);
- case RENAMED:
- return stateInternals.state(
- namespaceFor(activeWindows.writeStateAddress(context.window())), address, context);
- }
- throw new RuntimeException(); // cases are exhaustive.
- }
- }
-
- static class MergingStateAccessorImpl<K, W extends BoundedWindow>
- extends StateAccessorImpl<K, W> implements MergingStateAccessor<K, W> {
- private final Collection<W> activeToBeMerged;
-
- public MergingStateAccessorImpl(ActiveWindowSet<W> activeWindows, Coder<W> windowCoder,
- StateInternals<K> stateInternals, StateStyle style, Collection<W> activeToBeMerged,
- W mergeResult) {
- super(activeWindows, windowCoder, stateInternals,
- StateContexts.windowOnly(mergeResult), style);
- this.activeToBeMerged = activeToBeMerged;
- }
-
- @Override
- public <StateT extends State> StateT access(StateTag<? super K, StateT> address) {
- switch (style) {
- case DIRECT:
- return stateInternals.state(windowNamespace(), address, context);
- case RENAMED:
- return stateInternals.state(
- namespaceFor(activeWindows.mergedWriteStateAddress(
- activeToBeMerged, context.window())),
- address,
- context);
- }
- throw new RuntimeException(); // cases are exhaustive.
- }
-
- @Override
- public <StateT extends State> Map<W, StateT> accessInEachMergingWindow(
- StateTag<? super K, StateT> address) {
- ImmutableMap.Builder<W, StateT> builder = ImmutableMap.builder();
- for (W mergingWindow : activeToBeMerged) {
- StateNamespace namespace = null;
- switch (style) {
- case DIRECT:
- namespace = namespaceFor(mergingWindow);
- break;
- case RENAMED:
- namespace = namespaceFor(activeWindows.writeStateAddress(mergingWindow));
- break;
- }
- Preconditions.checkNotNull(namespace); // cases are exhaustive.
- builder.put(mergingWindow, stateInternals.state(namespace, address, context));
- }
- return builder.build();
- }
- }
-
- static class PremergingStateAccessorImpl<K, W extends BoundedWindow>
- extends StateAccessorImpl<K, W> implements MergingStateAccessor<K, W> {
- public PremergingStateAccessorImpl(ActiveWindowSet<W> activeWindows, Coder<W> windowCoder,
- StateInternals<K> stateInternals, W window) {
- super(activeWindows, windowCoder, stateInternals,
- StateContexts.windowOnly(window), StateStyle.RENAMED);
- }
-
- Collection<W> mergingWindows() {
- return activeWindows.readStateAddresses(context.window());
- }
-
- @Override
- public <StateT extends State> Map<W, StateT> accessInEachMergingWindow(
- StateTag<? super K, StateT> address) {
- ImmutableMap.Builder<W, StateT> builder = ImmutableMap.builder();
- for (W stateAddressWindow : activeWindows.readStateAddresses(context.window())) {
- StateT stateForWindow =
- stateInternals.state(namespaceFor(stateAddressWindow), address, context);
- builder.put(stateAddressWindow, stateForWindow);
- }
- return builder.build();
- }
- }
-
- // ======================================================================
- // Contexts
- // ======================================================================
-
- private class ContextImpl extends ReduceFn<K, InputT, OutputT, W>.Context {
- private final StateAccessorImpl<K, W> state;
- private final TimersImpl timers;
-
- private ContextImpl(StateAccessorImpl<K, W> state) {
- reduceFn.super();
- this.state = state;
- this.timers = new TimersImpl(state.namespace());
- }
-
- @Override
- public K key() {
- return key;
- }
-
- @Override
- public W window() {
- return state.window();
- }
-
- @Override
- public WindowingStrategy<?, W> windowingStrategy() {
- return windowingStrategy;
- }
-
- @Override
- public StateAccessor<K> state() {
- return state;
- }
-
- @Override
- public Timers timers() {
- return timers;
- }
- }
-
- private class ProcessValueContextImpl
- extends ReduceFn<K, InputT, OutputT, W>.ProcessValueContext {
- private final InputT value;
- private final Instant timestamp;
- private final StateAccessorImpl<K, W> state;
- private final TimersImpl timers;
-
- private ProcessValueContextImpl(StateAccessorImpl<K, W> state,
- InputT value, Instant timestamp) {
- reduceFn.super();
- this.state = state;
- this.value = value;
- this.timestamp = timestamp;
- this.timers = new TimersImpl(state.namespace());
- }
-
- @Override
- public K key() {
- return key;
- }
-
- @Override
- public W window() {
- return state.window();
- }
-
- @Override
- public WindowingStrategy<?, W> windowingStrategy() {
- return windowingStrategy;
- }
-
- @Override
- public StateAccessor<K> state() {
- return state;
- }
-
- @Override
- public InputT value() {
- return value;
- }
-
- @Override
- public Instant timestamp() {
- return timestamp;
- }
-
- @Override
- public Timers timers() {
- return timers;
- }
- }
-
- private class OnTriggerContextImpl extends ReduceFn<K, InputT, OutputT, W>.OnTriggerContext {
- private final StateAccessorImpl<K, W> state;
- private final ReadableState<PaneInfo> pane;
- private final OnTriggerCallbacks<OutputT> callbacks;
- private final TimersImpl timers;
-
- private OnTriggerContextImpl(StateAccessorImpl<K, W> state, ReadableState<PaneInfo> pane,
- OnTriggerCallbacks<OutputT> callbacks) {
- reduceFn.super();
- this.state = state;
- this.pane = pane;
- this.callbacks = callbacks;
- this.timers = new TimersImpl(state.namespace());
- }
-
- @Override
- public K key() {
- return key;
- }
-
- @Override
- public W window() {
- return state.window();
- }
-
- @Override
- public WindowingStrategy<?, W> windowingStrategy() {
- return windowingStrategy;
- }
-
- @Override
- public StateAccessor<K> state() {
- return state;
- }
-
- @Override
- public PaneInfo paneInfo() {
- return pane.read();
- }
-
- @Override
- public void output(OutputT value) {
- callbacks.output(value);
- }
-
- @Override
- public Timers timers() {
- return timers;
- }
- }
-
- private class OnMergeContextImpl extends ReduceFn<K, InputT, OutputT, W>.OnMergeContext {
- private final MergingStateAccessorImpl<K, W> state;
- private final TimersImpl timers;
-
- private OnMergeContextImpl(MergingStateAccessorImpl<K, W> state) {
- reduceFn.super();
- this.state = state;
- this.timers = new TimersImpl(state.namespace());
- }
-
- @Override
- public K key() {
- return key;
- }
-
- @Override
- public WindowingStrategy<?, W> windowingStrategy() {
- return windowingStrategy;
- }
-
- @Override
- public MergingStateAccessor<K, W> state() {
- return state;
- }
-
- @Override
- public W window() {
- return state.window();
- }
-
- @Override
- public Timers timers() {
- return timers;
- }
- }
-
- private class OnPremergeContextImpl extends ReduceFn<K, InputT, OutputT, W>.OnMergeContext {
- private final PremergingStateAccessorImpl<K, W> state;
- private final TimersImpl timers;
-
- private OnPremergeContextImpl(PremergingStateAccessorImpl<K, W> state) {
- reduceFn.super();
- this.state = state;
- this.timers = new TimersImpl(state.namespace());
- }
-
- @Override
- public K key() {
- return key;
- }
-
- @Override
- public WindowingStrategy<?, W> windowingStrategy() {
- return windowingStrategy;
- }
-
- @Override
- public MergingStateAccessor<K, W> state() {
- return state;
- }
-
- @Override
- public W window() {
- return state.window();
- }
-
- @Override
- public Timers timers() {
- return timers;
- }
- }
-}
[66/67] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/pom.xml
----------------------------------------------------------------------
diff --git a/maven-archetypes/pom.xml b/maven-archetypes/pom.xml
deleted file mode 100644
index 4565253..0000000
--- a/maven-archetypes/pom.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.beam</groupId>
- <artifactId>parent</artifactId>
- <version>0.1.0-incubating-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <artifactId>maven-archetypes-parent</artifactId>
- <packaging>pom</packaging>
-
- <name>Apache Beam :: Maven Archetypes</name>
-
- <modules>
- <module>starter</module>
- <module>examples</module>
- </modules>
-
-</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/starter/pom.xml
----------------------------------------------------------------------
diff --git a/maven-archetypes/starter/pom.xml b/maven-archetypes/starter/pom.xml
deleted file mode 100644
index 933e8b1..0000000
--- a/maven-archetypes/starter/pom.xml
+++ /dev/null
@@ -1,57 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.beam</groupId>
- <artifactId>maven-archetypes-parent</artifactId>
- <version>0.1.0-incubating-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <groupId>org.apache.beam</groupId>
- <artifactId>maven-archetypes-starter</artifactId>
- <name>Apache Beam :: Maven Archetypes :: Starter</name>
- <description>A Maven archetype to create a simple starter pipeline to
- get started using the Apache Beam Java SDK. </description>
-
- <packaging>maven-archetype</packaging>
-
- <build>
- <extensions>
- <extension>
- <groupId>org.apache.maven.archetype</groupId>
- <artifactId>archetype-packaging</artifactId>
- <version>2.4</version>
- </extension>
- </extensions>
-
- <pluginManagement>
- <plugins>
- <plugin>
- <artifactId>maven-archetype-plugin</artifactId>
- <version>2.4</version>
- </plugin>
- </plugins>
- </pluginManagement>
- </build>
-</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/starter/src/main/resources/META-INF/maven/archetype-metadata.xml
----------------------------------------------------------------------
diff --git a/maven-archetypes/starter/src/main/resources/META-INF/maven/archetype-metadata.xml b/maven-archetypes/starter/src/main/resources/META-INF/maven/archetype-metadata.xml
deleted file mode 100644
index bf75798..0000000
--- a/maven-archetypes/starter/src/main/resources/META-INF/maven/archetype-metadata.xml
+++ /dev/null
@@ -1,21 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<archetype-descriptor
- xsi:schemaLocation="http://maven.apache.org/plugins/maven-archetype-plugin/archetype-descriptor/1.0.0 http://maven.apache.org/xsd/archetype-descriptor-1.0.0.xsd"
- name="Google Cloud Dataflow Starter Pipeline Archetype"
- xmlns="http://maven.apache.org/plugins/maven-archetype-plugin/archetype-descriptor/1.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
- <requiredProperties>
- <requiredProperty key="targetPlatform">
- <defaultValue>1.7</defaultValue>
- </requiredProperty>
- </requiredProperties>
-
- <fileSets>
- <fileSet filtered="true" packaged="true" encoding="UTF-8">
- <directory>src/main/java</directory>
- <includes>
- <include>**/*.java</include>
- </includes>
- </fileSet>
- </fileSets>
-</archetype-descriptor>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml
----------------------------------------------------------------------
diff --git a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml
deleted file mode 100644
index 19e7d2d..0000000
--- a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml
+++ /dev/null
@@ -1,43 +0,0 @@
-<project xmlns="http://maven.apache.org/POM/4.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <groupId>${groupId}</groupId>
- <artifactId>${artifactId}</artifactId>
- <version>${version}</version>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-compiler-plugin</artifactId>
- <version>3.3</version>
- <configuration>
- <source>${targetPlatform}</source>
- <target>${targetPlatform}</target>
- </configuration>
- </plugin>
- </plugins>
- </build>
-
- <dependencies>
- <dependency>
- <groupId>org.apache.beam</groupId>
- <artifactId>java-sdk-all</artifactId>
- <version>[0-incubating, 1-incubating)</version>
- </dependency>
-
- <!-- slf4j API frontend binding with JUL backend -->
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- <version>1.7.7</version>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-jdk14</artifactId>
- <version>1.7.7</version>
- </dependency>
- </dependencies>
-</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/starter/src/main/resources/archetype-resources/src/main/java/StarterPipeline.java
----------------------------------------------------------------------
diff --git a/maven-archetypes/starter/src/main/resources/archetype-resources/src/main/java/StarterPipeline.java b/maven-archetypes/starter/src/main/resources/archetype-resources/src/main/java/StarterPipeline.java
deleted file mode 100644
index ffabbc0..0000000
--- a/maven-archetypes/starter/src/main/resources/archetype-resources/src/main/java/StarterPipeline.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package ${package};
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * A starter example for writing Google Cloud Dataflow programs.
- *
- * <p>The example takes two strings, converts them to their upper-case
- * representation and logs them.
- *
- * <p>To run this starter example locally using DirectPipelineRunner, just
- * execute it without any additional parameters from your favorite development
- * environment.
- *
- * <p>To run this starter example using managed resource in Google Cloud
- * Platform, you should specify the following command-line options:
- * --project=<YOUR_PROJECT_ID>
- * --stagingLocation=<STAGING_LOCATION_IN_CLOUD_STORAGE>
- * --runner=BlockingDataflowPipelineRunner
- */
-public class StarterPipeline {
- private static final Logger LOG = LoggerFactory.getLogger(StarterPipeline.class);
-
- public static void main(String[] args) {
- Pipeline p = Pipeline.create(
- PipelineOptionsFactory.fromArgs(args).withValidation().create());
-
- p.apply(Create.of("Hello", "World"))
- .apply(ParDo.of(new DoFn<String, String>() {
- @Override
- public void processElement(ProcessContext c) {
- c.output(c.element().toUpperCase());
- }
- }))
- .apply(ParDo.of(new DoFn<String, Void>() {
- @Override
- public void processElement(ProcessContext c) {
- LOG.info(c.element());
- }
- }));
-
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties
----------------------------------------------------------------------
diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties b/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties
deleted file mode 100644
index c59e77a..0000000
--- a/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties
+++ /dev/null
@@ -1,5 +0,0 @@
-package=it.pkg
-version=0.1-SNAPSHOT
-groupId=archetype.it
-artifactId=basic
-targetPlatform=1.7
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/starter/src/test/resources/projects/basic/goal.txt
----------------------------------------------------------------------
diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/goal.txt b/maven-archetypes/starter/src/test/resources/projects/basic/goal.txt
deleted file mode 100644
index 0b59873..0000000
--- a/maven-archetypes/starter/src/test/resources/projects/basic/goal.txt
+++ /dev/null
@@ -1 +0,0 @@
-verify
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml
----------------------------------------------------------------------
diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml
deleted file mode 100644
index d29424a..0000000
--- a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml
+++ /dev/null
@@ -1,43 +0,0 @@
-<project xmlns="http://maven.apache.org/POM/4.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <groupId>archetype.it</groupId>
- <artifactId>basic</artifactId>
- <version>0.1-SNAPSHOT</version>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-compiler-plugin</artifactId>
- <version>3.3</version>
- <configuration>
- <source>1.7</source>
- <target>1.7</target>
- </configuration>
- </plugin>
- </plugins>
- </build>
-
- <dependencies>
- <dependency>
- <groupId>org.apache.beam</groupId>
- <artifactId>java-sdk-all</artifactId>
- <version>[0-incubating, 1-incubating)</version>
- </dependency>
-
- <!-- slf4j API frontend binding with JUL backend -->
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- <version>1.7.7</version>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-jdk14</artifactId>
- <version>1.7.7</version>
- </dependency>
- </dependencies>
-</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/starter/src/test/resources/projects/basic/reference/src/main/java/it/pkg/StarterPipeline.java
----------------------------------------------------------------------
diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/reference/src/main/java/it/pkg/StarterPipeline.java b/maven-archetypes/starter/src/test/resources/projects/basic/reference/src/main/java/it/pkg/StarterPipeline.java
deleted file mode 100644
index 2e7c4e1..0000000
--- a/maven-archetypes/starter/src/test/resources/projects/basic/reference/src/main/java/it/pkg/StarterPipeline.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package it.pkg;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * A starter example for writing Google Cloud Dataflow programs.
- *
- * <p>The example takes two strings, converts them to their upper-case
- * representation and logs them.
- *
- * <p>To run this starter example locally using DirectPipelineRunner, just
- * execute it without any additional parameters from your favorite development
- * environment.
- *
- * <p>To run this starter example using managed resource in Google Cloud
- * Platform, you should specify the following command-line options:
- * --project=<YOUR_PROJECT_ID>
- * --stagingLocation=<STAGING_LOCATION_IN_CLOUD_STORAGE>
- * --runner=BlockingDataflowPipelineRunner
- */
-public class StarterPipeline {
- private static final Logger LOG = LoggerFactory.getLogger(StarterPipeline.class);
-
- public static void main(String[] args) {
- Pipeline p = Pipeline.create(
- PipelineOptionsFactory.fromArgs(args).withValidation().create());
-
- p.apply(Create.of("Hello", "World"))
- .apply(ParDo.of(new DoFn<String, String>() {
- @Override
- public void processElement(ProcessContext c) {
- c.output(c.element().toUpperCase());
- }
- }))
- .apply(ParDo.of(new DoFn<String, Void>() {
- @Override
- public void processElement(ProcessContext c) {
- LOG.info(c.element());
- }
- }));
-
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 6b2fd93..b79ddf6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -128,7 +128,7 @@
<module>sdks/java/core</module>
<module>runners</module>
<module>examples/java</module>
- <module>maven-archetypes</module>
+ <module>sdks/java/maven-archetypes</module>
</modules>
<profiles>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/examples/pom.xml
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/examples/pom.xml b/sdks/java/maven-archetypes/examples/pom.xml
new file mode 100644
index 0000000..7e74b9d
--- /dev/null
+++ b/sdks/java/maven-archetypes/examples/pom.xml
@@ -0,0 +1,56 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.beam</groupId>
+ <artifactId>maven-archetypes-parent</artifactId>
+ <version>0.1.0-incubating-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+
+ <artifactId>maven-archetypes-examples</artifactId>
+ <name>Apache Beam :: Maven Archetypes :: Examples</name>
+ <description>A Maven Archetype to create a project containing all the
+ example pipelines from the Apache Beam Java SDK.</description>
+
+ <packaging>maven-archetype</packaging>
+
+ <build>
+ <extensions>
+ <extension>
+ <groupId>org.apache.maven.archetype</groupId>
+ <artifactId>archetype-packaging</artifactId>
+ <version>2.4</version>
+ </extension>
+ </extensions>
+
+ <pluginManagement>
+ <plugins>
+ <plugin>
+ <artifactId>maven-archetype-plugin</artifactId>
+ <version>2.4</version>
+ </plugin>
+ </plugins>
+ </pluginManagement>
+ </build>
+</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/examples/src/main/resources/META-INF/maven/archetype-metadata.xml
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/examples/src/main/resources/META-INF/maven/archetype-metadata.xml b/sdks/java/maven-archetypes/examples/src/main/resources/META-INF/maven/archetype-metadata.xml
new file mode 100644
index 0000000..7742af4
--- /dev/null
+++ b/sdks/java/maven-archetypes/examples/src/main/resources/META-INF/maven/archetype-metadata.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<archetype-descriptor
+ xsi:schemaLocation="http://maven.apache.org/plugins/maven-archetype-plugin/archetype-descriptor/1.0.0 http://maven.apache.org/xsd/archetype-descriptor-1.0.0.xsd"
+ name="Google Cloud Dataflow Example Pipelines Archetype"
+ xmlns="http://maven.apache.org/plugins/maven-archetype-plugin/archetype-descriptor/1.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+ <requiredProperties>
+ <requiredProperty key="targetPlatform">
+ <defaultValue>1.7</defaultValue>
+ </requiredProperty>
+ </requiredProperties>
+
+ <fileSets>
+ <fileSet filtered="true" packaged="true" encoding="UTF-8">
+ <directory>src/main/java</directory>
+ <includes>
+ <include>**/*.java</include>
+ </includes>
+ </fileSet>
+
+ <fileSet filtered="true" packaged="true" encoding="UTF-8">
+ <directory>src/test/java</directory>
+ <includes>
+ <include>**/*.java</include>
+ </includes>
+ </fileSet>
+ </fileSets>
+</archetype-descriptor>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml
new file mode 100644
index 0000000..d19d0c6
--- /dev/null
+++ b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml
@@ -0,0 +1,204 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ~ Copyright (C) 2015 Google Inc.
+ ~
+ ~ Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ ~ use this file except in compliance with the License. You may obtain a copy of
+ ~ the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ ~ WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ ~ License for the specific language governing permissions and limitations under
+ ~ the License.
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>${groupId}</groupId>
+ <artifactId>${artifactId}</artifactId>
+ <version>${version}</version>
+
+ <packaging>jar</packaging>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>3.3</version>
+ <configuration>
+ <source>${targetPlatform}</source>
+ <target>${targetPlatform}</target>
+ </configuration>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-shade-plugin</artifactId>
+ <version>2.3</version>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>shade</goal>
+ </goals>
+ <configuration>
+ <finalName>${project.artifactId}-bundled-${project.version}</finalName>
+ <artifactSet>
+ <includes>
+ <include>*:*</include>
+ </includes>
+ </artifactSet>
+ <filters>
+ <filter>
+ <artifact>*:*</artifact>
+ <excludes>
+ <exclude>META-INF/*.SF</exclude>
+ <exclude>META-INF/*.DSA</exclude>
+ <exclude>META-INF/*.RSA</exclude>
+ </excludes>
+ </filter>
+ </filters>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <version>2.18.1</version>
+ <configuration>
+ <parallel>all</parallel>
+ <threadCount>4</threadCount>
+ <redirectTestOutputToFile>true</redirectTestOutputToFile>
+ </configuration>
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.maven.surefire</groupId>
+ <artifactId>surefire-junit47</artifactId>
+ <version>2.18.1</version>
+ </dependency>
+ </dependencies>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <!-- Adds a dependency on the Beam Java SDK. The range accepts any version from 0-incubating up to, but not including, 2-incubating. -->
+ <dependency>
+ <groupId>org.apache.beam</groupId>
+ <artifactId>java-sdk-all</artifactId>
+ <version>[0-incubating, 2-incubating)</version>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.api-client</groupId>
+ <artifactId>google-api-client</artifactId>
+ <version>1.21.0</version>
+ <exclusions>
+ <!-- Exclude an old version of guava that is being pulled
+ in by a transitive dependency of google-api-client -->
+ <exclusion>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava-jdk5</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <!-- Dependencies below this line are specific dependencies needed by the examples code. -->
+ <dependency>
+ <groupId>com.google.apis</groupId>
+ <artifactId>google-api-services-bigquery</artifactId>
+ <version>v2-rev248-1.21.0</version>
+ <exclusions>
+ <!-- Exclude an old version of guava that is being pulled
+ in by a transitive dependency of google-api-client -->
+ <exclusion>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava-jdk5</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.http-client</groupId>
+ <artifactId>google-http-client</artifactId>
+ <version>1.21.0</version>
+ <exclusions>
+ <!-- Exclude an old version of guava that is being pulled
+ in by a transitive dependency of google-api-client -->
+ <exclusion>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava-jdk5</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.apis</groupId>
+ <artifactId>google-api-services-pubsub</artifactId>
+ <version>v1-rev7-1.21.0</version>
+ <exclusions>
+ <!-- Exclude an old version of guava that is being pulled
+ in by a transitive dependency of google-api-client -->
+ <exclusion>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava-jdk5</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>joda-time</groupId>
+ <artifactId>joda-time</artifactId>
+ <version>2.4</version>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ <version>18.0</version>
+ </dependency>
+
+ <dependency>
+ <groupId>javax.servlet</groupId>
+ <artifactId>javax.servlet-api</artifactId>
+ <version>3.1.0</version>
+ </dependency>
+
+ <!-- Add slf4j API frontend binding with JUL backend -->
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>1.7.7</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-jdk14</artifactId>
+ <version>1.7.7</version>
+ <!-- When loaded at runtime this will wire up slf4j to the JUL backend -->
+ <scope>runtime</scope>
+ </dependency>
+
+ <!-- Hamcrest and JUnit are required dependencies of DataflowAssert,
+ which is used in the main code of the DebuggingWordCount example, so they are declared without a test scope. -->
+ <dependency>
+ <groupId>org.hamcrest</groupId>
+ <artifactId>hamcrest-all</artifactId>
+ <version>1.3</version>
+ </dependency>
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.11</version>
+ </dependency>
+ </dependencies>
+</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java
new file mode 100644
index 0000000..3cf2bc0
--- /dev/null
+++ b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java
@@ -0,0 +1,182 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package ${package};
+
+import ${package}.WordCount.WordCountOptions;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.transforms.Aggregator;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.Sum;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Pattern;
+
+
+/**
+ * An example that verifies word counts in Shakespeare and includes Dataflow best practices.
+ *
+ * <p>This class, {@link DebuggingWordCount}, is the third in a series of four successively more
+ * detailed 'word count' examples. You may first want to take a look at {@link MinimalWordCount}
+ * and {@link WordCount}. After you've looked at this example, then see the
+ * {@link WindowedWordCount} pipeline, for introduction of additional concepts.
+ *
+ * <p>Basic concepts, also in the MinimalWordCount and WordCount examples:
+ * Reading text files; counting a PCollection; executing a Pipeline both locally
+ * and using the Dataflow service; defining DoFns.
+ *
+ * <p>New Concepts:
+ * <pre>
+ * 1. Logging to Cloud Logging
+ * 2. Controlling Dataflow worker log levels
+ * 3. Creating a custom aggregator
+ * 4. Testing your Pipeline via DataflowAssert
+ * </pre>
+ *
+ * <p>To execute this pipeline locally, specify general pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * }
+ * </pre>
+ *
+ * <p>To execute this pipeline using the Dataflow service and the additional logging discussed
+ * below, specify pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=BlockingDataflowPipelineRunner
+ * --workerLogLevelOverrides={"com.google.cloud.dataflow.examples":"DEBUG"}
+ * }
+ * </pre>
+ *
+ * <p>Note that when you run via <code>mvn exec</code>, you may need to escape
+ * the quotations as appropriate for your shell. For example, in <code>bash</code>:
+ * <pre>
+ * mvn compile exec:java ... \
+ * -Dexec.args="... \
+ * --workerLogLevelOverrides={\\\"com.google.cloud.dataflow.examples\\\":\\\"DEBUG\\\"}"
+ * </pre>
+ *
+ * <p>Concept #2: Dataflow workers which execute user code are configured to log to Cloud
+ * Logging by default at "INFO" log level and higher. One may override log levels for specific
+ * logging namespaces by specifying:
+ * <pre><code>
+ * --workerLogLevelOverrides={"Name1":"Level1","Name2":"Level2",...}
+ * </code></pre>
+ * For example, by specifying:
+ * <pre><code>
+ * --workerLogLevelOverrides={"com.google.cloud.dataflow.examples":"DEBUG"}
+ * </code></pre>
+ * when executing this pipeline using the Dataflow service, Cloud Logging would contain
+ * "DEBUG" or higher level logs for the {@code com.google.cloud.dataflow.examples} package in
+ * addition to the default "INFO" or higher level logs. In addition, the default Dataflow worker
+ * logging configuration can be overridden by specifying
+ * {@code --defaultWorkerLogLevel=<one of TRACE, DEBUG, INFO, WARN, ERROR>}. For example,
+ * by specifying {@code --defaultWorkerLogLevel=DEBUG} when executing this pipeline with
+ * the Dataflow service, Cloud Logging would contain all "DEBUG" or higher level logs. Note
+ * that changing the default worker log level to TRACE or DEBUG will significantly increase
+ * the amount of logs output.
+ *
+ * <p>The input file defaults to {@code gs://dataflow-samples/shakespeare/kinglear.txt} and can be
+ * overridden with {@code --inputFile}.
+ */
+public class DebuggingWordCount {
+  /** A DoFn that filters for a specific key based upon a regular expression. */
+  public static class FilterTextFn extends DoFn<KV<String, Long>, KV<String, Long>> {
+    /**
+     * Concept #1: The logger below uses the fully qualified class name of FilterTextFn
+     * as the logger. All log statements emitted by this logger will be referenced by this name
+     * and will be visible in the Cloud Logging UI. Learn more at https://cloud.google.com/logging
+     * about the Cloud Logging UI.
+     */
+    private static final Logger LOG = LoggerFactory.getLogger(FilterTextFn.class);
+
+    /** Compiled form of the pattern passed to the constructor. */
+    private final Pattern filter;
+
+    /**
+     * Concept #3: A custom aggregator can track values in your pipeline as it runs. Those
+     * values will be displayed in the Dataflow Monitoring UI when this pipeline is run using the
+     * Dataflow service. These aggregators below track the number of matched and unmatched words.
+     * Learn more at https://cloud.google.com/dataflow/pipelines/dataflow-monitoring-intf about
+     * the Dataflow Monitoring UI.
+     */
+    private final Aggregator<Long, Long> matchedWords =
+        createAggregator("matchedWords", new Sum.SumLongFn());
+    private final Aggregator<Long, Long> unmatchedWords =
+        createAggregator("unmatchedWords", new Sum.SumLongFn());
+
+    /**
+     * Creates a filter that keeps only elements whose key matches the given expression.
+     *
+     * @param pattern regular expression that the whole key must match
+     */
+    public FilterTextFn(String pattern) {
+      filter = Pattern.compile(pattern);
+    }
+
+    /**
+     * Outputs the element unchanged when its key matches the filter; otherwise drops it.
+     * Either way, the corresponding aggregator is incremented by one.
+     */
+    @Override
+    public void processElement(ProcessContext c) {
+      String word = c.element().getKey();
+      if (filter.matcher(word).matches()) {
+        // Log at the "DEBUG" level each element that we match. When executing this pipeline
+        // using the Dataflow service, these log lines will appear in the Cloud Logging UI
+        // only if the log level is set to "DEBUG" or lower.
+        LOG.debug("Matched: {}", word);
+        matchedWords.addValue(1L);
+        c.output(c.element());
+      } else {
+        // Log at the "TRACE" level each element that is not matched. Different log levels
+        // can be used to control the verbosity of logging providing an effective mechanism
+        // to filter less important information.
+        LOG.trace("Did not match: {}", word);
+        unmatchedWords.addValue(1L);
+      }
+    }
+  }
+
+  /**
+   * Reads the input file, counts its words, keeps the words matching "Flourish|stomach" and
+   * asserts that the surviving counts equal the expected values before running the pipeline.
+   *
+   * @param args command-line arguments, parsed and validated as {@code WordCountOptions}
+   */
+  public static void main(String[] args) {
+    WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
+        .as(WordCountOptions.class);
+    Pipeline p = Pipeline.create(options);
+
+    PCollection<KV<String, Long>> filteredWords =
+        p.apply(TextIO.Read.named("ReadLines").from(options.getInputFile()))
+         .apply(new WordCount.CountWords())
+         .apply(ParDo.of(new FilterTextFn("Flourish|stomach")));
+
+    /*
+     * Concept #4: DataflowAssert is a set of convenient PTransforms in the style of
+     * Hamcrest's collection matchers that can be used when writing Pipeline level tests
+     * to validate the contents of PCollections. DataflowAssert is best used in unit tests
+     * with small data sets but is demonstrated here as a teaching tool.
+     *
+     * Below we verify that the set of filtered words matches our expected counts. Note
+     * that DataflowAssert does not provide any output and that successful completion of the
+     * Pipeline implies that the expectations were met. Learn more at
+     * https://cloud.google.com/dataflow/pipelines/testing-your-pipeline on how to test
+     * your Pipeline and see DebuggingWordCountTest for an example unit test.
+     */
+    List<KV<String, Long>> expectedResults = Arrays.asList(
+        KV.of("Flourish", 3L),
+        KV.of("stomach", 1L));
+    DataflowAssert.that(filteredWords).containsInAnyOrder(expectedResults);
+
+    p.run();
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java
new file mode 100644
index 0000000..035db01
--- /dev/null
+++ b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java
@@ -0,0 +1,115 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package ${package};
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.runners.BlockingDataflowPipelineRunner;
+import com.google.cloud.dataflow.sdk.transforms.Count;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.values.KV;
+
+
+/**
+ * An example that counts words in Shakespeare.
+ *
+ * <p>This class, {@link MinimalWordCount}, is the first in a series of four successively more
+ * detailed 'word count' examples. Here, for simplicity, we don't show any error-checking or
+ * argument processing, and focus on construction of the pipeline, which chains together the
+ * application of core transforms.
+ *
+ * <p>Next, see the {@link WordCount} pipeline, then the {@link DebuggingWordCount}, and finally
+ * the {@link WindowedWordCount} pipeline, for more detailed examples that introduce additional
+ * concepts.
+ *
+ * <p>Concepts:
+ * <pre>
+ * 1. Reading data from text files
+ * 2. Specifying 'inline' transforms
+ * 3. Counting a PCollection
+ * 4. Writing data to Cloud Storage as text files
+ * </pre>
+ *
+ * <p>To execute this pipeline, first edit the code to set your project ID, the staging
+ * location, and the output location. The specified GCS bucket(s) must already exist.
+ *
+ * <p>Then, run the pipeline as described in the README. It will be deployed and run using the
+ * Dataflow service. No args are required to run the pipeline. You can see the results in your
+ * output bucket in the GCS browser.
+ */
+public class MinimalWordCount {
+
+  /**
+   * Builds the word-count pipeline with hard-coded options and runs it.
+   *
+   * @param args ignored; every setting is made in code below
+   */
+  public static void main(String[] args) {
+    // Start from default options viewed as DataflowPipelineOptions. Through this view we can
+    // set execution details such as the Cloud Platform project and the Google Cloud Storage
+    // location used to stage files.
+    DataflowPipelineOptions options =
+        PipelineOptionsFactory.create().as(DataflowPipelineOptions.class);
+    options.setRunner(BlockingDataflowPipelineRunner.class);
+    // CHANGE 1/3: Set the project ID under which the pipeline runs on Google Cloud.
+    options.setProject("SET_YOUR_PROJECT_ID_HERE");
+    // CHANGE 2/3: Set the Google Cloud Storage path used for staging local files.
+    options.setStagingLocation("gs://SET_YOUR_BUCKET_NAME_HERE/AND_STAGING_DIRECTORY");
+
+    // The Pipeline is created from the options configured above.
+    Pipeline pipeline = Pipeline.create(options);
+
+    pipeline
+        // Concept #1: A root transform starts the pipeline. TextIO.Read reads a set of input
+        // text files and yields a PCollection with one element per line of text (here, a set
+        // of Shakespeare's texts).
+        .apply(TextIO.Read.from("gs://dataflow-samples/shakespeare/*"))
+        // Concept #2: A ParDo runs a DoFn, defined in-line, on every line. The DoFn splits
+        // the line into words, so the result is a PCollection<String> holding the individual
+        // words of Shakespeare's collected texts.
+        .apply(ParDo.named("ExtractWords").of(new DoFn<String, String>() {
+          @Override
+          public void processElement(ProcessContext c) {
+            for (String token : c.element().split("[^a-zA-Z']+")) {
+              if (token.isEmpty()) {
+                continue;
+              }
+              c.output(token);
+            }
+          }
+        }))
+        // Concept #3: Count turns the words into key/value pairs: each key is a unique word
+        // and its value is the number of times that word occurs.
+        .apply(Count.<String>perElement())
+        // A second ParDo renders each word count as a printable string, ready to be written
+        // to an output file.
+        .apply(ParDo.named("FormatResults").of(new DoFn<KV<String, Long>, String>() {
+          @Override
+          public void processElement(ProcessContext c) {
+            KV<String, Long> wordCount = c.element();
+            c.output(wordCount.getKey() + ": " + wordCount.getValue());
+          }
+        }))
+        // Concept #4: A write transform ends the pipeline. TextIO.Write stores the formatted
+        // strings as a series of text files in Google Cloud Storage.
+        // CHANGE 3/3: Set the Google Cloud Storage path that receives the results.
+        .apply(TextIO.Write.to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX"));
+
+    // Run the pipeline.
+    pipeline.run();
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java
new file mode 100644
index 0000000..29921e2
--- /dev/null
+++ b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java
@@ -0,0 +1,262 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package ${package};
+
+import com.google.api.services.bigquery.model.TableFieldSchema;
+import com.google.api.services.bigquery.model.TableReference;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.TableSchema;
+import ${package}.common.DataflowExampleUtils;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.PipelineResult;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO;
+import com.google.cloud.dataflow.sdk.io.PubsubIO;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
+import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+
+/**
+ * An example that counts words in text, and can run over either unbounded or bounded input
+ * collections.
+ *
+ * <p>This class, {@link WindowedWordCount}, is the last in a series of four successively more
+ * detailed 'word count' examples. First take a look at {@link MinimalWordCount},
+ * {@link WordCount}, and {@link DebuggingWordCount}.
+ *
+ * <p>Basic concepts, also in the MinimalWordCount, WordCount, and DebuggingWordCount examples:
+ * Reading text files; counting a PCollection; writing to GCS; executing a Pipeline both locally
+ * and using the Dataflow service; defining DoFns; creating a custom aggregator;
+ * user-defined PTransforms; defining PipelineOptions.
+ *
+ * <p>New Concepts:
+ * <pre>
+ * 1. Unbounded and bounded pipeline input modes
+ * 2. Adding timestamps to data
+ * 3. PubSub topics as sources
+ * 4. Windowing
+ * 5. Re-using PTransforms over windowed PCollections
+ * 6. Writing to BigQuery
+ * </pre>
+ *
+ * <p>To execute this pipeline locally, specify general pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * }
+ * </pre>
+ *
+ * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=BlockingDataflowPipelineRunner
+ * }
+ * </pre>
+ *
+ * <p>Optionally specify the input file path via:
+ * {@code --inputFile=gs://INPUT_PATH},
+ * which defaults to {@code gs://dataflow-samples/shakespeare/kinglear.txt}.
+ *
+ * <p>Specify an output BigQuery dataset and optionally, a table for the output. If you don't
+ * specify the table, one will be created for you using the job name. If you don't specify the
+ * dataset, a dataset called {@code dataflow-examples} must already exist in your project.
+ * {@code --bigQueryDataset=YOUR-DATASET --bigQueryTable=YOUR-NEW-TABLE-NAME}.
+ *
+ * <p>Decide whether you want your pipeline to run with 'bounded' (such as files in GCS) or
+ * 'unbounded' input (such as a PubSub topic). To run with unbounded input, set
+ * {@code --unbounded=true}. Then, optionally specify the Google Cloud PubSub topic to read from
+ * via {@code --pubsubTopic=projects/PROJECT_ID/topics/YOUR_TOPIC_NAME}. If the topic does not
+ * exist, the pipeline will create one for you. It will delete this topic when it terminates.
+ * The pipeline will automatically launch an auxiliary batch pipeline to populate the given PubSub
+ * topic with the contents of the {@code --inputFile}, in order to make the example easy to run.
+ * If you want to use an independently-populated PubSub topic, indicate this by setting
+ * {@code --inputFile=""}. In that case, the auxiliary pipeline will not be started.
+ *
+ * <p>By default, the pipeline will do fixed windowing, on 1-minute windows. You can
+ * change this interval by setting the {@code --windowSize} parameter, e.g. {@code --windowSize=10}
+ * for 10-minute windows.
+ */
+public class WindowedWordCount {
+ private static final Logger LOG = LoggerFactory.getLogger(WindowedWordCount.class);
+ static final int WINDOW_SIZE = 1; // Default window duration in minutes
+
+ /**
+ * Concept #2: A DoFn that sets the data element timestamp. This is a silly method, just for
+ * this example, for the bounded data case.
+ *
+ * <p>Imagine that many ghosts of Shakespeare are all typing madly at the same time to recreate
+ * his masterworks. Each line of the corpus will get a random associated timestamp somewhere in a
+ * 2-hour period.
+ */
+ static class AddTimestampFn extends DoFn<String, String> {
+ private static final long RAND_RANGE = 7200000; // 2 hours in ms
+
+ @Override
+ public void processElement(ProcessContext c) {
+ // Generate a timestamp that falls somewhere in the past two hours.
+ long randomTimestamp = System.currentTimeMillis()
+ - (int) (Math.random() * RAND_RANGE);
+ /**
+ * Concept #2: Set the data element with that timestamp.
+ */
+ c.outputWithTimestamp(c.element(), new Instant(randomTimestamp));
+ }
+ }
+
+  /** A DoFn that turns a word and its count into a BigQuery table row. */
+  static class FormatAsTableRowFn extends DoFn<KV<String, Long>, TableRow> {
+    @Override
+    public void processElement(ProcessContext c) {
+      KV<String, Long> wordCount = c.element();
+      TableRow row = new TableRow();
+      row.set("word", wordCount.getKey());
+      row.set("count", wordCount.getValue());
+      // The element's timestamp is stored as text in its own column.
+      row.set("window_timestamp", c.timestamp().toString());
+      c.output(row);
+    }
+  }
+
+  /**
+   * Builds the BigQuery schema of the output table: a STRING {@code word}, an INTEGER
+   * {@code count} and a TIMESTAMP {@code window_timestamp}, in that order.
+   *
+   * @return the schema describing the three output columns
+   */
+  private static TableSchema getSchema() {
+    TableFieldSchema word = new TableFieldSchema().setName("word").setType("STRING");
+    TableFieldSchema count = new TableFieldSchema().setName("count").setType("INTEGER");
+    TableFieldSchema windowTimestamp =
+        new TableFieldSchema().setName("window_timestamp").setType("TIMESTAMP");
+
+    List<TableFieldSchema> fields = new ArrayList<>();
+    fields.add(word);
+    fields.add(count);
+    fields.add(windowTimestamp);
+    return new TableSchema().setFields(fields);
+  }
+
+ /**
+ * Concept #6: We'll stream the results to a BigQuery table. The BigQuery output source is one
+ * that supports both bounded and unbounded data. This helper assembles the TableReference
+ * (project, dataset and table) from the input options, to tell the pipeline where to write its
+ * BigQuery results.
+ */
+ private static TableReference getTableReference(Options options) {
+ return new TableReference()
+ .setProjectId(options.getProject())
+ .setDatasetId(options.getBigQueryDataset())
+ .setTableId(options.getBigQueryTable());
+ }
+
+ /**
+ * Options supported by {@link WindowedWordCount}.
+ *
+ * <p>Inherits standard example configuration options, which allow specification of the BigQuery
+ * table and the PubSub topic, as well as the {@link WordCount.WordCountOptions} support for
+ * specification of the input file.
+ */
+ public static interface Options
+ extends WordCount.WordCountOptions, DataflowExampleUtils.DataflowExampleUtilsOptions {
+ /** Fixed window duration in minutes; defaults to {@link WindowedWordCount#WINDOW_SIZE}. */
+ @Description("Fixed window duration, in minutes")
+ @Default.Integer(WINDOW_SIZE)
+ Integer getWindowSize();
+ void setWindowSize(Integer value);
+
+ /** When true, {@code main} reads its input from PubSub instead of the input file. */
+ @Description("Whether to run the pipeline with unbounded input")
+ boolean isUnbounded();
+ void setUnbounded(boolean value);
+ }
+
+ /**
+ * Builds and runs the windowed word count pipeline: reads lines (from PubSub when
+ * {@code --unbounded} is set, otherwise from the input file), windows them into fixed windows,
+ * counts the words per window and writes the counts to BigQuery.
+ *
+ * @throws IOException propagated from the {@link DataflowExampleUtils} resource setup
+ */
+ public static void main(String[] args) throws IOException {
+ Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
+ // The schema must be set before DataflowExampleUtils is constructed, because its BigQuery
+ // table setup only runs when a schema is present in the options.
+ options.setBigQuerySchema(getSchema());
+ // DataflowExampleUtils creates the necessary input sources to simplify execution of this
+ // Pipeline.
+ DataflowExampleUtils exampleDataflowUtils = new DataflowExampleUtils(options,
+ options.isUnbounded());
+
+ Pipeline pipeline = Pipeline.create(options);
+
+ /*
+ * Concept #1: the Dataflow SDK lets us run the same pipeline with either a bounded or
+ * unbounded input source.
+ */
+ PCollection<String> input;
+ if (options.isUnbounded()) {
+ LOG.info("Reading from PubSub.");
+ /*
+ * Concept #3: Read from the PubSub topic. A topic will be created if it wasn't
+ * specified as an argument. The data elements' timestamps will come from the pubsub
+ * injection.
+ */
+ input = pipeline
+ .apply(PubsubIO.Read.topic(options.getPubsubTopic()));
+ } else {
+ /* Else, this is a bounded pipeline. Read from the GCS file. */
+ input = pipeline
+ .apply(TextIO.Read.from(options.getInputFile()))
+ // Concept #2: Add an element timestamp, using an artificial time just to show windowing.
+ // See AddTimestampFn for more detail on this.
+ .apply(ParDo.of(new AddTimestampFn()));
+ }
+
+ /*
+ * Concept #4: Window into fixed windows. The fixed window size for this example defaults to 1
+ * minute (you can change this with a command-line option). See the documentation for more
+ * information on how fixed windows work, and for information on the other types of windowing
+ * available (e.g., sliding windows).
+ */
+ PCollection<String> windowedWords = input
+ .apply(Window.<String>into(
+ FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))));
+
+ /*
+ * Concept #5: Re-use our existing CountWords transform that does not have knowledge of
+ * windows over a PCollection containing windowed values.
+ */
+ PCollection<KV<String, Long>> wordCounts = windowedWords.apply(new WordCount.CountWords());
+
+ /*
+ * Concept #6: Format the results for a BigQuery table, then write to BigQuery.
+ * The BigQuery output source supports both bounded and unbounded data.
+ */
+ wordCounts.apply(ParDo.of(new FormatAsTableRowFn()))
+ .apply(BigQueryIO.Write.to(getTableReference(options)).withSchema(getSchema()));
+
+ PipelineResult result = pipeline.run();
+
+ /*
+ * To mock unbounded input from PubSub, we'll now start an auxiliary 'injector' pipeline that
+ * runs for a limited time, and publishes to the input PubSub topic.
+ *
+ * With an unbounded input source, you will need to explicitly shut down this pipeline when you
+ * are done with it, so that you do not continue to be charged for the instances. You can do
+ * this via a ctrl-C from the command line, or from the developer's console UI for Dataflow
+ * pipelines. The PubSub topic will also be deleted at this time.
+ */
+ exampleDataflowUtils.mockUnboundedSource(options.getInputFile(), result);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java
new file mode 100644
index 0000000..150b60d
--- /dev/null
+++ b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java
@@ -0,0 +1,204 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package ${package};
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.transforms.Aggregator;
+import com.google.cloud.dataflow.sdk.transforms.Count;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.Sum;
+import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+
+/**
+ * An example that counts words in Shakespeare and includes Dataflow best practices.
+ *
+ * <p>This class, {@link WordCount}, is the second in a series of four successively more detailed
+ * 'word count' examples. You may first want to take a look at {@link MinimalWordCount}.
+ * After you've looked at this example, then see the {@link DebuggingWordCount}
+ * pipeline, for introduction of additional concepts.
+ *
+ * <p>For a detailed walkthrough of this example, see
+ * <a href="https://cloud.google.com/dataflow/java-sdk/wordcount-example">
+ * https://cloud.google.com/dataflow/java-sdk/wordcount-example
+ * </a>
+ *
+ * <p>Basic concepts, also in the MinimalWordCount example:
+ * Reading text files; counting a PCollection; writing to GCS.
+ *
+ * <p>New Concepts:
+ * <pre>
+ * 1. Executing a Pipeline both locally and using the Dataflow service
+ * 2. Using ParDo with static DoFns defined out-of-line
+ * 3. Building a composite transform
+ * 4. Defining your own pipeline options
+ * </pre>
+ *
+ * <p>Concept #1: you can execute this pipeline either locally or using the Dataflow service.
+ * These are now command-line options and not hard-coded as they were in the MinimalWordCount
+ * example.
+ * To execute this pipeline locally, specify general pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * }
+ * </pre>
+ * and a local output file or output prefix on GCS:
+ * <pre>{@code
+ * --output=[YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PREFIX]
+ * }</pre>
+ *
+ * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=BlockingDataflowPipelineRunner
+ * }
+ * </pre>
+ * and an output prefix on GCS:
+ * <pre>{@code
+ * --output=gs://YOUR_OUTPUT_PREFIX
+ * }</pre>
+ *
+ * <p>The input file defaults to {@code gs://dataflow-samples/shakespeare/kinglear.txt} and can be
+ * overridden with {@code --inputFile}.
+ */
+public class WordCount {
+
+ /**
+ * Concept #2: You can make your pipeline code less verbose by defining your DoFns statically out-
+ * of-line. This DoFn tokenizes lines of text into individual words; we pass it to a ParDo in the
+ * pipeline.
+ */
+ static class ExtractWordsFn extends DoFn<String, String> {
+ private final Aggregator<Long, Long> emptyLines =
+ createAggregator("emptyLines", new Sum.SumLongFn());
+
+ @Override
+ public void processElement(ProcessContext c) {
+ if (c.element().trim().isEmpty()) {
+ emptyLines.addValue(1L);
+ }
+
+ // Split the line into words.
+ String[] words = c.element().split("[^a-zA-Z']+");
+
+ // Output each word encountered into the output PCollection.
+ for (String word : words) {
+ if (!word.isEmpty()) {
+ c.output(word);
+ }
+ }
+ }
+ }
+
+ /** A DoFn that converts a Word and Count into a printable string. */
+ public static class FormatAsTextFn extends DoFn<KV<String, Long>, String> {
+ @Override
+ public void processElement(ProcessContext c) {
+ c.output(c.element().getKey() + ": " + c.element().getValue());
+ }
+ }
+
+ /**
+ * A PTransform that converts a PCollection containing lines of text into a PCollection of
+ * per-word counts.
+ *
+ * <p>Concept #3: This is a custom composite transform that bundles two transforms (ParDo and
+ * Count) as a reusable PTransform subclass. Using composite transforms allows for easy reuse,
+ * modular testing, and an improved monitoring experience.
+ */
+ public static class CountWords extends PTransform<PCollection<String>,
+ PCollection<KV<String, Long>>> {
+ @Override
+ public PCollection<KV<String, Long>> apply(PCollection<String> lines) {
+ // First split the lines into individual words, then count how often each word occurs.
+ return lines
+ .apply(ParDo.of(new ExtractWordsFn()))
+ .apply(Count.<String>perElement());
+ }
+ }
+
+ /**
+ * Options supported by {@link WordCount}.
+ *
+ * <p>Concept #4: Defining your own configuration options. Here, you can add your own arguments
+ * to be processed by the command-line parser, and specify default values for them. You can then
+ * access the options values in your pipeline code.
+ *
+ * <p>Inherits standard configuration options.
+ */
+ public static interface WordCountOptions extends PipelineOptions {
+ @Description("Path of the file to read from")
+ @Default.String("gs://dataflow-samples/shakespeare/kinglear.txt")
+ String getInputFile();
+ void setInputFile(String value);
+
+ @Description("Path of the file to write to")
+ @Default.InstanceFactory(OutputFactory.class)
+ String getOutput();
+ void setOutput(String value);
+
+ /**
+ * Returns "gs://${YOUR_STAGING_DIRECTORY}/counts.txt" as the default destination, and rejects
+ * the options when no staging location is available to derive it from.
+ */
+ public static class OutputFactory implements DefaultValueFactory<String> {
+ @Override
+ public String create(PipelineOptions options) {
+ String stagingLocation = options.as(DataflowPipelineOptions.class).getStagingLocation();
+ if (stagingLocation == null) {
+ throw new IllegalArgumentException("Must specify --output or --stagingLocation");
+ }
+ return GcsPath.fromUri(stagingLocation).resolve("counts.txt").toString();
+ }
+ }
+
+ }
+
+ /**
+ * Reads the input file, counts its words and writes the formatted counts to the output.
+ */
+ public static void main(String[] args) {
+ WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
+ .as(WordCountOptions.class);
+ Pipeline p = Pipeline.create(options);
+
+ PCollection<String> lines =
+ p.apply(TextIO.Read.named("ReadLines").from(options.getInputFile()));
+
+ // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
+ // static FormatAsTextFn() to the ParDo transform.
+ PCollection<KV<String, Long>> wordCounts = lines.apply(new CountWords());
+ PCollection<String> formattedCounts = wordCounts.apply(ParDo.of(new FormatAsTextFn()));
+
+ formattedCounts.apply(TextIO.Write.named("WriteCounts").to(options.getOutput()));
+
+ p.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/DataflowExampleOptions.java
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/DataflowExampleOptions.java b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/DataflowExampleOptions.java
new file mode 100644
index 0000000..e182f4c
--- /dev/null
+++ b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/DataflowExampleOptions.java
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package ${package}.common;
+
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+
+/**
+ * Options that can be used to configure the Dataflow examples.
+ */
+public interface DataflowExampleOptions extends DataflowPipelineOptions {
+ /** Whether jobs should be left running after the local process exits; defaults to false. */
+ @Description("Whether to keep jobs running on the Dataflow service after local process exit")
+ @Default.Boolean(false)
+ boolean getKeepJobsRunning();
+ void setKeepJobsRunning(boolean keepJobsRunning);
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/DataflowExampleUtils.java
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/DataflowExampleUtils.java b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/DataflowExampleUtils.java
new file mode 100644
index 0000000..9861769
--- /dev/null
+++ b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/DataflowExampleUtils.java
@@ -0,0 +1,398 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package ${package}.common;
+
+import com.google.api.client.googleapis.json.GoogleJsonResponseException;
+import com.google.api.client.googleapis.services.AbstractGoogleClientRequest;
+import com.google.api.services.bigquery.Bigquery;
+import com.google.api.services.bigquery.Bigquery.Datasets;
+import com.google.api.services.bigquery.Bigquery.Tables;
+import com.google.api.services.bigquery.model.Dataset;
+import com.google.api.services.bigquery.model.DatasetReference;
+import com.google.api.services.bigquery.model.Table;
+import com.google.api.services.bigquery.model.TableReference;
+import com.google.api.services.bigquery.model.TableSchema;
+import com.google.api.services.dataflow.Dataflow;
+import com.google.api.services.pubsub.Pubsub;
+import com.google.api.services.pubsub.model.Topic;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.PipelineResult;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.BigQueryOptions;
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.runners.DataflowPipelineJob;
+import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
+import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
+import com.google.cloud.dataflow.sdk.transforms.IntraBundleParallelization;
+import com.google.cloud.dataflow.sdk.util.MonitoringUtil;
+import com.google.cloud.dataflow.sdk.util.Transport;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import javax.servlet.http.HttpServletResponse;
+
+/**
+ * The utility class that sets up and tears down external resources, starts the Google Cloud Pub/Sub
+ * injector, and cancels the streaming and the injector pipelines once the program terminates.
+ *
+ * <p>It is used to run Dataflow examples, such as TrafficMaxLaneFlow and TrafficRoutes.
+ */
+public class DataflowExampleUtils {
+
+ private final DataflowPipelineOptions options;
+ private Bigquery bigQueryClient = null;
+ private Pubsub pubsubClient = null;
+ private Dataflow dataflowClient = null;
+ private Set<DataflowPipelineJob> jobsToCancel = Sets.newHashSet();
+ private List<String> pendingMessages = Lists.newArrayList();
+
+ /**
+ * Define an interface that supports the PubSub and BigQuery example options, combined with the
+ * general example options.
+ */
+ public static interface DataflowExampleUtilsOptions
+ extends DataflowExampleOptions, ExamplePubsubTopicOptions, ExampleBigQueryTableOptions {
+ }
+
+ /**
+ * Creates the utility around the given options. This constructor only stores the options; it
+ * does not set up any external resources.
+ */
+ public DataflowExampleUtils(DataflowPipelineOptions options) {
+ this.options = options;
+ }
+
+ /**
+ * Stores the options, then immediately sets up the external resources and the runner via
+ * {@link #setupResourcesAndRunner(boolean)}.
+ *
+ * @param isUnbounded whether the example runs with unbounded input
+ * @throws IOException if there is a problem setting up the resources
+ */
+ public DataflowExampleUtils(DataflowPipelineOptions options, boolean isUnbounded)
+ throws IOException {
+ this.options = options;
+ setupResourcesAndRunner(isUnbounded);
+ }
+
+ /**
+ * Sets up external resources that are required by the example,
+ * such as Pub/Sub topics and BigQuery tables. The Pub/Sub topic is set up first, then the
+ * BigQuery table.
+ *
+ * @throws IOException if there is a problem setting up the resources
+ */
+ public void setup() throws IOException {
+ setupPubsubTopic();
+ setupBigQueryTable();
+ }
+
+ /**
+ * Set up external resources, and configure the runner appropriately.
+ *
+ * @param isUnbounded when true, the options are switched to streaming before any setup is done
+ * @throws IOException if there is a problem setting up the resources
+ */
+ public void setupResourcesAndRunner(boolean isUnbounded) throws IOException {
+ if (isUnbounded) {
+ // Streaming has to be enabled first: setupRunner() below reads this flag.
+ options.setStreaming(true);
+ }
+ setup();
+ setupRunner();
+ }
+
+ /**
+ * Sets up the Google Cloud Pub/Sub topic.
+ *
+ * <p>If the topic doesn't exist, a new topic with the given name will be created. Nothing is
+ * done when no topic name is configured (null or empty).
+ *
+ * @throws IOException if there is a problem setting up the Pub/Sub topic
+ */
+ public void setupPubsubTopic() throws IOException {
+ String topic = options.as(ExamplePubsubTopicOptions.class).getPubsubTopic();
+ // Treat a missing topic like an empty one, as startInjectorIfNeeded does, instead of failing
+ // with a NullPointerException.
+ if (topic == null || topic.isEmpty()) {
+ return;
+ }
+ pendingMessages.add("*******************Set Up Pubsub Topic*********************");
+ setupPubsubTopic(topic);
+ pendingMessages.add("The Pub/Sub topic has been set up for this example: " + topic);
+ }
+
+ /**
+ * Sets up the BigQuery table with the configured schema.
+ *
+ * <p>If the table already exists, the schema has to match the given one. Otherwise, the example
+ * will throw a RuntimeException. If the table doesn't exist, a new table with the given schema
+ * will be created. Nothing is done unless the dataset, the table and the schema are all set.
+ *
+ * @throws IOException if there is a problem setting up the BigQuery table
+ */
+ public void setupBigQueryTable() throws IOException {
+ ExampleBigQueryTableOptions tableOptions = options.as(ExampleBigQueryTableOptions.class);
+ if (tableOptions.getBigQueryDataset() == null
+ || tableOptions.getBigQueryTable() == null
+ || tableOptions.getBigQuerySchema() == null) {
+ return;
+ }
+ pendingMessages.add("******************Set Up Big Query Table*******************");
+ String project = tableOptions.getProject();
+ String dataset = tableOptions.getBigQueryDataset();
+ String table = tableOptions.getBigQueryTable();
+ setupBigQueryTable(project, dataset, table, tableOptions.getBigQuerySchema());
+ pendingMessages.add("The BigQuery table has been set up for this example: "
+ + tableOptions.getProject()
+ + ":" + tableOptions.getBigQueryDataset()
+ + "." + tableOptions.getBigQueryTable());
+ }
+
+ /**
+ * Tears down external resources that can be deleted upon the example's completion.
+ *
+ * <p>The Pub/Sub topic is deleted on a best-effort basis: a failure is recorded in the pending
+ * messages, together with its reason, and does not abort the tear down. The BigQuery table is
+ * never deleted; a reminder to remove it manually is recorded instead.
+ */
+ private void tearDown() {
+ pendingMessages.add("*************************Tear Down*************************");
+ String topic = options.as(ExamplePubsubTopicOptions.class).getPubsubTopic();
+ // Treat a missing topic like an empty one, as startInjectorIfNeeded does, so the shutdown
+ // hook cannot die with a NullPointerException before the jobs are cancelled.
+ if (topic != null && !topic.isEmpty()) {
+ try {
+ deletePubsubTopic(topic);
+ pendingMessages.add("The Pub/Sub topic has been deleted: " + topic);
+ } catch (IOException e) {
+ pendingMessages.add("Failed to delete the Pub/Sub topic : " + topic);
+ // Record why the deletion failed rather than silently dropping the exception.
+ pendingMessages.add("Reason: " + e);
+ }
+ }
+
+ ExampleBigQueryTableOptions bigQueryTableOptions =
+ options.as(ExampleBigQueryTableOptions.class);
+ if (bigQueryTableOptions.getBigQueryDataset() != null
+ && bigQueryTableOptions.getBigQueryTable() != null
+ && bigQueryTableOptions.getBigQuerySchema() != null) {
+ pendingMessages.add("The BigQuery table might contain the example's output, "
+ + "and it is not deleted automatically: "
+ + bigQueryTableOptions.getProject()
+ + ":" + bigQueryTableOptions.getBigQueryDataset()
+ + "." + bigQueryTableOptions.getBigQueryTable());
+ pendingMessages.add("Please go to the Developers Console to delete it manually."
+ + " Otherwise, you may be charged for its usage.");
+ }
+ }
+
+ /**
+ * Creates the dataset and the table when they do not exist yet, and verifies the schema of a
+ * table that already exists.
+ *
+ * @param schema the expected table schema; callers only pass a non-null schema
+ * @throws IOException if a BigQuery request fails
+ * @throws RuntimeException if the table exists with a different (or no) schema
+ */
+ private void setupBigQueryTable(String projectId, String datasetId, String tableId,
+ TableSchema schema) throws IOException {
+ if (bigQueryClient == null) {
+ bigQueryClient = Transport.newBigQueryClient(options.as(BigQueryOptions.class)).build();
+ }
+
+ Datasets datasetService = bigQueryClient.datasets();
+ if (executeNullIfNotFound(datasetService.get(projectId, datasetId)) == null) {
+ Dataset newDataset = new Dataset().setDatasetReference(
+ new DatasetReference().setProjectId(projectId).setDatasetId(datasetId));
+ datasetService.insert(projectId, newDataset).execute();
+ }
+
+ Tables tableService = bigQueryClient.tables();
+ Table table = executeNullIfNotFound(tableService.get(projectId, datasetId, tableId));
+ if (table == null) {
+ Table newTable = new Table().setSchema(schema).setTableReference(
+ new TableReference().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId));
+ tableService.insert(projectId, datasetId, newTable).execute();
+ } else if (!schema.equals(table.getSchema())) {
+ // Compare from the expected schema's side: an existing table may have no schema at all, and
+ // that must be reported as a mismatch rather than surface as a NullPointerException.
+ TableSchema actual = table.getSchema();
+ throw new RuntimeException(
+ "Table exists and schemas do not match, expecting: " + schema.toPrettyString()
+ + ", actual: " + (actual == null ? "null" : actual.toPrettyString()));
+ }
+ }
+
+ /**
+ * Creates the given Pub/Sub topic unless a lookup shows that it already exists.
+ *
+ * @throws IOException if a Pub/Sub request fails
+ */
+ private void setupPubsubTopic(String topic) throws IOException {
+ if (pubsubClient == null) {
+ pubsubClient = Transport.newPubsubClient(options).build();
+ }
+ boolean topicExists =
+ executeNullIfNotFound(pubsubClient.projects().topics().get(topic)) != null;
+ if (topicExists) {
+ return;
+ }
+ pubsubClient.projects().topics().create(topic, new Topic().setName(topic)).execute();
+ }
+
+ /**
+ * Deletes the Google Cloud Pub/Sub topic, if a lookup shows that it exists.
+ *
+ * @throws IOException if there is a problem deleting the Pub/Sub topic
+ */
+ private void deletePubsubTopic(String topic) throws IOException {
+ if (pubsubClient == null) {
+ pubsubClient = Transport.newPubsubClient(options).build();
+ }
+ boolean topicMissing =
+ executeNullIfNotFound(pubsubClient.projects().topics().get(topic)) == null;
+ if (topicMissing) {
+ return;
+ }
+ pubsubClient.projects().topics().delete(topic).execute();
+ }
+
+ /**
+ * If this is an unbounded (streaming) pipeline, and both inputFile and pubsub topic are defined
+ * (non-null and non-empty), start an 'injector' pipeline that publishes the contents of the file
+ * to the given topic. Otherwise do nothing.
+ */
+ public void startInjectorIfNeeded(String inputFile) {
+ ExamplePubsubTopicOptions topicOptions = options.as(ExamplePubsubTopicOptions.class);
+ if (!topicOptions.isStreaming()) {
+ return;
+ }
+ if (inputFile == null || inputFile.isEmpty()) {
+ return;
+ }
+ if (topicOptions.getPubsubTopic() == null || topicOptions.getPubsubTopic().isEmpty()) {
+ return;
+ }
+ runInjectorPipeline(inputFile, topicOptions.getPubsubTopic());
+ }
+
+ /**
+ * Do some runner setup: check that the DirectPipelineRunner is not used in conjunction with
+ * streaming, and if streaming is specified, use the DataflowPipelineRunner. Non-streaming
+ * options are left untouched.
+ *
+ * @throws IllegalArgumentException if streaming is combined with the DirectPipelineRunner
+ */
+ public void setupRunner() {
+ if (!options.isStreaming()) {
+ return;
+ }
+ if (options.getRunner() == DirectPipelineRunner.class) {
+ throw new IllegalArgumentException(
+ "Processing of unbounded input sources is not supported with the DirectPipelineRunner.");
+ }
+ // DataflowPipelineRunner is forced so that the pipelines can be cancelled automatically.
+ options.setRunner(DataflowPipelineRunner.class);
+ }
+
+ /**
+ * Runs the batch injector for the streaming pipeline.
+ *
+ * <p>The injector pipeline reads from the given text file and injects the data into the Google
+ * Cloud Pub/Sub topic. It runs on a non-streaming copy of the options, under the job name of
+ * this example suffixed with "-injector", and is remembered as a job to cancel.
+ */
+ public void runInjectorPipeline(String inputFile, String topic) {
+ DataflowPipelineOptions injectorOptions = options.cloneAs(DataflowPipelineOptions.class);
+ injectorOptions.setStreaming(false);
+ injectorOptions.setNumWorkers(
+ options.as(ExamplePubsubTopicOptions.class).getInjectorNumWorkers());
+ injectorOptions.setJobName(options.getJobName() + "-injector");
+
+ Pipeline injector = Pipeline.create(injectorOptions);
+ injector
+ .apply(TextIO.Read.from(inputFile))
+ .apply(IntraBundleParallelization
+ .of(PubsubFileInjector.publish(topic))
+ .withMaxParallelism(20));
+
+ jobsToCancel.add((DataflowPipelineJob) injector.run());
+ }
+
+ /**
+ * Runs the provided injector pipeline for the streaming pipeline and remembers the resulting
+ * job as one to cancel.
+ */
+ public void runInjectorPipeline(Pipeline injectorPipeline) {
+ jobsToCancel.add((DataflowPipelineJob) injectorPipeline.run());
+ }
+
+ /**
+ * Start the auxiliary injector pipeline, then wait for this pipeline to finish.
+ *
+ * @param inputFile the file whose contents the injector publishes; the injector is only started
+ * when {@link #startInjectorIfNeeded(String)} decides it is needed
+ * @param result the result of the already-started main pipeline to wait on
+ */
+ public void mockUnboundedSource(String inputFile, PipelineResult result) {
+ startInjectorIfNeeded(inputFile);
+ waitToFinish(result);
+ }
+
+ /**
+ * If {@literal DataflowPipelineRunner} or {@literal BlockingDataflowPipelineRunner} is used,
+ * waits for the pipeline to finish and cancels it (and the injector) before the program exits.
+ *
+ * <p>Results that are not a {@link DataflowPipelineJob} are ignored. Unless the options ask to
+ * keep jobs running, a shutdown hook that cancels the tracked jobs is registered before waiting.
+ *
+ * @throws RuntimeException if waiting for the job fails; the original exception is its cause
+ */
+ public void waitToFinish(PipelineResult result) {
+ if (!(result instanceof DataflowPipelineJob)) {
+ // Do nothing if the given PipelineResult doesn't support waitToFinish(),
+ // such as EvaluationResults returned by DirectPipelineRunner.
+ return;
+ }
+ final DataflowPipelineJob job = (DataflowPipelineJob) result;
+ jobsToCancel.add(job);
+ if (!options.as(DataflowExampleOptions.class).getKeepJobsRunning()) {
+ addShutdownHook(jobsToCancel);
+ }
+ try {
+ job.waitToFinish(-1, TimeUnit.SECONDS, new MonitoringUtil.PrintHandler(System.out));
+ } catch (Exception e) {
+ if (e instanceof InterruptedException) {
+ // Restore the interrupt status so code further up the stack can still observe it.
+ Thread.currentThread().interrupt();
+ }
+ // Keep the original exception as the cause so the failure can be diagnosed.
+ throw new RuntimeException("Failed to wait for job to finish: " + job.getJobId(), e);
+ }
+ }
+
+ /**
+ * Registers a JVM shutdown hook that tears down the external resources, prints the pending
+ * messages, requests cancellation of every given job and then polls each job until it reports
+ * a terminal state (up to 6 checks per job, sleeping 10 seconds after each unsuccessful check).
+ *
+ * @param jobs the jobs to cancel when the JVM shuts down
+ */
+ private void addShutdownHook(final Collection<DataflowPipelineJob> jobs) {
+ // NOTE(review): the client is cached in a field here, but the hook below does not reference
+ // it directly.
+ if (dataflowClient == null) {
+ dataflowClient = options.getDataflowClient();
+ }
+
+ Runtime.getRuntime().addShutdownHook(new Thread() {
+ @Override
+ public void run() {
+ tearDown();
+ printPendingMessages();
+ // First pass: request cancellation of every job. A failure is only reported, so the
+ // remaining jobs are still attempted.
+ for (DataflowPipelineJob job : jobs) {
+ System.out.println("Canceling example pipeline: " + job.getJobId());
+ try {
+ job.cancel();
+ } catch (IOException e) {
+ System.out.println("Failed to cancel the job,"
+ + " please go to the Developers Console to cancel it manually");
+ System.out.println(
+ MonitoringUtil.getJobMonitoringPageURL(job.getProjectId(), job.getJobId()));
+ }
+ }
+
+ // Second pass: verify that each job actually reached a terminal state.
+ for (DataflowPipelineJob job : jobs) {
+ boolean cancellationVerified = false;
+ for (int retryAttempts = 6; retryAttempts > 0; retryAttempts--) {
+ if (job.getState().isTerminal()) {
+ cancellationVerified = true;
+ System.out.println("Canceled example pipeline: " + job.getJobId());
+ break;
+ } else {
+ System.out.println(
+ "The example pipeline is still running. Verifying the cancellation.");
+ }
+ try {
+ Thread.sleep(10000);
+ } catch (InterruptedException e) {
+ // Ignore: an interrupted sleep simply moves on to the next check.
+ }
+ }
+ if (!cancellationVerified) {
+ System.out.println("Failed to verify the cancellation for job: " + job.getJobId());
+ System.out.println("Please go to the Developers Console to verify manually:");
+ System.out.println(
+ MonitoringUtil.getJobMonitoringPageURL(job.getProjectId(), job.getJobId()));
+ }
+ }
+ }
+ });
+ }
+
+ private void printPendingMessages() {
+ System.out.println();
+ System.out.println("***********************************************************");
+ System.out.println("***********************************************************");
+ for (String message : pendingMessages) {
+ System.out.println(message);
+ }
+ System.out.println("***********************************************************");
+ System.out.println("***********************************************************");
+ }
+
+ /**
+ * Executes the request and returns its result, mapping a "not found" response to {@code null}.
+ *
+ * @throws IOException if the request fails for any reason other than "not found"
+ */
+ private static <T> T executeNullIfNotFound(
+ AbstractGoogleClientRequest<T> request) throws IOException {
+ try {
+ return request.execute();
+ } catch (GoogleJsonResponseException e) {
+ if (e.getStatusCode() != HttpServletResponse.SC_NOT_FOUND) {
+ throw e;
+ }
+ return null;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java
new file mode 100644
index 0000000..bef5bfd
--- /dev/null
+++ b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package ${package}.common;
+
+import com.google.api.services.bigquery.model.TableSchema;
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+
+/**
+ * Options that can be used to configure BigQuery tables in Dataflow examples.
+ * The project defaults to the project being used to run the example.
+ */
+public interface ExampleBigQueryTableOptions extends DataflowPipelineOptions {
+ @Description("BigQuery dataset name")
+ @Default.String("dataflow_examples")
+ String getBigQueryDataset();
+ void setBigQueryDataset(String dataset);
+
+ @Description("BigQuery table name")
+ @Default.InstanceFactory(BigQueryTableFactory.class)
+ String getBigQueryTable();
+ void setBigQueryTable(String table);
+
+ @Description("BigQuery table schema")
+ TableSchema getBigQuerySchema();
+ void setBigQuerySchema(TableSchema schema);
+
+ /**
+ * Defaults the BigQuery table name to the job name with every '-' replaced by '_'.
+ */
+ static class BigQueryTableFactory implements DefaultValueFactory<String> {
+ @Override
+ public String create(PipelineOptions options) {
+ return options.as(DataflowPipelineOptions.class).getJobName()
+ .replace('-', '_');
+ }
+ }
+}
[28/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/DataflowAssert.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/DataflowAssert.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/DataflowAssert.java
deleted file mode 100644
index 6c9643c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/DataflowAssert.java
+++ /dev/null
@@ -1,825 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.testing;
-
-import static org.hamcrest.Matchers.containsInAnyOrder;
-import static org.hamcrest.Matchers.equalTo;
-import static org.hamcrest.Matchers.not;
-import static org.junit.Assert.assertThat;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.IterableCoder;
-import com.google.cloud.dataflow.sdk.coders.KvCoder;
-import com.google.cloud.dataflow.sdk.coders.MapCoder;
-import com.google.cloud.dataflow.sdk.coders.VoidCoder;
-import com.google.cloud.dataflow.sdk.options.StreamingOptions;
-import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
-import com.google.cloud.dataflow.sdk.transforms.Sum;
-import com.google.cloud.dataflow.sdk.transforms.View;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.util.CoderUtils;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PBegin;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.PDone;
-import com.google.common.base.Optional;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.Serializable;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-import java.util.NoSuchElementException;
-
-/**
- * An assertion on the contents of a {@link PCollection}
- * incorporated into the pipeline. Such an assertion
- * can be checked no matter what kind of {@link PipelineRunner} is
- * used.
- *
- * <p>Note that the {@code DataflowAssert} call must precede the call
- * to {@link Pipeline#run}.
- *
- * <p>Examples of use:
- * <pre>{@code
- * Pipeline p = TestPipeline.create();
- * ...
- * PCollection<String> output =
- * input
- * .apply(ParDo.of(new TestDoFn()));
- * DataflowAssert.that(output)
- * .containsInAnyOrder("out1", "out2", "out3");
- * ...
- * PCollection<Integer> ints = ...
- * PCollection<Integer> sum =
- * ints
- * .apply(Combine.globally(new SumInts()));
- * DataflowAssert.that(sum)
- * .is(42);
- * ...
- * p.run();
- * }</pre>
- *
- * <p>JUnit and Hamcrest must be linked in by any code that uses DataflowAssert.
- */
-public class DataflowAssert {
-
- private static final Logger LOG = LoggerFactory.getLogger(DataflowAssert.class);
-
- static final String SUCCESS_COUNTER = "DataflowAssertSuccess";
- static final String FAILURE_COUNTER = "DataflowAssertFailure";
-
- private static int assertCount = 0;
-
- // Do not instantiate.
- private DataflowAssert() {}
-
- /**
- * Constructs an {@link IterableAssert} for the elements of the provided
- * {@link PCollection}.
- */
- public static <T> IterableAssert<T> that(PCollection<T> actual) {
- return new IterableAssert<>(
- new CreateActual<T, Iterable<T>>(actual, View.<T>asIterable()),
- actual.getPipeline())
- .setCoder(actual.getCoder());
- }
-
- /**
- * Constructs an {@link IterableAssert} for the value of the provided
- * {@link PCollection} which must contain a single {@code Iterable<T>}
- * value.
- */
- public static <T> IterableAssert<T>
- thatSingletonIterable(PCollection<? extends Iterable<T>> actual) {
-
- List<? extends Coder<?>> maybeElementCoder = actual.getCoder().getCoderArguments();
- Coder<T> tCoder;
- try {
- @SuppressWarnings("unchecked")
- Coder<T> tCoderTmp = (Coder<T>) Iterables.getOnlyElement(maybeElementCoder);
- tCoder = tCoderTmp;
- } catch (NoSuchElementException | IllegalArgumentException exc) {
- throw new IllegalArgumentException(
- "DataflowAssert.<T>thatSingletonIterable requires a PCollection<Iterable<T>>"
- + " with a Coder<Iterable<T>> where getCoderArguments() yields a"
- + " single Coder<T> to apply to the elements.");
- }
-
- @SuppressWarnings("unchecked") // Safe covariant cast
- PCollection<Iterable<T>> actualIterables = (PCollection<Iterable<T>>) actual;
-
- return new IterableAssert<>(
- new CreateActual<Iterable<T>, Iterable<T>>(
- actualIterables, View.<Iterable<T>>asSingleton()),
- actual.getPipeline())
- .setCoder(tCoder);
- }
-
- /**
- * Constructs an {@link IterableAssert} for the value of the provided
- * {@code PCollectionView PCollectionView<Iterable<T>>}.
- */
- public static <T> IterableAssert<T> thatIterable(PCollectionView<Iterable<T>> actual) {
- return new IterableAssert<>(new PreExisting<Iterable<T>>(actual), actual.getPipeline());
- }
-
- /**
- * Constructs a {@link SingletonAssert} for the value of the provided
- * {@code PCollection PCollection<T>}, which must be a singleton.
- */
- public static <T> SingletonAssert<T> thatSingleton(PCollection<T> actual) {
- return new SingletonAssert<>(
- new CreateActual<T, T>(actual, View.<T>asSingleton()), actual.getPipeline())
- .setCoder(actual.getCoder());
- }
-
- /**
- * Constructs a {@link SingletonAssert} for the value of the provided {@link PCollection}.
- *
- * <p>Note that the actual value must be coded by a {@link KvCoder},
- * not just any {@code Coder<K, V>}.
- */
- public static <K, V> SingletonAssert<Map<K, Iterable<V>>>
- thatMultimap(PCollection<KV<K, V>> actual) {
- @SuppressWarnings("unchecked")
- KvCoder<K, V> kvCoder = (KvCoder<K, V>) actual.getCoder();
-
- return new SingletonAssert<>(
- new CreateActual<>(actual, View.<K, V>asMultimap()), actual.getPipeline())
- .setCoder(MapCoder.of(kvCoder.getKeyCoder(), IterableCoder.of(kvCoder.getValueCoder())));
- }
-
- /**
- * Constructs a {@link SingletonAssert} for the value of the provided {@link PCollection},
- * which must have at most one value per key.
- *
- * <p>Note that the actual value must be coded by a {@link KvCoder},
- * not just any {@code Coder<K, V>}.
- */
- public static <K, V> SingletonAssert<Map<K, V>> thatMap(PCollection<KV<K, V>> actual) {
- @SuppressWarnings("unchecked")
- KvCoder<K, V> kvCoder = (KvCoder<K, V>) actual.getCoder();
-
- return new SingletonAssert<>(
- new CreateActual<>(actual, View.<K, V>asMap()), actual.getPipeline())
- .setCoder(MapCoder.of(kvCoder.getKeyCoder(), kvCoder.getValueCoder()));
- }
-
- ////////////////////////////////////////////////////////////
-
- /**
- * An assertion about the contents of a {@link PCollectionView} yielding an {@code Iterable<T>}.
- */
- public static class IterableAssert<T> implements Serializable {
- private final Pipeline pipeline;
- private final PTransform<PBegin, PCollectionView<Iterable<T>>> createActual;
- private Optional<Coder<T>> coder;
-
- protected IterableAssert(
- PTransform<PBegin, PCollectionView<Iterable<T>>> createActual, Pipeline pipeline) {
- this.createActual = createActual;
- this.pipeline = pipeline;
- this.coder = Optional.absent();
- }
-
- /**
- * Sets the coder to use for elements of type {@code T}, as needed for internal purposes.
- *
- * <p>Returns this {@code IterableAssert}.
- */
- public IterableAssert<T> setCoder(Coder<T> coderOrNull) {
- this.coder = Optional.fromNullable(coderOrNull);
- return this;
- }
-
- /**
- * Gets the coder, which may yet be absent.
- */
- public Coder<T> getCoder() {
- if (coder.isPresent()) {
- return coder.get();
- } else {
- throw new IllegalStateException(
- "Attempting to access the coder of an IterableAssert"
- + " that has not been set yet.");
- }
- }
-
- /**
- * Applies a {@link SerializableFunction} to check the elements of the {@code Iterable}.
- *
- * <p>Returns this {@code IterableAssert}.
- */
- public IterableAssert<T> satisfies(SerializableFunction<Iterable<T>, Void> checkerFn) {
- pipeline.apply(
- "DataflowAssert$" + (assertCount++),
- new OneSideInputAssert<Iterable<T>>(createActual, checkerFn));
- return this;
- }
-
- /**
- * Applies a {@link SerializableFunction} to check the elements of the {@code Iterable}.
- *
- * <p>Returns this {@code IterableAssert}.
- */
- public IterableAssert<T> satisfies(
- AssertRelation<Iterable<T>, Iterable<T>> relation,
- final Iterable<T> expectedElements) {
- pipeline.apply(
- "DataflowAssert$" + (assertCount++),
- new TwoSideInputAssert<Iterable<T>, Iterable<T>>(createActual,
- new CreateExpected<T, Iterable<T>>(expectedElements, coder, View.<T>asIterable()),
- relation));
-
- return this;
- }
-
- /**
- * Applies a {@link SerializableMatcher} to check the elements of the {@code Iterable}.
- *
- * <p>Returns this {@code IterableAssert}.
- */
- IterableAssert<T> satisfies(final SerializableMatcher<Iterable<? extends T>> matcher) {
- // Safe covariant cast. Could be elided by changing a lot of this file to use
- // more flexible bounds.
- @SuppressWarnings({"rawtypes", "unchecked"})
- SerializableFunction<Iterable<T>, Void> checkerFn =
- (SerializableFunction) new MatcherCheckerFn<>(matcher);
- pipeline.apply(
- "DataflowAssert$" + (assertCount++),
- new OneSideInputAssert<Iterable<T>>(
- createActual,
- checkerFn));
- return this;
- }
-
- private static class MatcherCheckerFn<T> implements SerializableFunction<T, Void> {
- private SerializableMatcher<T> matcher;
-
- public MatcherCheckerFn(SerializableMatcher<T> matcher) {
- this.matcher = matcher;
- }
-
- @Override
- public Void apply(T actual) {
- assertThat(actual, matcher);
- return null;
- }
- }
-
- /**
- * Checks that the {@code Iterable} is empty.
- *
- * <p>Returns this {@code IterableAssert}.
- */
- public IterableAssert<T> empty() {
- return satisfies(new AssertContainsInAnyOrderRelation<T>(), Collections.<T>emptyList());
- }
-
- /**
- * @throws UnsupportedOperationException always
- * @deprecated {@link Object#equals(Object)} is not supported on DataflowAssert objects.
- * If you meant to test object equality, use a variant of {@link #containsInAnyOrder}
- * instead.
- */
- @Deprecated
- @Override
- public boolean equals(Object o) {
- throw new UnsupportedOperationException(
- "If you meant to test object equality, use .containsInAnyOrder instead.");
- }
-
- /**
- * @throws UnsupportedOperationException always.
- * @deprecated {@link Object#hashCode()} is not supported on DataflowAssert objects.
- */
- @Deprecated
- @Override
- public int hashCode() {
- throw new UnsupportedOperationException(
- String.format("%s.hashCode() is not supported.", IterableAssert.class.getSimpleName()));
- }
-
- /**
- * Checks that the {@code Iterable} contains the expected elements, in any
- * order.
- *
- * <p>Returns this {@code IterableAssert}.
- */
- public IterableAssert<T> containsInAnyOrder(Iterable<T> expectedElements) {
- return satisfies(new AssertContainsInAnyOrderRelation<T>(), expectedElements);
- }
-
- /**
- * Checks that the {@code Iterable} contains the expected elements, in any
- * order.
- *
- * <p>Returns this {@code IterableAssert}.
- */
- @SafeVarargs
- public final IterableAssert<T> containsInAnyOrder(T... expectedElements) {
- return satisfies(
- new AssertContainsInAnyOrderRelation<T>(),
- Arrays.asList(expectedElements));
- }
-
- /**
- * Checks that the {@code Iterable} contains elements that match the provided matchers,
- * in any order.
- *
- * <p>Returns this {@code IterableAssert}.
- */
- @SafeVarargs
- final IterableAssert<T> containsInAnyOrder(
- SerializableMatcher<? super T>... elementMatchers) {
- return satisfies(SerializableMatchers.<T>containsInAnyOrder(elementMatchers));
- }
- }
-
- /**
- * An assertion about the single value of type {@code T}
- * associated with a {@link PCollectionView}.
- */
- public static class SingletonAssert<T> implements Serializable {
- private final Pipeline pipeline;
- private final CreateActual<?, T> createActual;
- private Optional<Coder<T>> coder;
-
- protected SingletonAssert(
- CreateActual<?, T> createActual, Pipeline pipeline) {
- this.pipeline = pipeline;
- this.createActual = createActual;
- this.coder = Optional.absent();
- }
-
- /**
- * Always throws an {@link UnsupportedOperationException}: users are probably looking for
- * {@link #isEqualTo}.
- */
- @Deprecated
- @Override
- public boolean equals(Object o) {
- throw new UnsupportedOperationException(
- String.format(
- "tests for Java equality of the %s object, not the PCollection in question. "
- + "Call a test method, such as isEqualTo.",
- getClass().getSimpleName()));
- }
-
- /**
- * @throws UnsupportedOperationException always.
- * @deprecated {@link Object#hashCode()} is not supported on DataflowAssert objects.
- */
- @Deprecated
- @Override
- public int hashCode() {
- throw new UnsupportedOperationException(
- String.format("%s.hashCode() is not supported.", SingletonAssert.class.getSimpleName()));
- }
-
- /**
- * Sets the coder to use for elements of type {@code T}, as needed
- * for internal purposes.
- *
- * <p>Returns this {@code IterableAssert}.
- */
- public SingletonAssert<T> setCoder(Coder<T> coderOrNull) {
- this.coder = Optional.fromNullable(coderOrNull);
- return this;
- }
-
- /**
- * Gets the coder, which may yet be absent.
- */
- public Coder<T> getCoder() {
- if (coder.isPresent()) {
- return coder.get();
- } else {
- throw new IllegalStateException(
- "Attempting to access the coder of an IterableAssert that has not been set yet.");
- }
- }
-
- /**
- * Applies a {@link SerializableFunction} to check the value of this
- * {@code SingletonAssert}'s view.
- *
- * <p>Returns this {@code SingletonAssert}.
- */
- public SingletonAssert<T> satisfies(SerializableFunction<T, Void> checkerFn) {
- pipeline.apply(
- "DataflowAssert$" + (assertCount++),
- new OneSideInputAssert<T>(createActual, checkerFn));
- return this;
- }
-
- /**
- * Applies an {@link AssertRelation} to check the provided relation against the
- * value of this assert and the provided expected value.
- *
- * <p>Returns this {@code SingletonAssert}.
- */
- public SingletonAssert<T> satisfies(
- AssertRelation<T, T> relation,
- final T expectedValue) {
- pipeline.apply(
- "DataflowAssert$" + (assertCount++),
- new TwoSideInputAssert<T, T>(createActual,
- new CreateExpected<T, T>(Arrays.asList(expectedValue), coder, View.<T>asSingleton()),
- relation));
-
- return this;
- }
-
- /**
- * Checks that the value of this {@code SingletonAssert}'s view is equal
- * to the expected value.
- *
- * <p>Returns this {@code SingletonAssert}.
- */
- public SingletonAssert<T> isEqualTo(T expectedValue) {
- return satisfies(new AssertIsEqualToRelation<T>(), expectedValue);
- }
-
- /**
- * Checks that the value of this {@code SingletonAssert}'s view is not equal
- * to the expected value.
- *
- * <p>Returns this {@code SingletonAssert}.
- */
- public SingletonAssert<T> notEqualTo(T expectedValue) {
- return satisfies(new AssertNotEqualToRelation<T>(), expectedValue);
- }
-
- /**
- * Checks that the value of this {@code SingletonAssert}'s view is equal to
- * the expected value.
- *
- * @deprecated replaced by {@link #isEqualTo}
- */
- @Deprecated
- public SingletonAssert<T> is(T expectedValue) {
- return isEqualTo(expectedValue);
- }
-
- }
-
- ////////////////////////////////////////////////////////////////////////
-
- private static class CreateActual<T, ActualT>
- extends PTransform<PBegin, PCollectionView<ActualT>> {
-
- private final transient PCollection<T> actual;
- private final transient PTransform<PCollection<T>, PCollectionView<ActualT>> actualView;
-
- private CreateActual(PCollection<T> actual,
- PTransform<PCollection<T>, PCollectionView<ActualT>> actualView) {
- this.actual = actual;
- this.actualView = actualView;
- }
-
- @Override
- public PCollectionView<ActualT> apply(PBegin input) {
- final Coder<T> coder = actual.getCoder();
- return actual
- .apply(Window.<T>into(new GlobalWindows()))
- .apply(ParDo.of(new DoFn<T, T>() {
- @Override
- public void processElement(ProcessContext context) throws CoderException {
- context.output(CoderUtils.clone(coder, context.element()));
- }
- }))
- .apply(actualView);
- }
- }
-
- private static class CreateExpected<T, ExpectedT>
- extends PTransform<PBegin, PCollectionView<ExpectedT>> {
-
- private final Iterable<T> elements;
- private final Optional<Coder<T>> coder;
- private final transient PTransform<PCollection<T>, PCollectionView<ExpectedT>> view;
-
- private CreateExpected(Iterable<T> elements, Optional<Coder<T>> coder,
- PTransform<PCollection<T>, PCollectionView<ExpectedT>> view) {
- this.elements = elements;
- this.coder = coder;
- this.view = view;
- }
-
- @Override
- public PCollectionView<ExpectedT> apply(PBegin input) {
- Create.Values<T> createTransform = Create.<T>of(elements);
- if (coder.isPresent()) {
- createTransform = createTransform.withCoder(coder.get());
- }
- return input.apply(createTransform).apply(view);
- }
- }
-
- private static class PreExisting<T> extends PTransform<PBegin, PCollectionView<T>> {
-
- private final PCollectionView<T> view;
-
- private PreExisting(PCollectionView<T> view) {
- this.view = view;
- }
-
- @Override
- public PCollectionView<T> apply(PBegin input) {
- return view;
- }
- }
-
- /**
- * An assertion checker that takes a single
- * {@link PCollectionView PCollectionView<ActualT>}
- * and an assertion over {@code ActualT}, and checks it within a dataflow
- * pipeline.
- *
- * <p>Note that the entire assertion must be serializable. If
- * you need to make assertions involving multiple inputs
- * that are each not serializable, use TwoSideInputAssert.
- *
- * <p>This is generally useful for assertion functions that
- * are serializable but whose underlying data may not have a coder.
- */
- static class OneSideInputAssert<ActualT>
- extends PTransform<PBegin, PDone> implements Serializable {
- private final transient PTransform<PBegin, PCollectionView<ActualT>> createActual;
- private final SerializableFunction<ActualT, Void> checkerFn;
-
- public OneSideInputAssert(
- PTransform<PBegin, PCollectionView<ActualT>> createActual,
- SerializableFunction<ActualT, Void> checkerFn) {
- this.createActual = createActual;
- this.checkerFn = checkerFn;
- }
-
- @Override
- public PDone apply(PBegin input) {
- final PCollectionView<ActualT> actual = input.apply("CreateActual", createActual);
-
- input
- .apply(Create.<Void>of((Void) null).withCoder(VoidCoder.of()))
- .apply(ParDo.named("RunChecks").withSideInputs(actual)
- .of(new CheckerDoFn<>(checkerFn, actual)));
-
- return PDone.in(input.getPipeline());
- }
- }
-
- /**
- * A {@link DoFn} that runs a checking {@link SerializableFunction} on the contents of
- * a {@link PCollectionView}, and adjusts counters and thrown exceptions for use in testing.
- */
- private static class CheckerDoFn<ActualT> extends DoFn<Void, Void> {
- private final SerializableFunction<ActualT, Void> checkerFn;
- private final Aggregator<Integer, Integer> success =
- createAggregator(SUCCESS_COUNTER, new Sum.SumIntegerFn());
- private final Aggregator<Integer, Integer> failure =
- createAggregator(FAILURE_COUNTER, new Sum.SumIntegerFn());
- private final PCollectionView<ActualT> actual;
-
- private CheckerDoFn(
- SerializableFunction<ActualT, Void> checkerFn,
- PCollectionView<ActualT> actual) {
- this.checkerFn = checkerFn;
- this.actual = actual;
- }
-
- @Override
- public void processElement(ProcessContext c) {
- try {
- ActualT actualContents = c.sideInput(actual);
- checkerFn.apply(actualContents);
- success.addValue(1);
- } catch (Throwable t) {
- LOG.error("DataflowAssert failed expectations.", t);
- failure.addValue(1);
- // TODO: allow for metrics to propagate on failure when running a streaming pipeline
- if (!c.getPipelineOptions().as(StreamingOptions.class).isStreaming()) {
- throw t;
- }
- }
- }
- }
-
- /**
- * An assertion checker that takes a {@link PCollectionView PCollectionView<ActualT>},
- * a {@link PCollectionView PCollectionView<ExpectedT>}, a relation
- * over {@code A} and {@code B}, and checks that the relation holds
- * within a dataflow pipeline.
- *
- * <p>This is useful when either/both of {@code A} and {@code B}
- * are not serializable, but have coders (provided
- * by the underlying {@link PCollection}s).
- */
- static class TwoSideInputAssert<ActualT, ExpectedT>
- extends PTransform<PBegin, PDone> implements Serializable {
-
- private final transient PTransform<PBegin, PCollectionView<ActualT>> createActual;
- private final transient PTransform<PBegin, PCollectionView<ExpectedT>> createExpected;
- private final AssertRelation<ActualT, ExpectedT> relation;
-
- protected TwoSideInputAssert(
- PTransform<PBegin, PCollectionView<ActualT>> createActual,
- PTransform<PBegin, PCollectionView<ExpectedT>> createExpected,
- AssertRelation<ActualT, ExpectedT> relation) {
- this.createActual = createActual;
- this.createExpected = createExpected;
- this.relation = relation;
- }
-
- @Override
- public PDone apply(PBegin input) {
- final PCollectionView<ActualT> actual = input.apply("CreateActual", createActual);
- final PCollectionView<ExpectedT> expected = input.apply("CreateExpected", createExpected);
-
- input
- .apply(Create.<Void>of((Void) null).withCoder(VoidCoder.of()))
- .apply(ParDo.named("RunChecks").withSideInputs(actual, expected)
- .of(new CheckerDoFn<>(relation, actual, expected)));
-
- return PDone.in(input.getPipeline());
- }
-
- private static class CheckerDoFn<ActualT, ExpectedT> extends DoFn<Void, Void> {
- private final Aggregator<Integer, Integer> success =
- createAggregator(SUCCESS_COUNTER, new Sum.SumIntegerFn());
- private final Aggregator<Integer, Integer> failure =
- createAggregator(FAILURE_COUNTER, new Sum.SumIntegerFn());
- private final AssertRelation<ActualT, ExpectedT> relation;
- private final PCollectionView<ActualT> actual;
- private final PCollectionView<ExpectedT> expected;
-
- private CheckerDoFn(AssertRelation<ActualT, ExpectedT> relation,
- PCollectionView<ActualT> actual, PCollectionView<ExpectedT> expected) {
- this.relation = relation;
- this.actual = actual;
- this.expected = expected;
- }
-
- @Override
- public void processElement(ProcessContext c) {
- try {
- ActualT actualContents = c.sideInput(actual);
- ExpectedT expectedContents = c.sideInput(expected);
- relation.assertFor(expectedContents).apply(actualContents);
- success.addValue(1);
- } catch (Throwable t) {
- LOG.error("DataflowAssert failed expectations.", t);
- failure.addValue(1);
- // TODO: allow for metrics to propagate on failure when running a streaming pipeline
- if (!c.getPipelineOptions().as(StreamingOptions.class).isStreaming()) {
- throw t;
- }
- }
- }
- }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A {@link SerializableFunction} that verifies that an actual value is equal to an
- * expected value.
- */
- private static class AssertIsEqualTo<T> implements SerializableFunction<T, Void> {
- private T expected;
-
- public AssertIsEqualTo(T expected) {
- this.expected = expected;
- }
-
- @Override
- public Void apply(T actual) {
- assertThat(actual, equalTo(expected));
- return null;
- }
- }
-
- /**
- * A {@link SerializableFunction} that verifies that an actual value is not equal to an
- * expected value.
- */
- private static class AssertNotEqualTo<T> implements SerializableFunction<T, Void> {
- private T expected;
-
- public AssertNotEqualTo(T expected) {
- this.expected = expected;
- }
-
- @Override
- public Void apply(T actual) {
- assertThat(actual, not(equalTo(expected)));
- return null;
- }
- }
-
- /**
- * A {@link SerializableFunction} that verifies that an {@code Iterable} contains
- * expected items in any order.
- */
- private static class AssertContainsInAnyOrder<T>
- implements SerializableFunction<Iterable<T>, Void> {
- private T[] expected;
-
- @SafeVarargs
- public AssertContainsInAnyOrder(T... expected) {
- this.expected = expected;
- }
-
- @SuppressWarnings("unchecked")
- public AssertContainsInAnyOrder(Collection<T> expected) {
- this((T[]) expected.toArray());
- }
-
- public AssertContainsInAnyOrder(Iterable<T> expected) {
- this(Lists.<T>newArrayList(expected));
- }
-
- @Override
- public Void apply(Iterable<T> actual) {
- assertThat(actual, containsInAnyOrder(expected));
- return null;
- }
- }
-
- ////////////////////////////////////////////////////////////
-
- /**
- * A binary predicate between types {@code Actual} and {@code Expected}.
- * Implemented as a method {@code assertFor(Expected)} which returns
- * a {@code SerializableFunction<Actual, Void>}
- * that should verify the assertion..
- */
- private static interface AssertRelation<ActualT, ExpectedT> extends Serializable {
- public SerializableFunction<ActualT, Void> assertFor(ExpectedT input);
- }
-
- /**
- * An {@link AssertRelation} implementing the binary predicate that two objects are equal.
- */
- private static class AssertIsEqualToRelation<T>
- implements AssertRelation<T, T> {
- @Override
- public SerializableFunction<T, Void> assertFor(T expected) {
- return new AssertIsEqualTo<T>(expected);
- }
- }
-
- /**
- * An {@link AssertRelation} implementing the binary predicate that two objects are not equal.
- */
- private static class AssertNotEqualToRelation<T>
- implements AssertRelation<T, T> {
- @Override
- public SerializableFunction<T, Void> assertFor(T expected) {
- return new AssertNotEqualTo<T>(expected);
- }
- }
-
- /**
- * An {@code AssertRelation} implementing the binary predicate that two collections are equal
- * modulo reordering.
- */
- private static class AssertContainsInAnyOrderRelation<T>
- implements AssertRelation<Iterable<T>, Iterable<T>> {
- @Override
- public SerializableFunction<Iterable<T>, Void> assertFor(Iterable<T> expectedElements) {
- return new AssertContainsInAnyOrder<T>(expectedElements);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/RunnableOnService.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/RunnableOnService.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/RunnableOnService.java
deleted file mode 100644
index 60ab2e5..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/RunnableOnService.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.testing;
-
-/**
- * Category tag for tests that can be run on the
- * {@link com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner} if the
- * {@code runIntegrationTestOnService} System property is set to true.
- * Example usage:
- * <pre><code>
- * {@literal @}Test
- * {@literal @}Category(RunnableOnService.class)
- * public void testParDo() {...
- * </code></pre>
- */
-public interface RunnableOnService {}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/SerializableMatcher.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/SerializableMatcher.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/SerializableMatcher.java
deleted file mode 100644
index 10f221e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/SerializableMatcher.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.testing;
-
-import org.hamcrest.Matcher;
-
-import java.io.Serializable;
-
-/**
- * A {@link Matcher} that is also {@link Serializable}.
- *
- * <p>The interface declares no methods of its own; it only combines the two supertypes.
- *
- * <p>Such matchers can be used with {@link DataflowAssert}, which builds Dataflow pipelines
- * such that these matchers may be serialized and executed remotely.
- *
- * <p>To create a {@code SerializableMatcher}, extend {@link org.hamcrest.BaseMatcher}
- * and also implement this interface.
- *
- * @param <T> The type of value matched.
- */
-interface SerializableMatcher<T> extends Matcher<T>, Serializable {
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/SerializableMatchers.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/SerializableMatchers.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/SerializableMatchers.java
deleted file mode 100644
index da5171e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/SerializableMatchers.java
+++ /dev/null
@@ -1,1180 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.testing;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.ListCoder;
-import com.google.cloud.dataflow.sdk.util.CoderUtils;
-import com.google.cloud.dataflow.sdk.util.UserCodeException;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.common.base.MoreObjects;
-
-import org.hamcrest.BaseMatcher;
-import org.hamcrest.Description;
-import org.hamcrest.Matcher;
-import org.hamcrest.Matchers;
-
-import java.io.Serializable;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-
-import javax.annotation.Nullable;
-
-/**
- * Static class for building and using {@link SerializableMatcher} instances.
- *
- * <p>Most matchers are wrappers for hamcrest's {@link Matchers}. Please be familiar with the
- * documentation there. Values retained by a {@link SerializableMatcher} are required to be
- * serializable, either via Java serialization or via a provided {@link Coder}.
- *
- * <p>The following matchers are novel to Dataflow:
- * <ul>
- * <li>{@link #kvWithKey} for matching just the key of a {@link KV}.
- * <li>{@link #kvWithValue} for matching just the value of a {@link KV}.
- * <li>{@link #kv} for matching the key and value of a {@link KV}.
- * </ul>
- *
- * <p>For example, to match a group from
- * {@link com.google.cloud.dataflow.sdk.transforms.GroupByKey}, which has type
- * {@code KV<K, Iterable<V>>} for some {@code K} and {@code V} and where the order of the iterable
- * is undefined, use a matcher like
- * {@code kv(equalTo("some key"), containsInAnyOrder(1, 2, 3))}.
- */
-class SerializableMatchers implements Serializable {
-
- // The class is declared Serializable only because of capture by anonymous inner classes.
- // The constructor is private because the class is used through its static factory methods.
- private SerializableMatchers() { } // not instantiable
-
- /**
- * A {@link SerializableMatcher} with identical criteria to {@link Matchers#allOf(Iterable)}.
- */
- public static <T> SerializableMatcher<T>
- allOf(Iterable<SerializableMatcher<? super T>> serializableMatchers) {
-
- @SuppressWarnings({"rawtypes", "unchecked"}) // safe covariant cast
- final Iterable<Matcher<? super T>> matchers = (Iterable) serializableMatchers;
-
- return fromSupplier(new SerializableSupplier<Matcher<T>>() {
- @Override
- public Matcher<T> get() {
- return Matchers.allOf(matchers);
- }
- });
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#allOf(Matcher[])} to the given matchers.
-  */
- @SafeVarargs
- public static <T> SerializableMatcher<T> allOf(final SerializableMatcher<T>... matchers) {
-   final SerializableSupplier<Matcher<T>> supplier = new SerializableSupplier<Matcher<T>>() {
-     @Override
-     public Matcher<T> get() {
-       return Matchers.allOf(matchers);
-     }
-   };
-   return fromSupplier(supplier);
- }
-
- /**
- * A {@link SerializableMatcher} with identical criteria to {@link Matchers#anyOf(Iterable)}.
- */
- public static <T> SerializableMatcher<T>
- anyOf(Iterable<SerializableMatcher<? super T>> serializableMatchers) {
-
- @SuppressWarnings({"rawtypes", "unchecked"}) // safe covariant cast
- final Iterable<Matcher<? super T>> matchers = (Iterable) serializableMatchers;
-
- return fromSupplier(new SerializableSupplier<Matcher<T>>() {
- @Override
- public Matcher<T> get() {
- return Matchers.anyOf(matchers);
- }
- });
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#anyOf(Matcher[])} to the given matchers.
-  */
- @SafeVarargs
- public static <T> SerializableMatcher<T> anyOf(final SerializableMatcher<T>... matchers) {
-   final SerializableSupplier<Matcher<T>> supplier = new SerializableSupplier<Matcher<T>>() {
-     @Override
-     public Matcher<T> get() {
-       return Matchers.anyOf(matchers);
-     }
-   };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#anything()}.
-  */
- public static SerializableMatcher<Object> anything() {
-   final SerializableSupplier<Matcher<Object>> supplier =
-       new SerializableSupplier<Matcher<Object>>() {
-         @Override
-         public Matcher<Object> get() {
-           return Matchers.anything();
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
- * A {@link SerializableMatcher} with identical criteria to
- * {@link Matchers#arrayContaining(Object[])}.
- */
- @SafeVarargs
- public static <T extends Serializable> SerializableMatcher<T[]>
- arrayContaining(final T... items) {
- return fromSupplier(new SerializableSupplier<Matcher<T[]>>() {
- @Override
- public Matcher<T[]> get() {
- return Matchers.arrayContaining(items);
- }
- });
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#arrayContaining(Object[])} to the given items.
-  *
-  * <p>The items of type {@code T} are serialized with the supplied {@link Coder}; they are
-  * explicitly <i>not</i> required or expected to support Java serialization.
-  */
- @SafeVarargs
- public static <T> SerializableMatcher<T[]> arrayContaining(Coder<T> coder, T... items) {
-   final SerializableSupplier<T[]> itemsSupplier = new SerializableArrayViaCoder<>(coder, items);
-   final SerializableSupplier<Matcher<T[]>> supplier =
-       new SerializableSupplier<Matcher<T[]>>() {
-         @Override
-         public Matcher<T[]> get() {
-           return Matchers.arrayContaining(itemsSupplier.get());
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
- * A {@link SerializableMatcher} with identical criteria to
- * {@link Matchers#arrayContaining(Matcher[])}.
- */
- @SafeVarargs
- public static <T> SerializableMatcher<T[]>
- arrayContaining(final SerializableMatcher<? super T>... matchers) {
- return fromSupplier(new SerializableSupplier<Matcher<T[]>>() {
- @Override
- public Matcher<T[]> get() {
- return Matchers.<T>arrayContaining(matchers);
- }
- });
- }
-
- /**
- * A {@link SerializableMatcher} with identical criteria to
- * {@link Matchers#arrayContaining(List)}.
- */
- public static <T> SerializableMatcher<T[]>
- arrayContaining(List<SerializableMatcher<? super T>> serializableMatchers) {
-
- @SuppressWarnings({"rawtypes", "unchecked"}) // safe covariant cast
- final List<Matcher<? super T>> matchers = (List) serializableMatchers;
-
- return fromSupplier(new SerializableSupplier<Matcher<T[]>>() {
- @Override
- public Matcher<T[]> get() {
- return Matchers.arrayContaining(matchers);
- }
- });
- }
-
- /**
- * A {@link SerializableMatcher} with identical criteria to
- * {@link Matchers#arrayContainingInAnyOrder(Object[])}.
- */
- @SafeVarargs
- public static <T extends Serializable> SerializableMatcher<T[]>
- arrayContainingInAnyOrder(final T... items) {
-
- return fromSupplier(new SerializableSupplier<Matcher<T[]>>() {
- @Override
- public Matcher<T[]> get() {
- return Matchers.arrayContainingInAnyOrder(items);
- }
- });
- }
-
- /**
-  * A {@link SerializableMatcher} with identical criteria to
-  * {@link Matchers#arrayContainingInAnyOrder(Object[])}.
-  *
-  * <p>The items of type {@code T} will be serialized using the provided {@link Coder}. They are
-  * explicitly <i>not</i> required or expected to be serializable via Java serialization.
-  *
-  * @param coder the coder used to carry {@code items} in serialized form
-  * @param items the expected array elements, matched in any order
-  */
- @SafeVarargs
- public static <T> SerializableMatcher<T[]> arrayContainingInAnyOrder(Coder<T> coder, T... items) {
-
-   final SerializableSupplier<T[]> itemsSupplier =
-       new SerializableArrayViaCoder<>(coder, items);
-
-   return fromSupplier(new SerializableSupplier<Matcher<T[]>>() {
-     @Override
-     public Matcher<T[]> get() {
-       // Must be the order-insensitive matcher; arrayContaining would also enforce item order.
-       return Matchers.arrayContainingInAnyOrder(itemsSupplier.get());
-     }
-   });
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#arrayContainingInAnyOrder(Matcher[])} to the given element matchers.
-  */
- @SafeVarargs
- public static <T> SerializableMatcher<T[]> arrayContainingInAnyOrder(
-     final SerializableMatcher<? super T>... matchers) {
-   final SerializableSupplier<Matcher<T[]>> supplier =
-       new SerializableSupplier<Matcher<T[]>>() {
-         @Override
-         public Matcher<T[]> get() {
-           return Matchers.<T>arrayContainingInAnyOrder(matchers);
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#arrayContainingInAnyOrder(Collection)} to the given element matchers.
-  */
- public static <T> SerializableMatcher<T[]> arrayContainingInAnyOrder(
-     Collection<SerializableMatcher<? super T>> serializableMatchers) {
-
-   // Every SerializableMatcher is a Matcher, so this covariant view of the elements is safe.
-   @SuppressWarnings({"rawtypes", "unchecked"})
-   final Collection<Matcher<? super T>> matchers = (Collection) serializableMatchers;
-
-   final SerializableSupplier<Matcher<T[]>> supplier =
-       new SerializableSupplier<Matcher<T[]>>() {
-         @Override
-         public Matcher<T[]> get() {
-           return Matchers.arrayContainingInAnyOrder(matchers);
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#arrayWithSize(int)}.
-  */
- public static <T> SerializableMatcher<T[]> arrayWithSize(final int size) {
-   final SerializableSupplier<Matcher<T[]>> supplier =
-       new SerializableSupplier<Matcher<T[]>>() {
-         @Override
-         public Matcher<T[]> get() {
-           return Matchers.arrayWithSize(size);
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#arrayWithSize(Matcher)}.
-  */
- public static <T> SerializableMatcher<T[]> arrayWithSize(
-     final SerializableMatcher<? super Integer> sizeMatcher) {
-   final SerializableSupplier<Matcher<T[]>> supplier =
-       new SerializableSupplier<Matcher<T[]>>() {
-         @Override
-         public Matcher<T[]> get() {
-           return Matchers.arrayWithSize(sizeMatcher);
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#closeTo(double,double)}.
-  */
- public static SerializableMatcher<Double> closeTo(final double target, final double error) {
-   final SerializableSupplier<Matcher<Double>> supplier =
-       new SerializableSupplier<Matcher<Double>>() {
-         @Override
-         public Matcher<Double> get() {
-           return Matchers.closeTo(target, error);
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#contains(Object[])} to the given items.
-  */
- @SafeVarargs
- public static <T extends Serializable> SerializableMatcher<Iterable<? extends T>> contains(
-     final T... items) {
-   final SerializableSupplier<Matcher<Iterable<? extends T>>> supplier =
-       new SerializableSupplier<Matcher<Iterable<? extends T>>>() {
-         @Override
-         public Matcher<Iterable<? extends T>> get() {
-           return Matchers.contains(items);
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * A {@link SerializableMatcher} with identical criteria to
-  * {@link Matchers#contains(Object[])}.
-  *
-  * <p>The items of type {@code T} will be serialized using the provided {@link Coder}. They are
-  * explicitly <i>not</i> required or expected to be serializable via Java serialization.
-  *
-  * @param coder the coder used to carry {@code items} in serialized form
-  * @param items the expected elements, in the order they must appear
-  */
- @SafeVarargs
- public static <T> SerializableMatcher<Iterable<? extends T>>
-     contains(Coder<T> coder, T... items) {
-
-   final SerializableSupplier<T[]> itemsSupplier =
-       new SerializableArrayViaCoder<>(coder, items);
-
-   return fromSupplier(new SerializableSupplier<Matcher<Iterable<? extends T>>>() {
-     @Override
-     public Matcher<Iterable<? extends T>> get() {
-       // Must be the ordered matcher; containsInAnyOrder would accept any permutation.
-       return Matchers.contains(itemsSupplier.get());
-     }
-   });
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#contains(Matcher[])} to the given element matchers.
-  */
- @SafeVarargs
- public static <T> SerializableMatcher<Iterable<? extends T>> contains(
-     final SerializableMatcher<? super T>... matchers) {
-   final SerializableSupplier<Matcher<Iterable<? extends T>>> supplier =
-       new SerializableSupplier<Matcher<Iterable<? extends T>>>() {
-         @Override
-         public Matcher<Iterable<? extends T>> get() {
-           return Matchers.<T>contains(matchers);
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * A {@link SerializableMatcher} with identical criteria to
-  * {@link Matchers#contains(List)}.
-  *
-  * <p>No value of type {@code T} is retained by this overload, only the supplied matchers, so
-  * {@code T} is left unbounded like in the other matcher-collection overloads of this class.
-  *
-  * @param serializableMatchers one matcher per expected element, in order
-  */
- public static <T> SerializableMatcher<Iterable<? extends T>> contains(
-     List<SerializableMatcher<? super T>> serializableMatchers) {
-
-   @SuppressWarnings({"rawtypes", "unchecked"}) // safe covariant cast
-   final List<Matcher<? super T>> matchers = (List) serializableMatchers;
-
-   return fromSupplier(new SerializableSupplier<Matcher<Iterable<? extends T>>>() {
-     @Override
-     public Matcher<Iterable<? extends T>> get() {
-       return Matchers.contains(matchers);
-     }
-   });
- }
-
- /**
- * A {@link SerializableMatcher} with identical criteria to
- * {@link Matchers#containsInAnyOrder(Object[])}.
- */
- @SafeVarargs
- public static <T extends Serializable> SerializableMatcher<Iterable<? extends T>>
- containsInAnyOrder(final T... items) {
- return fromSupplier(new SerializableSupplier<Matcher<Iterable<? extends T>>>() {
- @Override
- public Matcher<Iterable<? extends T>> get() {
- return Matchers.containsInAnyOrder(items);
- }
- });
- }
-
- /**
- * A {@link SerializableMatcher} with identical criteria to
- * {@link Matchers#containsInAnyOrder(Object[])}.
- *
- * <p>The items of type {@code T} will be serialized using the provided {@link Coder}.
- * It is explicitly <i>not</i> required or expected to be serializable via Java serialization.
- */
- @SafeVarargs
- public static <T> SerializableMatcher<Iterable<? extends T>>
- containsInAnyOrder(Coder<T> coder, T... items) {
-
- final SerializableSupplier<T[]> itemsSupplier =
- new SerializableArrayViaCoder<>(coder, items);
-
- return fromSupplier(new SerializableSupplier<Matcher<Iterable<? extends T>>>() {
- @Override
- public Matcher<Iterable<? extends T>> get() {
- return Matchers.containsInAnyOrder(itemsSupplier.get());
- }
- });
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#containsInAnyOrder(Matcher[])} to the given element matchers.
-  */
- @SafeVarargs
- public static <T> SerializableMatcher<Iterable<? extends T>> containsInAnyOrder(
-     final SerializableMatcher<? super T>... matchers) {
-   final SerializableSupplier<Matcher<Iterable<? extends T>>> supplier =
-       new SerializableSupplier<Matcher<Iterable<? extends T>>>() {
-         @Override
-         public Matcher<Iterable<? extends T>> get() {
-           return Matchers.<T>containsInAnyOrder(matchers);
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#containsInAnyOrder(Collection)} to the given element matchers.
-  */
- public static <T> SerializableMatcher<Iterable<? extends T>> containsInAnyOrder(
-     Collection<SerializableMatcher<? super T>> serializableMatchers) {
-
-   // Every SerializableMatcher is a Matcher, so this covariant view of the elements is safe.
-   @SuppressWarnings({"rawtypes", "unchecked"})
-   final Collection<Matcher<? super T>> matchers = (Collection) serializableMatchers;
-
-   final SerializableSupplier<Matcher<Iterable<? extends T>>> supplier =
-       new SerializableSupplier<Matcher<Iterable<? extends T>>>() {
-         @Override
-         public Matcher<Iterable<? extends T>> get() {
-           return Matchers.containsInAnyOrder(matchers);
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#containsString}.
-  */
- public static SerializableMatcher<String> containsString(final String substring) {
-   final SerializableSupplier<Matcher<String>> supplier =
-       new SerializableSupplier<Matcher<String>>() {
-         @Override
-         public Matcher<String> get() {
-           return Matchers.containsString(substring);
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#empty()}.
-  */
- public static <T> SerializableMatcher<Collection<? extends T>> empty() {
-   final SerializableSupplier<Matcher<Collection<? extends T>>> supplier =
-       new SerializableSupplier<Matcher<Collection<? extends T>>>() {
-         @Override
-         public Matcher<Collection<? extends T>> get() {
-           return Matchers.empty();
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#emptyArray()}.
-  */
- public static <T> SerializableMatcher<T[]> emptyArray() {
-   final SerializableSupplier<Matcher<T[]>> supplier =
-       new SerializableSupplier<Matcher<T[]>>() {
-         @Override
-         public Matcher<T[]> get() {
-           return Matchers.emptyArray();
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#emptyIterable()}.
-  */
- public static <T> SerializableMatcher<Iterable<? extends T>> emptyIterable() {
-   final SerializableSupplier<Matcher<Iterable<? extends T>>> supplier =
-       new SerializableSupplier<Matcher<Iterable<? extends T>>>() {
-         @Override
-         public Matcher<Iterable<? extends T>> get() {
-           return Matchers.emptyIterable();
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#endsWith}.
-  */
- public static SerializableMatcher<String> endsWith(final String substring) {
-   final SerializableSupplier<Matcher<String>> supplier =
-       new SerializableSupplier<Matcher<String>>() {
-         @Override
-         public Matcher<String> get() {
-           return Matchers.endsWith(substring);
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#equalTo(Object)}.
-  */
- public static <T extends Serializable> SerializableMatcher<T> equalTo(final T expected) {
-   final SerializableSupplier<Matcher<T>> supplier = new SerializableSupplier<Matcher<T>>() {
-     @Override
-     public Matcher<T> get() {
-       return Matchers.equalTo(expected);
-     }
-   };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#equalTo(Object)}.
-  *
-  * <p>The expected value of type {@code T} is serialized with the supplied {@link Coder}; it is
-  * explicitly <i>not</i> required or expected to support Java serialization.
-  */
- public static <T> SerializableMatcher<T> equalTo(Coder<T> coder, T expected) {
-   final SerializableSupplier<T> expectedSupplier = new SerializableViaCoder<>(coder, expected);
-   final SerializableSupplier<Matcher<T>> supplier = new SerializableSupplier<Matcher<T>>() {
-     @Override
-     public Matcher<T> get() {
-       return Matchers.equalTo(expectedSupplier.get());
-     }
-   };
-   return fromSupplier(supplier);
- }
-
- /**
- * A {@link SerializableMatcher} with identical criteria to {@link Matchers#greaterThan()}.
- */
- public static <T extends Comparable<T> & Serializable> SerializableMatcher<T>
- greaterThan(final T target) {
- return fromSupplier(new SerializableSupplier<Matcher<T>>() {
- @Override
- public Matcher<T> get() {
- return Matchers.greaterThan(target);
- }
- });
- }
-
- /**
-  * A {@link SerializableMatcher} with identical criteria to
-  * {@link Matchers#greaterThan(Comparable)}.
-  *
-  * <p>The target value of type {@code T} will be serialized using the provided {@link Coder}.
-  * It is explicitly <i>not</i> required or expected to be serializable via Java serialization,
-  * which is why {@code T} is bounded by {@link Comparable} only, as in
-  * {@code lessThan(Coder, T)}.
-  *
-  * @param coder the coder used to carry {@code target} in serialized form
-  * @param target the value that matched values must exceed
-  */
- public static <T extends Comparable<T>> SerializableMatcher<T>
-     greaterThan(Coder<T> coder, T target) {
-   final SerializableSupplier<T> targetSupplier = new SerializableViaCoder<>(coder, target);
-   return fromSupplier(new SerializableSupplier<Matcher<T>>() {
-     @Override
-     public Matcher<T> get() {
-       return Matchers.greaterThan(targetSupplier.get());
-     }
-   });
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#greaterThanOrEqualTo(Comparable)}.
-  */
- // NOTE(review): unlike greaterThan(T), lessThan(T) and lessThanOrEqualTo(T), T is not bounded
- // by Serializable here although target is captured by the supplier - confirm this is intended.
- public static <T extends Comparable<T>> SerializableMatcher<T> greaterThanOrEqualTo(
-     final T target) {
-   final SerializableSupplier<Matcher<T>> supplier = new SerializableSupplier<Matcher<T>>() {
-     @Override
-     public Matcher<T> get() {
-       return Matchers.greaterThanOrEqualTo(target);
-     }
-   };
-   return fromSupplier(supplier);
- }
-
- /**
-  * A {@link SerializableMatcher} with identical criteria to
-  * {@link Matchers#greaterThanOrEqualTo(Comparable)}.
-  *
-  * <p>The target value of type {@code T} will be serialized using the provided {@link Coder}.
-  * It is explicitly <i>not</i> required or expected to be serializable via Java serialization,
-  * which is why {@code T} is bounded by {@link Comparable} only, as in
-  * {@code lessThanOrEqualTo(Coder, T)}.
-  *
-  * @param coder the coder used to carry {@code target} in serialized form
-  * @param target the value that matched values must equal or exceed
-  */
- public static <T extends Comparable<T>> SerializableMatcher<T>
-     greaterThanOrEqualTo(Coder<T> coder, T target) {
-   final SerializableSupplier<T> targetSupplier = new SerializableViaCoder<>(coder, target);
-   return fromSupplier(new SerializableSupplier<Matcher<T>>() {
-     @Override
-     public Matcher<T> get() {
-       return Matchers.greaterThanOrEqualTo(targetSupplier.get());
-     }
-   });
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#hasItem(Object)}.
-  */
- public static <T extends Serializable> SerializableMatcher<Iterable<? super T>> hasItem(
-     final T target) {
-   final SerializableSupplier<Matcher<Iterable<? super T>>> supplier =
-       new SerializableSupplier<Matcher<Iterable<? super T>>>() {
-         @Override
-         public Matcher<Iterable<? super T>> get() {
-           return Matchers.hasItem(target);
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#hasItem(Object)}.
-  *
-  * <p>The item of type {@code T} is serialized with the supplied {@link Coder}; it is
-  * explicitly <i>not</i> required or expected to support Java serialization.
-  */
- public static <T> SerializableMatcher<Iterable<? super T>> hasItem(Coder<T> coder, T target) {
-   final SerializableSupplier<T> targetSupplier = new SerializableViaCoder<>(coder, target);
-   final SerializableSupplier<Matcher<Iterable<? super T>>> supplier =
-       new SerializableSupplier<Matcher<Iterable<? super T>>>() {
-         @Override
-         public Matcher<Iterable<? super T>> get() {
-           return Matchers.hasItem(targetSupplier.get());
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#hasItem(Matcher)}.
-  */
- public static <T> SerializableMatcher<Iterable<? super T>> hasItem(
-     final SerializableMatcher<? super T> matcher) {
-   final SerializableSupplier<Matcher<Iterable<? super T>>> supplier =
-       new SerializableSupplier<Matcher<Iterable<? super T>>>() {
-         @Override
-         public Matcher<Iterable<? super T>> get() {
-           return Matchers.hasItem(matcher);
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#hasSize(int)}.
-  */
- public static <T> SerializableMatcher<Collection<? extends T>> hasSize(final int size) {
-   final SerializableSupplier<Matcher<Collection<? extends T>>> supplier =
-       new SerializableSupplier<Matcher<Collection<? extends T>>>() {
-         @Override
-         public Matcher<Collection<? extends T>> get() {
-           return Matchers.hasSize(size);
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#hasSize(Matcher)}.
-  */
- public static <T> SerializableMatcher<Collection<? extends T>> hasSize(
-     final SerializableMatcher<? super Integer> sizeMatcher) {
-   final SerializableSupplier<Matcher<Collection<? extends T>>> supplier =
-       new SerializableSupplier<Matcher<Collection<? extends T>>>() {
-         @Override
-         public Matcher<Collection<? extends T>> get() {
-           return Matchers.hasSize(sizeMatcher);
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#iterableWithSize(int)}.
-  */
- public static <T> SerializableMatcher<Iterable<T>> iterableWithSize(final int size) {
-   final SerializableSupplier<Matcher<Iterable<T>>> supplier =
-       new SerializableSupplier<Matcher<Iterable<T>>>() {
-         @Override
-         public Matcher<Iterable<T>> get() {
-           return Matchers.iterableWithSize(size);
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#iterableWithSize(Matcher)}.
-  */
- public static <T> SerializableMatcher<Iterable<T>> iterableWithSize(
-     final SerializableMatcher<? super Integer> sizeMatcher) {
-   final SerializableSupplier<Matcher<Iterable<T>>> supplier =
-       new SerializableSupplier<Matcher<Iterable<T>>>() {
-         @Override
-         public Matcher<Iterable<T>> get() {
-           return Matchers.iterableWithSize(sizeMatcher);
-         }
-       };
-   return fromSupplier(supplier);
- }
-
- /**
- * A {@link SerializableMatcher} with identical criteria to {@link Matchers#isIn(Collection)}.
- */
- public static <T extends Serializable> SerializableMatcher<T>
- isIn(final Collection<T> collection) {
- return fromSupplier(new SerializableSupplier<Matcher<T>>() {
- @Override
- public Matcher<T> get() {
- return Matchers.isIn(collection);
- }
- });
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#isIn(Collection)}.
-  *
-  * <p>The collection is copied to an array whose items of type {@code T} are serialized with
-  * the supplied {@link Coder}; they are explicitly <i>not</i> required or expected to support
-  * Java serialization.
-  */
- public static <T> SerializableMatcher<T> isIn(Coder<T> coder, Collection<T> collection) {
-   @SuppressWarnings("unchecked")
-   T[] items = (T[]) collection.toArray();
-   final SerializableSupplier<T[]> itemsSupplier = new SerializableArrayViaCoder<>(coder, items);
-   final SerializableSupplier<Matcher<T>> supplier = new SerializableSupplier<Matcher<T>>() {
-     @Override
-     public Matcher<T> get() {
-       return Matchers.isIn(itemsSupplier.get());
-     }
-   };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#isIn(Object[])}.
-  */
- public static <T extends Serializable> SerializableMatcher<T> isIn(final T[] items) {
-   final SerializableSupplier<Matcher<T>> supplier = new SerializableSupplier<Matcher<T>>() {
-     @Override
-     public Matcher<T> get() {
-       return Matchers.isIn(items);
-     }
-   };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#isIn(Object[])}.
-  *
-  * <p>The items of type {@code T} are serialized with the supplied {@link Coder}; they are
-  * explicitly <i>not</i> required or expected to support Java serialization.
-  */
- public static <T> SerializableMatcher<T> isIn(Coder<T> coder, T[] items) {
-   final SerializableSupplier<T[]> itemsSupplier = new SerializableArrayViaCoder<>(coder, items);
-   final SerializableSupplier<Matcher<T>> supplier = new SerializableSupplier<Matcher<T>>() {
-     @Override
-     public Matcher<T> get() {
-       return Matchers.isIn(itemsSupplier.get());
-     }
-   };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#isOneOf}.
-  */
- @SafeVarargs
- public static <T extends Serializable> SerializableMatcher<T> isOneOf(final T... elems) {
-   final SerializableSupplier<Matcher<T>> supplier = new SerializableSupplier<Matcher<T>>() {
-     @Override
-     public Matcher<T> get() {
-       return Matchers.isOneOf(elems);
-     }
-   };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#isOneOf}.
-  *
-  * <p>The items of type {@code T} are serialized with the supplied {@link Coder}; they are
-  * explicitly <i>not</i> required or expected to support Java serialization.
-  */
- @SafeVarargs
- public static <T> SerializableMatcher<T> isOneOf(Coder<T> coder, T... items) {
-   final SerializableSupplier<T[]> itemsSupplier = new SerializableArrayViaCoder<>(coder, items);
-   final SerializableSupplier<Matcher<T>> supplier = new SerializableSupplier<Matcher<T>>() {
-     @Override
-     public Matcher<T> get() {
-       return Matchers.isOneOf(itemsSupplier.get());
-     }
-   };
-   return fromSupplier(supplier);
- }
-
- /**
- * A {@link SerializableMatcher} that matches any {@link KV} with the specified key.
- */
- public static <K extends Serializable, V> SerializableMatcher<KV<? extends K, ? extends V>>
- kvWithKey(K key) {
- return new KvKeyMatcher<K, V>(equalTo(key));
- }
-
- /**
- * A {@link SerializableMatcher} that matches any {@link KV} with the specified key.
- *
- * <p>The key of type {@code K} will be serialized using the provided {@link Coder}.
- * It is explicitly <i>not</i> required or expected to be serializable via Java serialization.
- */
- public static <K, V> SerializableMatcher<KV<? extends K, ? extends V>>
- kvWithKey(Coder<K> coder, K key) {
- return new KvKeyMatcher<K, V>(equalTo(coder, key));
- }
-
- /**
-  * Returns a {@link SerializableMatcher} accepting any {@link KV} whose key satisfies
-  * {@code keyMatcher}.
-  */
- public static <K, V> SerializableMatcher<KV<? extends K, ? extends V>> kvWithKey(
-     final SerializableMatcher<? super K> keyMatcher) {
-   final KvKeyMatcher<K, V> kvMatcher = new KvKeyMatcher<K, V>(keyMatcher);
-   return kvMatcher;
- }
-
- /**
- * A {@link SerializableMatcher} that matches any {@link KV} with the specified value.
- */
- public static <K, V extends Serializable> SerializableMatcher<KV<? extends K, ? extends V>>
- kvWithValue(V value) {
- return new KvValueMatcher<K, V>(equalTo(value));
- }
-
- /**
- * A {@link SerializableMatcher} that matches any {@link KV} with the specified value.
- *
- * <p>The value of type {@code V} will be serialized using the provided {@link Coder}.
- * It is explicitly <i>not</i> required or expected to be serializable via Java serialization.
- */
- public static <K, V> SerializableMatcher<KV<? extends K, ? extends V>>
- kvWithValue(Coder<V> coder, V value) {
- return new KvValueMatcher<K, V>(equalTo(coder, value));
- }
-
- /**
-  * Returns a {@link SerializableMatcher} accepting any {@link KV} whose value satisfies
-  * {@code valueMatcher}.
-  */
- public static <K, V> SerializableMatcher<KV<? extends K, ? extends V>>
-     kvWithValue(final SerializableMatcher<? super V> valueMatcher) {
-   // The diamond is kept in the return statement so type inference is exactly as before.
-   return new KvValueMatcher<>(valueMatcher);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} accepting any {@link KV} whose key satisfies
-  * {@code keyMatcher} and whose value satisfies {@code valueMatcher}.
-  */
- public static <K, V> SerializableMatcher<KV<? extends K, ? extends V>> kv(
-     final SerializableMatcher<? super K> keyMatcher,
-     final SerializableMatcher<? super V> valueMatcher) {
-   final SerializableMatcher<KV<? extends K, ? extends V>> keyPart =
-       SerializableMatchers.<K, V>kvWithKey(keyMatcher);
-   final SerializableMatcher<KV<? extends K, ? extends V>> valuePart =
-       SerializableMatchers.<K, V>kvWithValue(valueMatcher);
-   return SerializableMatchers.<KV<? extends K, ? extends V>>allOf(keyPart, valuePart);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#lessThan(Comparable)}.
-  */
- public static <T extends Comparable<T> & Serializable> SerializableMatcher<T> lessThan(
-     final T target) {
-   final SerializableSupplier<Matcher<T>> supplier = new SerializableSupplier<Matcher<T>>() {
-     @Override
-     public Matcher<T> get() {
-       return Matchers.lessThan(target);
-     }
-   };
-   return fromSupplier(supplier);
- }
-
- /**
- * A {@link SerializableMatcher} with identical criteria to {@link Matchers#lessThan()}.
- *
- * <p>The target value of type {@code T} will be serialized using the provided {@link Coder}.
- * It is explicitly <i>not</i> required or expected to be serializable via Java serialization.
- */
- public static <T extends Comparable<T>> SerializableMatcher<T>
- lessThan(Coder<T> coder, T target) {
- final SerializableSupplier<T> targetSupplier = new SerializableViaCoder<>(coder, target);
- return fromSupplier(new SerializableSupplier<Matcher<T>>() {
- @Override
- public Matcher<T> get() {
- return Matchers.lessThan(targetSupplier.get());
- }
- });
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#lessThanOrEqualTo(Comparable)}.
-  */
- public static <T extends Comparable<T> & Serializable> SerializableMatcher<T> lessThanOrEqualTo(
-     final T target) {
-   final SerializableSupplier<Matcher<T>> supplier = new SerializableSupplier<Matcher<T>>() {
-     @Override
-     public Matcher<T> get() {
-       return Matchers.lessThanOrEqualTo(target);
-     }
-   };
-   return fromSupplier(supplier);
- }
-
- /**
-  * Returns a {@link SerializableMatcher} that applies the same criteria as
-  * {@link Matchers#lessThanOrEqualTo(Comparable)}.
-  *
-  * <p>The target value of type {@code T} is serialized with the supplied {@link Coder}; it is
-  * explicitly <i>not</i> required or expected to support Java serialization.
-  */
- public static <T extends Comparable<T>> SerializableMatcher<T> lessThanOrEqualTo(
-     Coder<T> coder, T target) {
-   final SerializableSupplier<T> targetSupplier = new SerializableViaCoder<>(coder, target);
-   final SerializableSupplier<Matcher<T>> supplier = new SerializableSupplier<Matcher<T>>() {
-     @Override
-     public Matcher<T> get() {
-       return Matchers.lessThanOrEqualTo(targetSupplier.get());
-     }
-   };
-   return fromSupplier(supplier);
- }
-
- /**
- * A {@link SerializableMatcher} with identical criteria to
- * {@link Matchers#not}.
- */
- public static <T> SerializableMatcher<T> not(final SerializableMatcher<T> matcher) {
- return fromSupplier(new SerializableSupplier<Matcher<T>>() {
- @Override
- public Matcher<T> get() {
- return Matchers.not(matcher);
- }
- });
- }
-
- /**
- * A {@link SerializableMatcher} with identical criteria to
- * {@link Matchers#nullValue}.
- */
- public static SerializableMatcher<Object> nullValue() {
- return fromSupplier(new SerializableSupplier<Matcher<Object>>() {
- @Override
- public Matcher<Object> get() {
- return Matchers.nullValue();
- }
- });
- }
-
- /**
- * A {@link SerializableMatcher} with identical criteria to {@link Matchers#startsWith}.
- */
- public static SerializableMatcher<String> startsWith(final String substring) {
- return fromSupplier(new SerializableSupplier<Matcher<String>>() {
- @Override
- public Matcher<String> get() {
- return Matchers.startsWith(substring);
- }
- });
- }
-
- private static class KvKeyMatcher<K, V>
- extends BaseMatcher<KV<? extends K, ? extends V>>
- implements SerializableMatcher<KV<? extends K, ? extends V>> {
- private final SerializableMatcher<? super K> keyMatcher;
-
- public KvKeyMatcher(SerializableMatcher<? super K> keyMatcher) {
- this.keyMatcher = keyMatcher;
- }
-
- @Override
- public boolean matches(Object item) {
- @SuppressWarnings("unchecked")
- KV<K, ?> kvItem = (KV<K, ?>) item;
- return keyMatcher.matches(kvItem.getKey());
- }
-
- @Override
- public void describeMismatch(Object item, Description mismatchDescription) {
- @SuppressWarnings("unchecked")
- KV<K, ?> kvItem = (KV<K, ?>) item;
- if (!keyMatcher.matches(kvItem.getKey())) {
- mismatchDescription.appendText("key did not match: ");
- keyMatcher.describeMismatch(kvItem.getKey(), mismatchDescription);
- }
- }
-
- @Override
- public void describeTo(Description description) {
- description.appendText("KV with key matching ");
- keyMatcher.describeTo(description);
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(this)
- .addValue(keyMatcher)
- .toString();
- }
- }
-
- private static class KvValueMatcher<K, V>
- extends BaseMatcher<KV<? extends K, ? extends V>>
- implements SerializableMatcher<KV<? extends K, ? extends V>> {
- private final SerializableMatcher<? super V> valueMatcher;
-
- public KvValueMatcher(SerializableMatcher<? super V> valueMatcher) {
- this.valueMatcher = valueMatcher;
- }
-
- @Override
- public boolean matches(Object item) {
- @SuppressWarnings("unchecked")
- KV<?, V> kvItem = (KV<?, V>) item;
- return valueMatcher.matches(kvItem.getValue());
- }
-
- @Override
- public void describeMismatch(Object item, Description mismatchDescription) {
- @SuppressWarnings("unchecked")
- KV<?, V> kvItem = (KV<?, V>) item;
- if (!valueMatcher.matches(kvItem.getValue())) {
- mismatchDescription.appendText("value did not match: ");
- valueMatcher.describeMismatch(kvItem.getValue(), mismatchDescription);
- }
- }
-
- @Override
- public void describeTo(Description description) {
- description.appendText("KV with value matching ");
- valueMatcher.describeTo(description);
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(this)
- .addValue(valueMatcher)
- .toString();
- }
- }
-
- /**
- * Constructs a {@link SerializableMatcher} from a non-serializable {@link Matcher} via
- * indirection through {@link SerializableSupplier}.
- *
- * <p>To wrap a {@link Matcher} which is not serializable, provide a {@link SerializableSupplier}
- * with a {@link SerializableSupplier#get()} method that returns a fresh instance of the
- * {@link Matcher} desired. The resulting {@link SerializableMatcher} will behave according to
- * the {@link Matcher} returned by {@link SerializableSupplier#get() get()} when it is invoked
- * during matching (which may occur on another machine, such as a Dataflow worker).
- *
- * <code>
- * return fromSupplier(new SerializableSupplier<Matcher<T>>() {
- * * @Override
- * public Matcher<T> get() {
- * return new MyMatcherForT();
- * }
- * });
- * </code>
- */
- public static <T> SerializableMatcher<T> fromSupplier(
- SerializableSupplier<Matcher<T>> supplier) {
- return new SerializableMatcherFromSupplier<>(supplier);
- }
-
- /**
- * Supplies values of type {@code T}, and is serializable. Thus, even if {@code T} is not
- * serializable, the supplier can be serialized and provide a {@code T} wherever it is
- * deserialized.
- *
- * @param <T> the type of value supplied.
- */
- public interface SerializableSupplier<T> extends Serializable {
- T get();
- }
-
- /**
- * Since the delegate {@link Matcher} is not generally serializable, instead this takes a nullary
- * SerializableFunction to return such a matcher.
- */
- private static class SerializableMatcherFromSupplier<T> extends BaseMatcher<T>
- implements SerializableMatcher<T> {
-
- private SerializableSupplier<Matcher<T>> supplier;
-
- public SerializableMatcherFromSupplier(SerializableSupplier<Matcher<T>> supplier) {
- this.supplier = supplier;
- }
-
- @Override
- public void describeTo(Description description) {
- supplier.get().describeTo(description);
- }
-
- @Override
- public boolean matches(Object item) {
- return supplier.get().matches(item);
- }
-
- @Override
- public void describeMismatch(Object item, Description mismatchDescription) {
- supplier.get().describeMismatch(item, mismatchDescription);
- }
- }
-
- /**
- * Wraps any value that can be encoded via a {@link Coder} to make it {@link Serializable}.
- * This is not likely to be a good encoding, so should be used only for tests, where data
- * volume is small and minor costs are not critical.
- */
- private static class SerializableViaCoder<T> implements SerializableSupplier<T> {
- /** Cached value that is not serialized. */
- @Nullable
- private transient T value;
-
- /** The bytes of {@link #value} when encoded via {@link #coder}. */
- private byte[] encodedValue;
-
- private Coder<T> coder;
-
- public SerializableViaCoder(Coder<T> coder, T value) {
- this.coder = coder;
- this.value = value;
- try {
- this.encodedValue = CoderUtils.encodeToByteArray(coder, value);
- } catch (CoderException exc) {
- throw new RuntimeException("Error serializing via Coder", exc);
- }
- }
-
- @Override
- public T get() {
- if (value == null) {
- try {
- value = CoderUtils.decodeFromByteArray(coder, encodedValue);
- } catch (CoderException exc) {
- throw new RuntimeException("Error deserializing via Coder", exc);
- }
- }
- return value;
- }
- }
-
- /**
- * Wraps any array with values that can be encoded via a {@link Coder} to make it
- * {@link Serializable}. This is not likely to be a good encoding, so should be used only for
- * tests, where data volume is small and minor costs are not critical.
- */
- private static class SerializableArrayViaCoder<T> implements SerializableSupplier<T[]> {
- /** Cached value that is not serialized. */
- @Nullable
- private transient T[] value;
-
- /** The bytes of {@link #value} when encoded via {@link #coder}. */
- private byte[] encodedValue;
-
- private Coder<List<T>> coder;
-
- public SerializableArrayViaCoder(Coder<T> elementCoder, T[] value) {
- this.coder = ListCoder.of(elementCoder);
- this.value = value;
- try {
- this.encodedValue = CoderUtils.encodeToByteArray(coder, Arrays.asList(value));
- } catch (CoderException exc) {
- throw UserCodeException.wrap(exc);
- }
- }
-
- @Override
- public T[] get() {
- if (value == null) {
- try {
- @SuppressWarnings("unchecked")
- T[] decoded = (T[]) CoderUtils.decodeFromByteArray(coder, encodedValue).toArray();
- value = decoded;
- } catch (CoderException exc) {
- throw new RuntimeException("Error deserializing via Coder", exc);
- }
- }
- return value;
- }
- }
-}
[44/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BoundedSource.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BoundedSource.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BoundedSource.java
deleted file mode 100644
index be3a415..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BoundedSource.java
+++ /dev/null
@@ -1,277 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-
-import org.joda.time.Instant;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.NoSuchElementException;
-
-/**
- * A {@link Source} that reads a finite amount of input and, because of that, supports
- * some additional operations.
- *
- * <p>The operations are:
- * <ul>
- * <li>Splitting into bundles of given size: {@link #splitIntoBundles};
- * <li>Size estimation: {@link #getEstimatedSizeBytes};
- * <li>Telling whether or not this source produces key/value pairs in sorted order:
- * {@link #producesSortedKeys};
- * <li>The reader ({@link BoundedReader}) supports progress estimation
- * ({@link BoundedReader#getFractionConsumed}) and dynamic splitting
- * ({@link BoundedReader#splitAtFraction}).
- * </ul>
- *
- * <p>To use this class for supporting your custom input type, derive your class
- * class from it, and override the abstract methods. For an example, see {@link DatastoreIO}.
- *
- * @param <T> Type of records read by the source.
- */
-public abstract class BoundedSource<T> extends Source<T> {
- /**
- * Splits the source into bundles of approximately {@code desiredBundleSizeBytes}.
- */
- public abstract List<? extends BoundedSource<T>> splitIntoBundles(
- long desiredBundleSizeBytes, PipelineOptions options) throws Exception;
-
- /**
- * An estimate of the total size (in bytes) of the data that would be read from this source.
- * This estimate is in terms of external storage size, before any decompression or other
- * processing done by the reader.
- */
- public abstract long getEstimatedSizeBytes(PipelineOptions options) throws Exception;
-
- /**
- * Whether this source is known to produce key/value pairs sorted by lexicographic order on
- * the bytes of the encoded key.
- */
- public abstract boolean producesSortedKeys(PipelineOptions options) throws Exception;
-
- /**
- * Returns a new {@link BoundedReader} that reads from this source.
- */
- public abstract BoundedReader<T> createReader(PipelineOptions options) throws IOException;
-
- /**
- * A {@code Reader} that reads a bounded amount of input and supports some additional
- * operations, such as progress estimation and dynamic work rebalancing.
- *
- * <h3>Boundedness</h3>
- * <p>Once {@link #start} or {@link #advance} has returned false, neither will be called
- * again on this object.
- *
- * <h3>Thread safety</h3>
- * All methods will be run from the same thread except {@link #splitAtFraction},
- * {@link #getFractionConsumed} and {@link #getCurrentSource}, which can be called concurrently
- * from a different thread. There will not be multiple concurrent calls to
- * {@link #splitAtFraction} but there can be for {@link #getFractionConsumed} if
- * {@link #splitAtFraction} is implemented.
- *
- * <p>If the source does not implement {@link #splitAtFraction}, you do not need to worry about
- * thread safety. If implemented, it must be safe to call {@link #splitAtFraction} and
- * {@link #getFractionConsumed} concurrently with other methods.
- *
- * <p>Additionally, a successful {@link #splitAtFraction} call must, by definition, cause
- * {@link #getCurrentSource} to start returning a different value.
- * Callers of {@link #getCurrentSource} need to be aware of the possibility that the returned
- * value can change at any time, and must only access the properties of the source returned by
- * {@link #getCurrentSource} which do not change between {@link #splitAtFraction} calls.
- *
- * <h3>Implementing {@link #splitAtFraction}</h3>
- * In the course of dynamic work rebalancing, the method {@link #splitAtFraction}
- * may be called concurrently with {@link #advance} or {@link #start}. It is critical that
- * their interaction is implemented in a thread-safe way, otherwise data loss is possible.
- *
- * <p>Sources which support dynamic work rebalancing should use
- * {@link com.google.cloud.dataflow.sdk.io.range.RangeTracker} to manage the (source-specific)
- * range of positions that is being split. If your source supports dynamic work rebalancing,
- * please use that class to implement it if possible; if not possible, please contact the team
- * at <i>dataflow-feedback@google.com</i>.
- */
- @Experimental(Experimental.Kind.SOURCE_SINK)
- public abstract static class BoundedReader<T> extends Source.Reader<T> {
- /**
- * Returns a value in [0, 1] representing approximately what fraction of the
- * {@link #getCurrentSource current source} this reader has read so far, or {@code null} if such
- * an estimate is not available.
- *
- * <p>It is recommended that this method should satisfy the following properties:
- * <ul>
- * <li>Should return 0 before the {@link #start} call.
- * <li>Should return 1 after a {@link #start} or {@link #advance} call that returns false.
- * <li>The returned values should be non-decreasing (though they don't have to be unique).
- * </ul>
- *
- * <p>By default, returns null to indicate that this cannot be estimated.
- *
- * <h5>Thread safety</h5>
- * If {@link #splitAtFraction} is implemented, this method can be called concurrently to other
- * methods (including itself), and it is therefore critical for it to be implemented
- * in a thread-safe way.
- */
- public Double getFractionConsumed() {
- return null;
- }
-
- /**
- * Returns a {@code Source} describing the same input that this {@code Reader} currently reads
- * (including items already read).
- *
- * <h3>Usage</h3>
- * <p>Reader subclasses can use this method for convenience to access unchanging properties of
- * the source being read. Alternatively, they can cache these properties in the constructor.
- * <p>The framework will call this method in the course of dynamic work rebalancing, e.g. after
- * a successful {@link BoundedSource.BoundedReader#splitAtFraction} call.
- *
- * <h3>Mutability and thread safety</h3>
- * Remember that {@link Source} objects must always be immutable. However, the return value of
- * this function may be affected by dynamic work rebalancing, happening asynchronously via
- * {@link BoundedSource.BoundedReader#splitAtFraction}, meaning it can return a different
- * {@link Source} object. However, the returned object itself will still itself be immutable.
- * Callers must take care not to rely on properties of the returned source that may be
- * asynchronously changed as a result of this process (e.g. do not cache an end offset when
- * reading a file).
- *
- * <h3>Implementation</h3>
- * For convenience, subclasses should usually return the most concrete subclass of
- * {@link Source} possible.
- * In practice, the implementation of this method should nearly always be one of the following:
- * <ul>
- * <li>Source that inherits from a base class that already implements
- * {@link #getCurrentSource}: delegate to base class. In this case, it is almost always
- * an error for the subclass to maintain its own copy of the source.
- * <pre>{@code
- * public FooReader(FooSource<T> source) {
- * super(source);
- * }
- *
- * public FooSource<T> getCurrentSource() {
- * return (FooSource<T>)super.getCurrentSource();
- * }
- * }</pre>
- * <li>Source that does not support dynamic work rebalancing: return a private final variable.
- * <pre>{@code
- * private final FooSource<T> source;
- *
- * public FooReader(FooSource<T> source) {
- * this.source = source;
- * }
- *
- * public FooSource<T> getCurrentSource() {
- * return source;
- * }
- * }</pre>
- * <li>{@link BoundedSource.BoundedReader} that explicitly supports dynamic work rebalancing:
- * maintain a variable pointing to an immutable source object, and protect it with
- * synchronization.
- * <pre>{@code
- * private FooSource<T> source;
- *
- * public FooReader(FooSource<T> source) {
- * this.source = source;
- * }
- *
- * public synchronized FooSource<T> getCurrentSource() {
- * return source;
- * }
- *
- * public synchronized FooSource<T> splitAtFraction(double fraction) {
- * ...
- * FooSource<T> primary = ...;
- * FooSource<T> residual = ...;
- * this.source = primary;
- * return residual;
- * }
- * }</pre>
- * </ul>
- */
- @Override
- public abstract BoundedSource<T> getCurrentSource();
-
- /**
- * Tells the reader to narrow the range of the input it's going to read and give up
- * the remainder, so that the new range would contain approximately the given
- * fraction of the amount of data in the current range.
- *
- * <p>Returns a {@code BoundedSource} representing the remainder.
- *
- * <h5>Detailed description</h5>
- * Assuming the following sequence of calls:
- * <pre>{@code
- * BoundedSource<T> initial = reader.getCurrentSource();
- * BoundedSource<T> residual = reader.splitAtFraction(fraction);
- * BoundedSource<T> primary = reader.getCurrentSource();
- * }</pre>
- * <ul>
- * <li> The "primary" and "residual" sources, when read, should together cover the same
- * set of records as "initial".
- * <li> The current reader should continue to be in a valid state, and continuing to read
- * from it should, together with the records it already read, yield the same records
- * as would have been read by "primary".
- * <li> The amount of data read by "primary" should ideally represent approximately
- * the given fraction of the amount of data read by "initial".
- * </ul>
- * For example, a reader that reads a range of offsets <i>[A, B)</i> in a file might implement
- * this method by truncating the current range to <i>[A, A + fraction*(B-A))</i> and returning
- * a Source representing the range <i>[A + fraction*(B-A), B)</i>.
- *
- * <p>This method should return {@code null} if the split cannot be performed for this fraction
- * while satisfying the semantics above. E.g., a reader that reads a range of offsets
- * in a file should return {@code null} if it is already past the position in its range
- * corresponding to the given fraction. In this case, the method MUST have no effect
- * (the reader must behave as if the method hadn't been called at all).
- *
- * <h5>Statefulness</h5>
- * Since this method (if successful) affects the reader's source, in subsequent invocations
- * "fraction" should be interpreted relative to the new current source.
- *
- * <h5>Thread safety and blocking</h5>
- * This method will be called concurrently to other methods (however there will not be multiple
- * concurrent invocations of this method itself), and it is critical for it to be implemented
- * in a thread-safe way (otherwise data loss is possible).
- *
- * <p>It is also very important that this method always completes quickly. In particular,
- * it should not perform or wait on any blocking operations such as I/O, RPCs etc. Violating
- * this requirement may stall completion of the work item or even cause it to fail.
- *
- * <p>It is incorrect to make both this method and {@link #start}/{@link #advance}
- * {@code synchronized}, because those methods can perform blocking operations, and then
- * this method would have to wait for those calls to complete.
- *
- * <p>{@link com.google.cloud.dataflow.sdk.io.range.RangeTracker} makes it easy to implement
- * this method safely and correctly.
- *
- * <p>By default, returns null to indicate that splitting is not possible.
- */
- public BoundedSource<T> splitAtFraction(double fraction) {
- return null;
- }
-
- /**
- * By default, returns the minimum possible timestamp.
- */
- @Override
- public Instant getCurrentTimestamp() throws NoSuchElementException {
- return BoundedWindow.TIMESTAMP_MIN_VALUE;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/CompressedSource.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/CompressedSource.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/CompressedSource.java
deleted file mode 100644
index e3dca91..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/CompressedSource.java
+++ /dev/null
@@ -1,413 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.common.base.Preconditions;
-import com.google.common.io.ByteStreams;
-import com.google.common.primitives.Ints;
-
-import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
-import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
-
-import java.io.IOException;
-import java.io.PushbackInputStream;
-import java.io.Serializable;
-import java.nio.channels.Channels;
-import java.nio.channels.ReadableByteChannel;
-import java.util.NoSuchElementException;
-import java.util.zip.GZIPInputStream;
-
-/**
- * A Source that reads from compressed files. A {@code CompressedSources} wraps a delegate
- * {@link FileBasedSource} that is able to read the decompressed file format.
- *
- * <p>For example, use the following to read from a gzip-compressed XML file:
- *
- * <pre> {@code
- * XmlSource mySource = XmlSource.from(...);
- * PCollection<T> collection = p.apply(Read.from(CompressedSource
- * .from(mySource)
- * .withDecompression(CompressedSource.CompressionMode.GZIP)));
- * } </pre>
- *
- * <p>Supported compression algorithms are {@link CompressionMode#GZIP} and
- * {@link CompressionMode#BZIP2}. User-defined compression types are supported by implementing
- * {@link DecompressingChannelFactory}.
- *
- * <p>By default, the compression algorithm is selected from those supported in
- * {@link CompressionMode} based on the file name provided to the source, namely
- * {@code ".bz2"} indicates {@link CompressionMode#BZIP2} and {@code ".gz"} indicates
- * {@link CompressionMode#GZIP}. If the file name does not match any of the supported
- * algorithms, it is assumed to be uncompressed data.
- *
- * @param <T> The type to read from the compressed file.
- */
-@Experimental(Experimental.Kind.SOURCE_SINK)
-public class CompressedSource<T> extends FileBasedSource<T> {
- /**
- * Factory interface for creating channels that decompress the content of an underlying channel.
- */
- public static interface DecompressingChannelFactory extends Serializable {
- /**
- * Given a channel, create a channel that decompresses the content read from the channel.
- * @throws IOException
- */
- public ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel)
- throws IOException;
- }
-
- /**
- * Factory interface for creating channels that decompress the content of an underlying channel,
- * based on both the channel and the file name.
- */
- private static interface FileNameBasedDecompressingChannelFactory
- extends DecompressingChannelFactory {
- /**
- * Given a channel, create a channel that decompresses the content read from the channel.
- * @throws IOException
- */
- ReadableByteChannel createDecompressingChannel(String fileName, ReadableByteChannel channel)
- throws IOException;
-
- /**
- * Given a file name, returns true if the file name matches any supported compression
- * scheme.
- */
- boolean isCompressed(String fileName);
- }
-
- /**
- * Default compression types supported by the {@code CompressedSource}.
- */
- public enum CompressionMode implements DecompressingChannelFactory {
- /**
- * Reads a byte channel assuming it is compressed with gzip.
- */
- GZIP {
- @Override
- public boolean matches(String fileName) {
- return fileName.toLowerCase().endsWith(".gz");
- }
-
- @Override
- public ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel)
- throws IOException {
- // Determine if the input stream is gzipped. The input stream returned from the
- // GCS connector may already be decompressed; GCS does this based on the
- // content-encoding property.
- PushbackInputStream stream = new PushbackInputStream(Channels.newInputStream(channel), 2);
- byte[] headerBytes = new byte[2];
- int bytesRead = ByteStreams.read(
- stream /* source */, headerBytes /* dest */, 0 /* offset */, 2 /* len */);
- stream.unread(headerBytes, 0, bytesRead);
- if (bytesRead >= 2) {
- byte zero = 0x00;
- int header = Ints.fromBytes(zero, zero, headerBytes[1], headerBytes[0]);
- if (header == GZIPInputStream.GZIP_MAGIC) {
- return Channels.newChannel(new GzipCompressorInputStream(stream));
- }
- }
- return Channels.newChannel(stream);
- }
- },
-
- /**
- * Reads a byte channel assuming it is compressed with bzip2.
- */
- BZIP2 {
- @Override
- public boolean matches(String fileName) {
- return fileName.toLowerCase().endsWith(".bz2");
- }
-
- @Override
- public ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel)
- throws IOException {
- return Channels.newChannel(
- new BZip2CompressorInputStream(Channels.newInputStream(channel)));
- }
- };
-
- /**
- * Returns {@code true} if the given file name implies that the contents are compressed
- * according to the compression embodied by this factory.
- */
- public abstract boolean matches(String fileName);
-
- @Override
- public abstract ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel)
- throws IOException;
- }
-
- /**
- * Reads a byte channel detecting compression according to the file name. If the filename
- * is not any other known {@link CompressionMode}, it is presumed to be uncompressed.
- */
- private static class DecompressAccordingToFilename
- implements FileNameBasedDecompressingChannelFactory {
-
- @Override
- public ReadableByteChannel createDecompressingChannel(
- String fileName, ReadableByteChannel channel) throws IOException {
- for (CompressionMode type : CompressionMode.values()) {
- if (type.matches(fileName)) {
- return type.createDecompressingChannel(channel);
- }
- }
- // Uncompressed
- return channel;
- }
-
- @Override
- public ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel) {
- throw new UnsupportedOperationException(
- String.format("%s does not support createDecompressingChannel(%s) but only"
- + " createDecompressingChannel(%s,%s)",
- getClass().getSimpleName(),
- String.class.getSimpleName(),
- ReadableByteChannel.class.getSimpleName(),
- ReadableByteChannel.class.getSimpleName()));
- }
-
- @Override
- public boolean isCompressed(String fileName) {
- for (CompressionMode type : CompressionMode.values()) {
- if (type.matches(fileName)) {
- return true;
- }
- }
- return false;
- }
- }
-
- private final FileBasedSource<T> sourceDelegate;
- private final DecompressingChannelFactory channelFactory;
-
- /**
- * Creates a {@link Read} transform that reads from that reads from the underlying
- * {@link FileBasedSource} {@code sourceDelegate} after decompressing it with a {@link
- * DecompressingChannelFactory}.
- */
- public static <T> Read.Bounded<T> readFromSource(
- FileBasedSource<T> sourceDelegate, DecompressingChannelFactory channelFactory) {
- return Read.from(new CompressedSource<>(sourceDelegate, channelFactory));
- }
-
- /**
- * Creates a {@code CompressedSource} from an underlying {@code FileBasedSource}. The type
- * of compression used will be based on the file name extension unless explicitly
- * configured via {@link CompressedSource#withDecompression}.
- */
- public static <T> CompressedSource<T> from(FileBasedSource<T> sourceDelegate) {
- return new CompressedSource<>(sourceDelegate, new DecompressAccordingToFilename());
- }
-
- /**
- * Return a {@code CompressedSource} that is like this one but will decompress its underlying file
- * with the given {@link DecompressingChannelFactory}.
- */
- public CompressedSource<T> withDecompression(DecompressingChannelFactory channelFactory) {
- return new CompressedSource<>(this.sourceDelegate, channelFactory);
- }
-
- /**
- * Creates a {@code CompressedSource} from a delegate file based source and a decompressing
- * channel factory.
- */
- private CompressedSource(
- FileBasedSource<T> sourceDelegate, DecompressingChannelFactory channelFactory) {
- super(sourceDelegate.getFileOrPatternSpec(), Long.MAX_VALUE);
- this.sourceDelegate = sourceDelegate;
- this.channelFactory = channelFactory;
- }
-
- /**
- * Creates a {@code CompressedSource} for an individual file. Used by {@link
- * CompressedSource#createForSubrangeOfFile}.
- */
- private CompressedSource(FileBasedSource<T> sourceDelegate,
- DecompressingChannelFactory channelFactory, String filePatternOrSpec, long minBundleSize,
- long startOffset, long endOffset) {
- super(filePatternOrSpec, minBundleSize, startOffset, endOffset);
- Preconditions.checkArgument(
- startOffset == 0,
- "CompressedSources must start reading at offset 0. Requested offset: " + startOffset);
- this.sourceDelegate = sourceDelegate;
- this.channelFactory = channelFactory;
- }
-
- /**
- * Validates that the delegate source is a valid source and that the channel factory is not null.
- */
- @Override
- public void validate() {
- super.validate();
- Preconditions.checkNotNull(sourceDelegate);
- sourceDelegate.validate();
- Preconditions.checkNotNull(channelFactory);
- }
-
- /**
- * Creates a {@code CompressedSource} for a subrange of a file. Called by superclass to create a
- * source for a single file.
- */
- @Override
- protected FileBasedSource<T> createForSubrangeOfFile(String fileName, long start, long end) {
- return new CompressedSource<>(sourceDelegate.createForSubrangeOfFile(fileName, start, end),
- channelFactory, fileName, Long.MAX_VALUE, start, end);
- }
-
- /**
- * Determines whether a single file represented by this source is splittable.
- *
- * <p>When the channel factory is a {@link FileNameBasedDecompressingChannelFactory}, returns
- * true only if that factory determines from the requested file name that the file is not
- * compressed. For any other channel factory this method returns true.
- *
- * <p>NOTE(review): the unconditional {@code true} for other channel factories looks surprising
- * for compressed input; confirm that this is intended.
- */
- @Override
- protected final boolean isSplittable() throws Exception {
- if (channelFactory instanceof FileNameBasedDecompressingChannelFactory) {
- FileNameBasedDecompressingChannelFactory fileNameBasedChannelFactory =
- (FileNameBasedDecompressingChannelFactory) channelFactory;
- return !fileNameBasedChannelFactory.isCompressed(getFileOrPatternSpec());
- }
- return true;
- }
-
- /**
- * Creates a {@code FileBasedReader} to read a single file.
- *
- * <p>The reader is always obtained from the delegate source. If the channel factory is a
- * {@link FileNameBasedDecompressingChannelFactory} and it reports that the file name does not
- * represent a compressed file, the delegate's reader is returned unwrapped, allowing for
- * splitting of the source. In every other case the delegate's reader is wrapped in a
- * {@link CompressedReader}.
- *
- * @param options the pipeline options passed through to the delegate source
- * @return the delegate's reader, either as-is or wrapped in a {@link CompressedReader}
- */
- @Override
- protected final FileBasedReader<T> createSingleFileReader(PipelineOptions options) {
- if (channelFactory instanceof FileNameBasedDecompressingChannelFactory) {
- FileNameBasedDecompressingChannelFactory fileNameBasedChannelFactory =
- (FileNameBasedDecompressingChannelFactory) channelFactory;
- if (!fileNameBasedChannelFactory.isCompressed(getFileOrPatternSpec())) {
- return sourceDelegate.createSingleFileReader(options);
- }
- }
- return new CompressedReader<T>(
- this, sourceDelegate.createSingleFileReader(options));
- }
-
- /**
- * Returns whether the delegate source produces sorted keys.
- *
- * @param options the pipeline options passed through to the delegate source
- * @return whatever the delegate source reports for the same options
- */
- @Override
- public final boolean producesSortedKeys(PipelineOptions options) throws Exception {
- return sourceDelegate.producesSortedKeys(options);
- }
-
- /**
- * Returns the delegate source's default output coder.
- *
- * @return the coder obtained from the delegate source; this class adds no coder of its own
- */
- @Override
- public final Coder<T> getDefaultOutputCoder() {
- return sourceDelegate.getDefaultOutputCoder();
- }
-
- /**
- * Returns the decompressing channel factory held by this source.
- */
- public final DecompressingChannelFactory getChannelFactory() {
- return channelFactory;
- }
-
- /**
- * Reader for a {@link CompressedSource}. Decompresses its input and uses a delegate
- * reader to read elements from the decompressed input.
- *
- * <p>The current record and the current offset both come from the delegate reader. This class
- * itself only counts the records read, so that it can report the first record as the single
- * split point.
- *
- * @param <T> The type of records read from the source.
- */
- public static class CompressedReader<T> extends FileBasedReader<T> {
-
- private final FileBasedReader<T> readerDelegate;
- private final CompressedSource<T> source;
- private int numRecordsRead; // incremented on each successful readNextRecord()
-
- /**
- * Create a {@code CompressedReader} from a {@code CompressedSource} and delegate reader.
- *
- * @param source the source passed to the superclass and used to obtain the channel factory
- * @param readerDelegate the reader that reads records from the decompressed input
- */
- public CompressedReader(CompressedSource<T> source, FileBasedReader<T> readerDelegate) {
- super(source);
- this.source = source;
- this.readerDelegate = readerDelegate;
- }
-
- /**
- * Gets the current record from the delegate reader.
- */
- @Override
- public T getCurrent() throws NoSuchElementException {
- return readerDelegate.getCurrent();
- }
-
- /**
- * Returns true only for the first record; compressed sources cannot be split.
- */
- @Override
- protected final boolean isAtSplitPoint() {
- // We have to return true for the first record, but not for the state before reading it,
- // and not for the state after reading any other record. Hence == rather than >= or <=.
- // This is required because FileBasedReader is intended for readers that can read a range
- // of offsets in a file and where the range can be split in parts. CompressedReader,
- // however, is a degenerate case because it cannot be split, but it has to satisfy the
- // semantics of offsets and split points anyway.
- return numRecordsRead == 1;
- }
-
- /**
- * Creates a decompressing channel from the input channel and passes it to its delegate reader's
- * {@link FileBasedReader#startReading(ReadableByteChannel)}.
- *
- * <p>A {@link FileNameBasedDecompressingChannelFactory} is additionally given the file spec of
- * the current source; any other channel factory is given the channel alone.
- *
- * @param channel the channel to wrap in a decompressing channel
- */
- @Override
- protected final void startReading(ReadableByteChannel channel) throws IOException {
- if (source.getChannelFactory() instanceof FileNameBasedDecompressingChannelFactory) {
- FileNameBasedDecompressingChannelFactory channelFactory =
- (FileNameBasedDecompressingChannelFactory) source.getChannelFactory();
- readerDelegate.startReading(channelFactory.createDecompressingChannel(
- getCurrentSource().getFileOrPatternSpec(),
- channel));
- } else {
- readerDelegate.startReading(source.getChannelFactory().createDecompressingChannel(
- channel));
- }
- }
-
- /**
- * Reads the next record via the delegate reader and, on success, counts it.
- *
- * @return false if the delegate reader has no further record, in which case the record count
- * is left unchanged; true otherwise
- */
- @Override
- protected final boolean readNextRecord() throws IOException {
- if (!readerDelegate.readNextRecord()) {
- return false;
- }
- ++numRecordsRead;
- return true;
- }
-
- /**
- * Returns the delegate reader's current offset in the decompressed input.
- */
- @Override
- protected final long getCurrentOffset() {
- return readerDelegate.getCurrentOffset();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/CountingInput.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/CountingInput.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/CountingInput.java
deleted file mode 100644
index 07609ba..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/CountingInput.java
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.google.cloud.dataflow.sdk.io.CountingSource.NowTimestampFn;
-import com.google.cloud.dataflow.sdk.io.Read.Unbounded;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
-import com.google.cloud.dataflow.sdk.values.PBegin;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded;
-import com.google.common.base.Optional;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-
-/**
- * A {@link PTransform} that produces longs. When used to produce a
- * {@link IsBounded#BOUNDED bounded} {@link PCollection}, {@link CountingInput} starts at {@code 0}
- * and counts up to a specified maximum. When used to produce an
- * {@link IsBounded#UNBOUNDED unbounded} {@link PCollection}, it counts up to {@link Long#MAX_VALUE}
- * and then never produces more output. (In practice, this limit should never be reached.)
- *
- * <p>The bounded {@link CountingInput} is implemented based on {@link OffsetBasedSource} and
- * {@link OffsetBasedSource.OffsetBasedReader}, so it performs efficient initial splitting and it
- * supports dynamic work rebalancing.
- *
- * <p>To produce a bounded {@code PCollection<Long>}, use {@link CountingInput#upTo(long)}:
- *
- * <pre>{@code
- * Pipeline p = ...
- * PTransform<PBegin, PCollection<Long>> producer = CountingInput.upTo(1000);
- * PCollection<Long> bounded = p.apply(producer);
- * }</pre>
- *
- * <p>To produce an unbounded {@code PCollection<Long>}, use {@link CountingInput#unbounded()},
- * calling {@link UnboundedCountingInput#withTimestampFn(SerializableFunction)} to provide values
- * with timestamps other than {@link Instant#now}.
- *
- * <pre>{@code
- * Pipeline p = ...
- *
- * // To create an unbounded producer that uses processing time as the element timestamp.
- * PCollection<Long> unbounded = p.apply(CountingInput.unbounded());
- * // Or, to create an unbounded source that uses a provided function to set the element timestamp.
- * PCollection<Long> unboundedWithTimestamps =
- * p.apply(CountingInput.unbounded().withTimestampFn(someFn));
- * }</pre>
- */
-public class CountingInput {
- /**
- * Creates a {@link BoundedCountingInput} that will produce the specified number of elements,
- * from {@code 0} to {@code numElements - 1}.
- *
- * @param numElements the number of elements to produce; must be greater than {@code 0}
- * @return a transform configured with {@code numElements}
- * @throws IllegalArgumentException if {@code numElements} is not greater than {@code 0}
- */
- public static BoundedCountingInput upTo(long numElements) {
- checkArgument(numElements > 0, "numElements (%s) must be greater than 0", numElements);
- return new BoundedCountingInput(numElements);
- }
-
- /**
- * Creates an {@link UnboundedCountingInput} that will produce numbers starting from {@code 0} up
- * to {@link Long#MAX_VALUE}.
- *
- * <p>After {@link Long#MAX_VALUE}, the transform never produces more output. (In practice, this
- * limit should never be reached.)
- *
- * <p>Elements in the resulting {@link PCollection PCollection&lt;Long&gt;} will by default have
- * timestamps corresponding to processing time at element generation, provided by
- * {@link Instant#now}. Use the transform returned by
- * {@link UnboundedCountingInput#withTimestampFn(SerializableFunction)} to control the output
- * timestamps.
- *
- * <p>The returned transform has neither a maximum number of records nor a maximum read time
- * set.
- */
- public static UnboundedCountingInput unbounded() {
- return new UnboundedCountingInput(
- new NowTimestampFn(), Optional.<Long>absent(), Optional.<Duration>absent());
- }
-
- /**
- * A {@link PTransform} that will produce a specified number of {@link Long Longs} starting from
- * 0.
- *
- * <p>Instances are created through {@link CountingInput#upTo(long)}; the constructor is private.
- * Applying the transform reads from {@link CountingSource#upTo(long)}. That factory method is
- * deprecated, which is why {@code apply} suppresses the deprecation warning.
- */
- public static class BoundedCountingInput extends PTransform<PBegin, PCollection<Long>> {
- private final long numElements;
-
- private BoundedCountingInput(long numElements) {
- this.numElements = numElements;
- }
-
- @SuppressWarnings("deprecation")
- @Override
- public PCollection<Long> apply(PBegin begin) {
- return begin.apply(Read.from(CountingSource.upTo(numElements)));
- }
- }
-
- /**
- * A {@link PTransform} that will produce numbers starting from {@code 0} up to
- * {@link Long#MAX_VALUE}.
- *
- * <p>After {@link Long#MAX_VALUE}, the transform never produces more output. (In practice, this
- * limit should never be reached.)
- *
- * <p>Elements in the resulting {@link PCollection PCollection&lt;Long&gt;} will by default have
- * timestamps corresponding to processing time at element generation, provided by
- * {@link Instant#now}. Use the transform returned by
- * {@link UnboundedCountingInput#withTimestampFn(SerializableFunction)} to control the output
- * timestamps.
- *
- * <p>Instances are immutable: each {@code with...} method returns a new transform and leaves
- * this one unchanged. Applying the transform reads from
- * {@link CountingSource#unboundedWithTimestampFn(SerializableFunction)} and adds whichever of
- * the maximum record count and maximum read time have been set. That factory method is
- * deprecated, which is why {@code apply} suppresses the deprecation warning.
- */
- public static class UnboundedCountingInput extends PTransform<PBegin, PCollection<Long>> {
- private final SerializableFunction<Long, Instant> timestampFn;
- private final Optional<Long> maxNumRecords;
- private final Optional<Duration> maxReadTime;
-
- private UnboundedCountingInput(
- SerializableFunction<Long, Instant> timestampFn,
- Optional<Long> maxNumRecords,
- Optional<Duration> maxReadTime) {
- this.timestampFn = timestampFn;
- this.maxNumRecords = maxNumRecords;
- this.maxReadTime = maxReadTime;
- }
-
- /**
- * Returns an {@link UnboundedCountingInput} like this one, but where output elements have the
- * timestamp specified by the timestampFn.
- *
- * <p>Note that the timestamps produced by {@code timestampFn} must never decrease.
- *
- * @param timestampFn the function that maps each produced number to its timestamp
- */
- public UnboundedCountingInput withTimestampFn(SerializableFunction<Long, Instant> timestampFn) {
- return new UnboundedCountingInput(timestampFn, maxNumRecords, maxReadTime);
- }
-
- /**
- * Returns an {@link UnboundedCountingInput} like this one, but that will read at most the
- * specified number of elements.
- *
- * <p>A bounded amount of elements will be produced by the result transform, and the result
- * {@link PCollection} will be {@link IsBounded#BOUNDED bounded}.
- *
- * @param maxRecords the maximum number of elements to read; must be greater than {@code 0}
- * @throws IllegalArgumentException if {@code maxRecords} is not greater than {@code 0}
- */
- public UnboundedCountingInput withMaxNumRecords(long maxRecords) {
- checkArgument(
- maxRecords > 0, "MaxRecords must be a positive (nonzero) value. Got %s", maxRecords);
- return new UnboundedCountingInput(timestampFn, Optional.of(maxRecords), maxReadTime);
- }
-
- /**
- * Returns an {@link UnboundedCountingInput} like this one, but that will read for at most the
- * specified amount of time.
- *
- * <p>A bounded amount of elements will be produced by the result transform, and the result
- * {@link PCollection} will be {@link IsBounded#BOUNDED bounded}.
- *
- * @param readTime the maximum amount of time to read for; must not be null
- * @throws NullPointerException if {@code readTime} is null
- */
- public UnboundedCountingInput withMaxReadTime(Duration readTime) {
- checkNotNull(readTime, "ReadTime cannot be null");
- return new UnboundedCountingInput(timestampFn, maxNumRecords, Optional.of(readTime));
- }
-
- @SuppressWarnings("deprecation")
- @Override
- public PCollection<Long> apply(PBegin begin) {
- Unbounded<Long> read = Read.from(CountingSource.unboundedWithTimestampFn(timestampFn));
- if (!maxNumRecords.isPresent() && !maxReadTime.isPresent()) {
- return begin.apply(read);
- } else if (maxNumRecords.isPresent() && !maxReadTime.isPresent()) {
- return begin.apply(read.withMaxNumRecords(maxNumRecords.get()));
- } else if (!maxNumRecords.isPresent() && maxReadTime.isPresent()) {
- return begin.apply(read.withMaxReadTime(maxReadTime.get()));
- } else {
- return begin.apply(
- read.withMaxReadTime(maxReadTime.get()).withMaxNumRecords(maxNumRecords.get()));
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/CountingSource.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/CountingSource.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/CountingSource.java
deleted file mode 100644
index 412f3a7..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/CountingSource.java
+++ /dev/null
@@ -1,397 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.google.cloud.dataflow.sdk.coders.AvroCoder;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.DefaultCoder;
-import com.google.cloud.dataflow.sdk.coders.VarLongCoder;
-import com.google.cloud.dataflow.sdk.io.CountingInput.UnboundedCountingInput;
-import com.google.cloud.dataflow.sdk.io.UnboundedSource.UnboundedReader;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.common.collect.ImmutableList;
-
-import org.joda.time.Instant;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.NoSuchElementException;
-
-/**
- * A source that produces longs. When used as a {@link BoundedSource}, {@link CountingSource}
- * starts at {@code 0} and counts up to a specified maximum. When used as an
- * {@link UnboundedSource}, it counts up to {@link Long#MAX_VALUE} and then never produces more
- * output. (In practice, this limit should never be reached.)
- *
- * <p>The bounded {@link CountingSource} is implemented based on {@link OffsetBasedSource} and
- * {@link OffsetBasedSource.OffsetBasedReader}, so it performs efficient initial splitting and it
- * supports dynamic work rebalancing.
- *
- * <p>To produce a bounded {@code PCollection<Long>}, use {@link CountingInput#upTo(long)}:
- *
- * <pre>{@code
- * Pipeline p = ...
- * PTransform<PBegin, PCollection<Long>> producer = CountingInput.upTo(1000);
- * PCollection<Long> bounded = p.apply(producer);
- * }</pre>
- *
- * <p>To produce an unbounded {@code PCollection<Long>}, use {@link CountingInput#unbounded()},
- * calling {@link UnboundedCountingInput#withTimestampFn(SerializableFunction)} to provide values
- * with timestamps other than {@link Instant#now}.
- *
- * <pre>{@code
- * Pipeline p = ...
- *
- * // To create an unbounded PCollection that uses processing time as the element timestamp.
- * PCollection<Long> unbounded = p.apply(CountingInput.unbounded());
- * // Or, to create an unbounded source that uses a provided function to set the element timestamp.
- * PCollection<Long> unboundedWithTimestamps =
- * p.apply(CountingInput.unbounded().withTimestampFn(someFn));
- *
- * }</pre>
- */
-public class CountingSource {
- /**
- * Creates a {@link BoundedSource} that will produce the specified number of elements,
- * from {@code 0} to {@code numElements - 1}.
- *
- * @param numElements the number of elements to produce; must be greater than {@code 0}
- * @return a source over the range {@code [0, numElements)}
- * @throws IllegalArgumentException if {@code numElements} is not greater than {@code 0}
- * @deprecated use {@link CountingInput#upTo(long)} instead
- */
- @Deprecated
- public static BoundedSource<Long> upTo(long numElements) {
- checkArgument(numElements > 0, "numElements (%s) must be greater than 0", numElements);
- return new BoundedCountingSource(0, numElements);
- }
-
- /**
- * Creates an {@link UnboundedSource} that will produce numbers starting from {@code 0} up to
- * {@link Long#MAX_VALUE}.
- *
- * <p>After {@link Long#MAX_VALUE}, the source never produces more output. (In practice, this
- * limit should never be reached.)
- *
- * <p>Elements in the resulting {@link PCollection PCollection&lt;Long&gt;} will have timestamps
- * corresponding to processing time at element generation, provided by {@link Instant#now}.
- *
- * <p>This is {@link #unboundedWithTimestampFn(SerializableFunction)} called with a
- * {@link NowTimestampFn}.
- *
- * @deprecated use {@link CountingInput#unbounded()} instead
- */
- @Deprecated
- public static UnboundedSource<Long, CounterMark> unbounded() {
- return unboundedWithTimestampFn(new NowTimestampFn());
- }
-
- /**
- * Creates an {@link UnboundedSource} that will produce numbers starting from {@code 0} up to
- * {@link Long#MAX_VALUE}, with element timestamps supplied by the specified function.
- *
- * <p>After {@link Long#MAX_VALUE}, the source never produces more output. (In practice, this
- * limit should never be reached.)
- *
- * <p>Note that the timestamps produced by {@code timestampFn} must never decrease.
- *
- * <p>The returned source is created with a start of {@code 0} and a stride of {@code 1}.
- *
- * @param timestampFn the function that maps each produced number to its timestamp
- * @deprecated use {@link CountingInput#unbounded()} and call
- * {@link UnboundedCountingInput#withTimestampFn(SerializableFunction)} instead
- */
- @Deprecated
- public static UnboundedSource<Long, CounterMark> unboundedWithTimestampFn(
- SerializableFunction<Long, Instant> timestampFn) {
- return new UnboundedCountingSource(0, 1, timestampFn);
- }
-
- /////////////////////////////////////////////////////////////////////////////////////////////
-
- /** Prevents instantiation; this class only offers static factory methods and nested types. */
- private CountingSource() {}
-
- /**
- * A function that returns {@link Instant#now} as the timestamp for each generated element.
- *
- * <p>The input value is ignored; every call reads the current time afresh.
- */
- static class NowTimestampFn implements SerializableFunction<Long, Instant> {
- @Override
- public Instant apply(Long input) {
- return Instant.now();
- }
- }
-
- /**
- * An implementation of {@link CountingSource} that produces a bounded {@link PCollection}.
- * It is implemented on top of {@link OffsetBasedSource} (with associated reader
- * {@link BoundedCountingReader}) and performs efficient initial splitting and supports dynamic
- * work rebalancing.
- *
- * <p>Each offset in the range is itself the value produced. The source reports {@code 8} bytes
- * per offset, uses its own end offset as the maximum end offset, reports that it produces
- * sorted keys, and encodes its output with {@link VarLongCoder}.
- */
- private static class BoundedCountingSource extends OffsetBasedSource<Long> {
- /**
- * Creates a {@link BoundedCountingSource} that generates the numbers in the specified
- * {@code [start, end)} range.
- *
- * @param start the start offset passed to the superclass
- * @param end the end offset passed to the superclass
- */
- public BoundedCountingSource(long start, long end) {
- super(start, end, 1 /* can be split every 1 offset */);
- }
-
- ////////////////////////////////////////////////////////////////////////////////////////////
-
- @Override
- public long getBytesPerOffset() {
- return 8;
- }
-
- @Override
- public long getMaxEndOffset(PipelineOptions options) throws Exception {
- return getEndOffset();
- }
-
- @Override
- public OffsetBasedSource<Long> createSourceForSubrange(long start, long end) {
- return new BoundedCountingSource(start, end);
- }
-
- @Override
- public boolean producesSortedKeys(PipelineOptions options) throws Exception {
- return true;
- }
-
- @Override
- public com.google.cloud.dataflow.sdk.io.BoundedSource.BoundedReader<Long> createReader(
- PipelineOptions options) throws IOException {
- return new BoundedCountingReader(this);
- }
-
- @Override
- public Coder<Long> getDefaultOutputCoder() {
- return VarLongCoder.of();
- }
- }
-
- /**
- * The reader associated with {@link BoundedCountingSource}.
- *
- * <p>A single counter serves as both the current element and the current offset. Starting sets
- * it to the source's start offset and each advance increments it by one; both steps always
- * report success.
- *
- * <p>NOTE(review): nothing in this class compares the counter with the source's end offset;
- * presumably {@link OffsetBasedSource.OffsetBasedReader} ends the read at the range boundary.
- * Confirm against the superclass.
- *
- * @see BoundedCountingSource
- */
- private static class BoundedCountingReader extends OffsetBasedSource.OffsetBasedReader<Long> {
- private long current;
-
- public BoundedCountingReader(OffsetBasedSource<Long> source) {
- super(source);
- }
-
- @Override
- protected long getCurrentOffset() throws NoSuchElementException {
- return current;
- }
-
- @Override
- public synchronized BoundedCountingSource getCurrentSource() {
- return (BoundedCountingSource) super.getCurrentSource();
- }
-
- @Override
- public Long getCurrent() throws NoSuchElementException {
- return current;
- }
-
- @Override
- protected boolean startImpl() throws IOException {
- current = getCurrentSource().getStartOffset();
- return true;
- }
-
- @Override
- protected boolean advanceImpl() throws IOException {
- current++;
- return true;
- }
-
- @Override
- public void close() throws IOException {}
- }
-
- /**
- * An implementation of {@link CountingSource} that produces an unbounded {@link PCollection}.
- *
- * <p>A source is described by the first number it generates, the stride between consecutive
- * numbers, and the function that assigns timestamps. Checkpoints are {@link CounterMark}s
- * encoded with {@link AvroCoder}, output is encoded with {@link VarLongCoder}, and
- * {@code validate()} performs no checks.
- */
- private static class UnboundedCountingSource extends UnboundedSource<Long, CounterMark> {
- /** The first number (>= 0) generated by this {@link UnboundedCountingSource}. */
- private final long start;
- /** The interval between numbers generated by this {@link UnboundedCountingSource}. */
- private final long stride;
- /** The function used to produce timestamps for the generated elements. */
- private final SerializableFunction<Long, Instant> timestampFn;
-
- /**
- * Creates an {@link UnboundedSource} that will produce numbers starting from {@code start},
- * each {@code stride} larger than the one before, up to {@link Long#MAX_VALUE}, with element
- * timestamps supplied by the specified function.
- *
- * <p>After {@link Long#MAX_VALUE}, the source never produces more output. (In practice, this
- * limit should never be reached.)
- *
- * <p>Note that the timestamps produced by {@code timestampFn} must never decrease.
- *
- * @param start the first number to generate
- * @param stride the interval between consecutive generated numbers
- * @param timestampFn the function that maps each generated number to its timestamp
- */
- public UnboundedCountingSource(
- long start, long stride, SerializableFunction<Long, Instant> timestampFn) {
- this.start = start;
- this.stride = stride;
- this.timestampFn = timestampFn;
- }
-
- /**
- * Splits an unbounded source {@code desiredNumSplits} ways by giving each split every
- * {@code desiredNumSplits}th element that this {@link UnboundedCountingSource}
- * produces.
- *
- * <p>E.g., if a source produces all even numbers {@code [0, 2, 4, 6, 8, ...)} and we want to
- * split into 3 new sources, then the new sources will produce numbers that are 6 apart and
- * are offset at the start by the original stride: {@code [0, 6, 12, ...)},
- * {@code [2, 8, 14, ...)}, and {@code [4, 10, 16, ...)}.
- *
- * @param desiredNumSplits the number of sources to create
- * @param options not used by this implementation
- * @return exactly {@code desiredNumSplits} sources that share this source's timestamp function
- */
- @Override
- public List<? extends UnboundedSource<Long, CountingSource.CounterMark>> generateInitialSplits(
- int desiredNumSplits, PipelineOptions options) throws Exception {
- // Using Javadoc example, stride 2 with 3 splits becomes stride 6.
- long newStride = stride * desiredNumSplits;
-
- ImmutableList.Builder<UnboundedCountingSource> splits = ImmutableList.builder();
- for (int i = 0; i < desiredNumSplits; ++i) {
- // Starts offset by the original stride. Using Javadoc example, this generates starts of
- // 0, 2, and 4.
- splits.add(new UnboundedCountingSource(start + i * stride, newStride, timestampFn));
- }
- return splits.build();
- }
-
- @Override
- public UnboundedReader<Long> createReader(
- PipelineOptions options, CounterMark checkpointMark) {
- return new UnboundedCountingReader(this, checkpointMark);
- }
-
- @Override
- public Coder<CountingSource.CounterMark> getCheckpointMarkCoder() {
- return AvroCoder.of(CountingSource.CounterMark.class);
- }
-
- @Override
- public void validate() {}
-
- @Override
- public Coder<Long> getDefaultOutputCoder() {
- return VarLongCoder.of();
- }
- }
-
- /**
- * The reader associated with {@link UnboundedCountingSource}.
- *
- * <p>The reader keeps the last value it emitted. Without a checkpoint mark it begins one stride
- * before the source's start, so that the first advance lands on the start value; with a mark it
- * resumes from the mark's last emitted value. Each successful advance adds the stride and
- * records the timestamp the source's timestamp function gives for the new value. Once adding
- * the stride would exceed {@link Long#MAX_VALUE}, advancing reports no more elements.
- *
- * <p>The watermark is the timestamp function applied to the current value, recomputed on every
- * call. Before the first advance of a reader created without a mark, that current value is
- * still one stride before the source's start.
- *
- * @see UnboundedCountingSource
- */
- private static class UnboundedCountingReader extends UnboundedReader<Long> {
- private UnboundedCountingSource source;
- private long current;
- private Instant currentTimestamp;
-
- public UnboundedCountingReader(UnboundedCountingSource source, CounterMark mark) {
- this.source = source;
- if (mark == null) {
- // Because we have not emitted an element yet, and start() calls advance, we need to
- // "un-advance" so that start() produces the correct output.
- this.current = source.start - source.stride;
- } else {
- this.current = mark.getLastEmitted();
- }
- }
-
- @Override
- public boolean start() throws IOException {
- return advance();
- }
-
- @Override
- public boolean advance() throws IOException {
- // Overflow-safe check that (current + source.stride) <= Long.MAX_VALUE. Else, stop producing.
- if (Long.MAX_VALUE - source.stride < current) {
- return false;
- }
- current += source.stride;
- currentTimestamp = source.timestampFn.apply(current);
- return true;
- }
-
- @Override
- public Instant getWatermark() {
- return source.timestampFn.apply(current);
- }
-
- @Override
- public CounterMark getCheckpointMark() {
- return new CounterMark(current);
- }
-
- @Override
- public UnboundedSource<Long, CounterMark> getCurrentSource() {
- return source;
- }
-
- @Override
- public Long getCurrent() throws NoSuchElementException {
- return current;
- }
-
- @Override
- public Instant getCurrentTimestamp() throws NoSuchElementException {
- return currentTimestamp;
- }
-
- @Override
- public void close() throws IOException {}
- }
-
- /**
- * The checkpoint for an unbounded {@link CountingSource} is simply the last value produced. The
- * associated source object encapsulates the information needed to produce the next value.
- *
- * <p>The class is annotated to be encoded with {@link AvroCoder}. Finalizing a checkpoint does
- * nothing.
- */
- @DefaultCoder(AvroCoder.class)
- public static class CounterMark implements UnboundedSource.CheckpointMark {
- /** The last value emitted. */
- private final long lastEmitted;
-
- /**
- * Creates a checkpoint mark reflecting the last emitted value.
- *
- * @param lastEmitted the last value emitted by the reader
- */
- public CounterMark(long lastEmitted) {
- this.lastEmitted = lastEmitted;
- }
-
- /**
- * Returns the last value emitted by the reader.
- */
- public long getLastEmitted() {
- return lastEmitted;
- }
-
- /////////////////////////////////////////////////////////////////////////////////////
-
- @SuppressWarnings("unused") // For AvroCoder
- private CounterMark() {
- this.lastEmitted = 0L;
- }
-
- @Override
- public void finalizeCheckpoint() throws IOException {}
- }
-}
[48/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderRegistry.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderRegistry.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderRegistry.java
deleted file mode 100644
index 00982e6..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CoderRegistry.java
+++ /dev/null
@@ -1,843 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException.ReasonCode;
-import com.google.cloud.dataflow.sdk.coders.protobuf.ProtoCoder;
-import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
-import com.google.cloud.dataflow.sdk.util.CoderUtils;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.TimestampedValue;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.Maps;
-import com.google.protobuf.ByteString;
-
-import org.joda.time.Instant;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.lang.reflect.ParameterizedType;
-import java.lang.reflect.Type;
-import java.lang.reflect.TypeVariable;
-import java.lang.reflect.WildcardType;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import javax.annotation.Nullable;
-
-/**
- * A {@link CoderRegistry} allows registering the default {@link Coder} to use for a Java class,
- * and looking up and instantiating the default {@link Coder} for a Java type.
- *
- * <p>{@link CoderRegistry} uses the following mechanisms to determine a default {@link Coder} for a
- * Java class, in order of precedence:
- * <ol>
- * <li>Registration:
- * <ul>
- * <li>A {@link CoderFactory} can be registered to handle a particular class via
- * {@link #registerCoder(Class, CoderFactory)}.</li>
- * <li>A {@link Coder} class with the static methods to satisfy
- * {@link CoderFactories#fromStaticMethods} can be registered via
- * {@link #registerCoder(Class, Class)}.</li>
- * <li>Built-in types are registered via
- * {@link #registerStandardCoders()}.</li>
- * </ul>
- * <li>Annotations: {@link DefaultCoder} can be used to annotate a type with
- * the default {@code Coder} type. The {@link Coder} class must satisfy the requirements
- * of {@link CoderProviders#fromStaticMethods}.
- * <li>Fallback: A fallback {@link CoderProvider} is used to attempt to provide a {@link Coder}
- * for any type. By default, this first tries {@link ProtoCoder#coderProvider()} and then
- * {@link SerializableCoder#PROVIDER}, which can provide a {@link Coder} for any type that is
- * serializable via Java serialization. The fallback {@link CoderProvider} can be read and
- * replaced via {@link #getFallbackCoderProvider()} and {@link #setFallbackCoderProvider}.
- * Multiple fallbacks can be chained together using {@link CoderProviders#firstOf}.
- * </ol>
- */
-public class CoderRegistry implements CoderProvider {
-
- private static final Logger LOG = LoggerFactory.getLogger(CoderRegistry.class);
-
- /**
-  * Creates a registry whose fallback {@link CoderProvider} first tries
-  * {@link ProtoCoder#coderProvider()} and then {@link SerializableCoder#PROVIDER}. This
-  * constructor itself registers no per-class coders.
-  */
- public CoderRegistry() {
-   CoderProvider defaultFallback =
-       CoderProviders.firstOf(ProtoCoder.coderProvider(), SerializableCoder.PROVIDER);
-   setFallbackCoderProvider(defaultFallback);
- }
-
- /**
-  * Registers the standard {@link Coder} classes with this registry, one
-  * {@link #registerCoder(Class, Class)} call per supported value class.
-  */
- public void registerStandardCoders() {
-   // Each row is {value class, Coder class}; rows are registered in the order listed.
-   Class<?>[][] standardCoders = {
-     {Byte.class, ByteCoder.class},
-     {ByteString.class, ByteStringCoder.class},
-     {Double.class, DoubleCoder.class},
-     {Instant.class, InstantCoder.class},
-     {Integer.class, VarIntCoder.class},
-     {Iterable.class, IterableCoder.class},
-     {KV.class, KvCoder.class},
-     {List.class, ListCoder.class},
-     {Long.class, VarLongCoder.class},
-     {Map.class, MapCoder.class},
-     {Set.class, SetCoder.class},
-     {String.class, StringUtf8Coder.class},
-     {TableRow.class, TableRowJsonCoder.class},
-     {TimestampedValue.class, TimestampedValue.TimestampedValueCoder.class},
-     {Void.class, VoidCoder.class},
-     {byte[].class, ByteArrayCoder.class},
-   };
-   for (Class<?>[] row : standardCoders) {
-     registerCoder(row[0], row[1]);
-   }
- }
-
- /**
-  * Makes {@code coderClazz} the default {@link Coder} class for encoding and decoding
-  * instances of {@code clazz}. Any earlier registration for {@code clazz} is replaced.
-  *
-  * <p>With {@code T} denoting the static type corresponding to {@code clazz},
-  * {@code coderClazz} is expected to expose a static factory method of this shape:
-  *
-  * <pre> {@code
-  * public static Coder<T> of(Coder<X> argCoder1, Coder<Y> argCoder2, ...)
-  * } </pre>
-  *
-  * <p>The factory method is invoked to build a {@code Coder<T>}, receiving one {@link Coder}
-  * per generic type parameter of {@code T}. When {@code T} has no generic type parameters the
-  * {@code of()} method should take no arguments.
-  *
-  * <p>When {@code T} is a parameterized type, {@code coderClazz} should also provide a method
-  * of this shape:
-  *
-  * <pre> {@code
-  * public static List<Object> getInstanceComponents(T exampleValue);
-  * } </pre>
-  *
-  * <p>It is used during {@link Coder} inference to break a value into its components so that
-  * {@link Coder Coders} for those components can be chosen automatically.
-  *
-  * @param clazz the class of objects to be encoded
-  * @param coderClazz a class with static factory methods to provide {@link Coder Coders}
-  */
- public void registerCoder(Class<?> clazz, Class<?> coderClazz) {
-   CoderFactory staticMethodFactory = CoderFactories.fromStaticMethods(coderClazz);
-   registerCoder(clazz, staticMethodFactory);
- }
-
- /**
-  * Makes {@code coderFactory} the default {@link CoderFactory} used to produce {@code Coder}
-  * instances for {@code clazz}. Any earlier registration for {@code clazz} is replaced.
-  *
-  * @param clazz the class of objects to be encoded
-  * @param coderFactory the factory that will create coders for {@code clazz}
-  */
- public void registerCoder(Class<?> clazz, CoderFactory coderFactory) {
-   this.coderFactoryMap.put(clazz, coderFactory);
- }
-
- /**
-  * Registers the given {@link Coder} instance for all values of the specified {@code Class},
-  * replacing any earlier registration.
-  *
-  * <p>Not for use with generic rawtypes. For those, register a {@link CoderFactory} via
-  * {@link #registerCoder(Class, CoderFactory)}, or give your {@code Coder} class the
-  * appropriate static methods and register it via {@link #registerCoder(Class, Class)}.
-  *
-  * @throws IllegalArgumentException if {@code rawClazz} declares type parameters
-  */
- public <T> void registerCoder(Class<T> rawClazz, Coder<T> coder) {
-   if (rawClazz.getTypeParameters().length != 0) {
-     throw new IllegalArgumentException(
-         "CoderRegistry.registerCoder(Class<T>, Coder<T>) may not be used "
-             + "with unspecialized generic classes");
-   }
-
-   CoderFactory fixedCoderFactory = CoderFactories.forCoder(coder);
-   registerCoder(rawClazz, fixedCoderFactory);
- }
-
- /**
-  * Looks up the default {@link Coder} for values of the given type, with no pre-existing
-  * type-to-coder bindings.
-  *
-  * @throws CannotProvideCoderException if there is no default Coder.
-  */
- public <T> Coder<T> getDefaultCoder(TypeDescriptor<T> typeDescriptor)
-     throws CannotProvideCoderException {
-   Map<Type, Coder<?>> noBindings = Collections.emptyMap();
-   return getDefaultCoder(typeDescriptor, noBindings);
- }
-
- /**
-  * Delegates to {@link #getDefaultCoder(TypeDescriptor)}.
-  *
-  * @throws CannotProvideCoderException if there is no default Coder.
-  */
- @Override
- public <T> Coder<T> getCoder(TypeDescriptor<T> typeDescriptor)
-     throws CannotProvideCoderException {
-   Coder<T> defaultCoder = getDefaultCoder(typeDescriptor);
-   return defaultCoder;
- }
-
- /**
-  * Looks up the default {@link Coder} for values of the given output type, in a context where
-  * the given input type is known to use {@code inputCoder}.
-  *
-  * @throws CannotProvideCoderException if there is no default Coder.
-  */
- public <InputT, OutputT> Coder<OutputT> getDefaultCoder(
-     TypeDescriptor<OutputT> typeDescriptor,
-     TypeDescriptor<InputT> inputTypeDescriptor,
-     Coder<InputT> inputCoder)
-     throws CannotProvideCoderException {
-   Type inputType = inputTypeDescriptor.getType();
-   Map<Type, Coder<?>> inputBindings = getTypeToCoderBindings(inputType, inputCoder);
-   return getDefaultCoder(typeDescriptor, inputBindings);
- }
-
- /**
-  * Infers the {@link Coder} for the elements produced by {@code fn}, given the {@link Coder}
-  * used for its input elements.
-  *
-  * @throws CannotProvideCoderException if no coder can be inferred for the output type
-  */
- public <InputT, OutputT> Coder<OutputT> getDefaultOutputCoder(
-     SerializableFunction<InputT, OutputT> fn, Coder<InputT> inputCoder)
-     throws CannotProvideCoderException {
-
-   // Resolve SerializableFunction<In, Out> as seen from fn's concrete class.
-   ParameterizedType fnType = (ParameterizedType)
-       TypeDescriptor.of(fn.getClass()).getSupertype(SerializableFunction.class).getType();
-
-   // First type argument is the input type; second type parameter is the output variable.
-   Type inputType = fnType.getActualTypeArguments()[0];
-   TypeVariable<?> outputTypeVariable = SerializableFunction.class.getTypeParameters()[1];
-
-   return getDefaultCoder(
-       fn.getClass(),
-       SerializableFunction.class,
-       ImmutableMap.of(inputType, inputCoder),
-       outputTypeVariable);
- }
-
- /**
-  * Infers the {@link Coder} for one type parameter of {@code baseClass} as specialized by
-  * {@code subClass}, given {@link Coder Coders} for the other type parameters (if any).
-  *
-  * @throws CannotProvideCoderException if there is no default Coder.
-  */
- public <T, OutputT> Coder<OutputT> getDefaultCoder(
-     Class<? extends T> subClass,
-     Class<T> baseClass,
-     Map<Type, ? extends Coder<?>> knownCoders,
-     TypeVariable<?> param)
-     throws CannotProvideCoderException {
-
-   Coder<?> inferred = getDefaultCoders(subClass, baseClass, knownCoders).get(param);
-   if (inferred == null) {
-     throw new CannotProvideCoderException(
-         "Cannot infer coder for type parameter " + param.getName());
-   }
-
-   @SuppressWarnings("unchecked")
-   Coder<OutputT> paramCoder = (Coder<OutputT>) inferred;
-   return paramCoder;
- }
-
- /**
-  * Infers a {@link Coder} from an example value. A {@code null} value is treated as having
-  * class {@link Void}. For a value whose class declares type parameters, the registered
-  * {@link CoderFactory} decomposes the value and a coder is inferred for each component.
-  *
-  * @throws CannotProvideCoderException if there is no default {@link Coder} or
-  *     more than one {@link Coder} matches
-  */
- public <T> Coder<T> getDefaultCoder(T exampleValue) throws CannotProvideCoderException {
-   Class<?> valueClass = (exampleValue != null) ? exampleValue.getClass() : Void.class;
-
-   if (valueClass.getTypeParameters().length == 0) {
-     // Non-generic class: trust that the class-based lookup yields a valid Coder<T>.
-     @SuppressWarnings("unchecked")
-     Coder<T> simpleCoder = (Coder<T>) getDefaultCoder(valueClass);
-     return simpleCoder;
-   }
-
-   CoderFactory valueFactory = getDefaultCoderFactory(valueClass);
-   List<Object> parts = valueFactory.getInstanceComponents(exampleValue);
-   if (parts == null) {
-     throw new CannotProvideCoderException(String.format(
-         "Cannot provide coder based on value with class %s: The registered CoderFactory with "
-             + "class %s failed to decompose the value, which is required in order to provide "
-             + "Coders for the components.",
-         valueClass.getCanonicalName(), valueFactory.getClass().getCanonicalName()));
-   }
-
-   // Infer one coder per component, in component order.
-   List<Coder<?>> partCoders = new ArrayList<>();
-   for (Object part : parts) {
-     Coder<?> partCoder;
-     try {
-       partCoder = getDefaultCoder(part);
-     } catch (CannotProvideCoderException exc) {
-       throw new CannotProvideCoderException(
-           String.format("Cannot provide coder based on value with class %s",
-               valueClass.getCanonicalName()),
-           exc);
-     }
-     partCoders.add(partCoder);
-   }
-
-   // Trust that the factory maps valid component coders to a valid Coder<T>.
-   @SuppressWarnings("unchecked")
-   Coder<T> composite = (Coder<T>) valueFactory.create(partCoders);
-   return composite;
- }
-
- /**
-  * Returns the {@link Coder} to use by default for values of the given class. The following three
-  * sources for a {@link Coder} will be attempted, in order:
-  *
-  * <ol>
-  *   <li>A {@link Coder} class registered explicitly via a call to {@link #registerCoder},
-  *   <li>A {@link DefaultCoder} annotation on the class,
-  *   <li>This registry's fallback {@link CoderProvider}, which may be able to generate a
-  *       {@link Coder} for an arbitrary class.
-  * </ol>
-  *
-  * <p>If all three fail, the thrown exception's message lists each failure, and the individual
-  * exceptions are attached to it as suppressed exceptions so their stack traces are not lost.
-  *
-  * @throws CannotProvideCoderException if a {@link Coder} cannot be provided
-  */
- public <T> Coder<T> getDefaultCoder(Class<T> clazz) throws CannotProvideCoderException {
-
-   // 1. Explicitly registered factory.
-   CannotProvideCoderException factoryException;
-   try {
-     CoderFactory coderFactory = getDefaultCoderFactory(clazz);
-     LOG.debug("Default coder for {} found by factory", clazz);
-     @SuppressWarnings("unchecked")
-     Coder<T> coder = (Coder<T>) coderFactory.create(Collections.<Coder<?>>emptyList());
-     return coder;
-   } catch (CannotProvideCoderException exc) {
-     factoryException = exc;
-   }
-
-   // 2. @DefaultCoder annotation on the class.
-   CannotProvideCoderException annotationException;
-   try {
-     return getDefaultCoderFromAnnotation(clazz);
-   } catch (CannotProvideCoderException exc) {
-     annotationException = exc;
-   }
-
-   // 3. Fallback provider, if one is configured.
-   CannotProvideCoderException fallbackException;
-   if (getFallbackCoderProvider() != null) {
-     try {
-       return getFallbackCoderProvider().getCoder(TypeDescriptor.<T>of(clazz));
-     } catch (CannotProvideCoderException exc) {
-       fallbackException = exc;
-     }
-   } else {
-     fallbackException = new CannotProvideCoderException("no fallback CoderProvider configured");
-   }
-
-   // Build up the error message and list of causes.
-   StringBuilder messageBuilder = new StringBuilder()
-       .append("Unable to provide a default Coder for ").append(clazz.getCanonicalName())
-       .append(". Correct one of the following root causes:");
-
-   messageBuilder
-       .append("\n Building a Coder using a registered CoderFactory failed: ")
-       .append(factoryException.getMessage());
-
-   messageBuilder
-       .append("\n Building a Coder from the @DefaultCoder annotation failed: ")
-       .append(annotationException.getMessage());
-
-   messageBuilder
-       .append("\n Building a Coder from the fallback CoderProvider failed: ")
-       .append(fallbackException.getMessage());
-
-   // Keep the underlying failures reachable from the aggregate exception.
-   CannotProvideCoderException aggregate =
-       new CannotProvideCoderException(messageBuilder.toString());
-   aggregate.addSuppressed(factoryException);
-   aggregate.addSuppressed(annotationException);
-   aggregate.addSuppressed(fallbackException);
-   throw aggregate;
- }
-
- /**
-  * Replaces the fallback {@link CoderProvider} of this registry. The fallback is consulted to
-  * create a {@code Coder<T>} for a type {@code T} when no other mechanism succeeds.
-  *
-  * <p>A newly constructed registry uses {@link ProtoCoder#coderProvider()} followed by
-  * {@link SerializableCoder#PROVIDER} as its fallback.
-  *
-  * <p>See {@link #getFallbackCoderProvider}.
-  */
- public void setFallbackCoderProvider(CoderProvider coderProvider) {
-   this.fallbackCoderProvider = coderProvider;
- }
-
- /**
-  * Returns the fallback {@link CoderProvider} currently set on this registry.
-  *
-  * <p>See {@link #setFallbackCoderProvider}.
-  */
- public CoderProvider getFallbackCoderProvider() {
-   return this.fallbackCoderProvider;
- }
-
- /**
-  * Returns a {@code Map} from each of {@code baseClass}'s type parameters to the {@link Coder}
-  * to use by default for it, in the context of {@code subClass}'s specialization of
-  * {@code baseClass}.
-  *
-  * <p>A type parameter for which no {@link Coder} can be inferred is simply absent from the
-  * returned {@code Map}.
-  *
-  * <p>For example, if {@code baseClass} is {@code Map.class}, where {@code Map<K, V>} has type
-  * parameters {@code K} and {@code V}, and {@code subClass} extends {@code Map<String, Integer>},
-  * then the result maps the type variable {@code K} to a {@code Coder<String>} and the type
-  * variable {@code V} to a {@code Coder<Integer>}.
-  *
-  * <p>The {@code knownCoders} parameter can supply known {@link Coder Coders} for any of the
-  * parameters; these are used to infer the others.
-  *
-  * <p>Inference is attempted for every type variable. For a type
-  * {@code MyType<One, Two, Three>} it is attempted for all of {@code One}, {@code Two} and
-  * {@code Three}, even if the requester only wants a {@link Coder} for {@code Two}.
-  *
-  * <p>For this reason {@code getDefaultCoders} (plural) does not throw an exception when a
-  * {@link Coder} for a particular type variable cannot be inferred; it merely omits the entry.
-  * It is the responsibility of the caller (usually {@link #getDefaultCoder}) to extract the
-  * desired coder or throw a {@link CannotProvideCoderException} when appropriate.
-  *
-  * @param subClass the concrete type whose specializations are being inferred
-  * @param baseClass the base type, a parameterized class
-  * @param knownCoders a map of known {@link Coder Coders}, looked up by the type parameters of
-  *     {@code baseClass}
-  *
-  * @deprecated this method is not part of the public interface and will be made private
-  */
- @Deprecated
- public <T> Map<Type, Coder<?>> getDefaultCoders(
-     Class<? extends T> subClass,
-     Class<T> baseClass,
-     Map<Type, ? extends Coder<?>> knownCoders) {
-   TypeVariable<Class<T>>[] baseParams = baseClass.getTypeParameters();
-   int paramCount = baseParams.length;
-
-   // Lay the known coders out positionally, one slot per type parameter.
-   Coder<?>[] knownByPosition = new Coder<?>[paramCount];
-   for (int pos = 0; pos < paramCount; pos++) {
-     knownByPosition[pos] = knownCoders.get(baseParams[pos]);
-   }
-
-   Coder<?>[] inferredByPosition = getDefaultCoders(subClass, baseClass, knownByPosition);
-
-   // Convert back to a map, dropping the slots that could not be inferred.
-   Map<Type, Coder<?>> codersByParam = new HashMap<>();
-   for (int pos = 0; pos < paramCount; pos++) {
-     Coder<?> inferred = inferredByPosition[pos];
-     if (inferred != null) {
-       codersByParam.put(baseParams[pos], inferred);
-     }
-   }
-   return codersByParam;
- }
-
- /**
-  * Returns an array listing, for each of {@code baseClass}'s type parameters, the {@link Coder}
-  * to use by default for it, in the context of {@code subClass}'s specialization of
-  * {@code baseClass}.
-  *
-  * <p>A slot is {@code null} when no {@link Coder} can be inferred for that type variable.
-  *
-  * <p>For example, if {@code baseClass} is {@code Map.class}, where {@code Map<K, V>} has type
-  * parameters {@code K} and {@code V} in that order, and {@code subClass} extends
-  * {@code Map<String, Integer>}, then the result contains a {@code Coder<String>} and a
-  * {@code Coder<Integer>}, in that order.
-  *
-  * <p>The {@code knownCoders} parameter can supply known {@link Coder Coders} for any of the
-  * type parameters; these are used to infer the others. If non-null, its length must match the
-  * number of type parameters of {@code baseClass}, with {@code null} in each slot whose
-  * {@link Coder} is not known.
-  *
-  * <p>Inference is attempted for every type variable. For a type
-  * {@code MyType<One, Two, Three>} it is attempted for all of {@code One}, {@code Two} and
-  * {@code Three}, even if the requester only wants a {@link Coder} for {@code Two}.
-  *
-  * <p>For this reason {@code getDefaultCoders} (plural) does not throw an exception when a
-  * {@link Coder} for a particular type variable cannot be inferred; the slot is left
-  * {@code null} instead. It is the responsibility of the caller (usually
-  * {@link #getDefaultCoder}) to extract the desired coder or throw a
-  * {@link CannotProvideCoderException} when appropriate.
-  *
-  * @param subClass the concrete type whose specializations are being inferred
-  * @param baseClass the base type, a parameterized class
-  * @param knownCoders an array corresponding to the set of base class type parameters. Each entry
-  *     can be either a {@link Coder} (in which case it will be used for inference) or
-  *     {@code null} (in which case it will be inferred). May be {@code null} to indicate the
-  *     entire set of parameters should be inferred.
-  * @throws IllegalArgumentException if baseClass doesn't have type parameters, if the length of
-  *     {@code knownCoders} is not equal to the number of type parameters of {@code baseClass},
-  *     or if a provided coder is incompatible with its type argument.
-  */
- private <T> Coder<?>[] getDefaultCoders(
-     Class<? extends T> subClass,
-     Class<T> baseClass,
-     @Nullable Coder<?>[] knownCoders) {
-   Type resolvedSupertype = TypeDescriptor.of(subClass).getSupertype(baseClass).getType();
-   if (!(resolvedSupertype instanceof ParameterizedType)) {
-     throw new IllegalArgumentException(resolvedSupertype + " is not a ParameterizedType");
-   }
-   Type[] typeArgs = ((ParameterizedType) resolvedSupertype).getActualTypeArguments();
-   int paramCount = typeArgs.length;
-
-   if (knownCoders == null) {
-     knownCoders = new Coder<?>[paramCount];
-   } else if (knownCoders.length != paramCount) {
-     throw new IllegalArgumentException(
-         String.format("Class %s has %d parameters, but %d coders are requested.",
-             baseClass.getCanonicalName(), paramCount, knownCoders.length));
-   }
-
-   // Pass 1: validate every supplied coder and collect the bindings it implies. This must
-   // finish before any inference so that all bindings are available to pass 2.
-   Map<Type, Coder<?>> context = new HashMap<>();
-   for (int pos = 0; pos < paramCount; pos++) {
-     Coder<?> supplied = knownCoders[pos];
-     if (supplied == null) {
-       continue;
-     }
-     try {
-       verifyCompatible(knownCoders[pos], typeArgs[pos]);
-     } catch (IncompatibleCoderException exn) {
-       throw new IllegalArgumentException(
-           String.format("Provided coders for type arguments of %s contain incompatibilities:"
-               + " Cannot encode elements of type %s with coder %s",
-               baseClass,
-               typeArgs[pos], supplied),
-           exn);
-     }
-     context.putAll(getTypeToCoderBindings(typeArgs[pos], supplied));
-   }
-
-   // Pass 2: keep supplied coders as-is and try to infer the rest.
-   Coder<?>[] coders = new Coder<?>[paramCount];
-   for (int pos = 0; pos < paramCount; pos++) {
-     if (knownCoders[pos] != null) {
-       coders[pos] = knownCoders[pos];
-       continue;
-     }
-     try {
-       coders[pos] = getDefaultCoder(typeArgs[pos], context);
-     } catch (CannotProvideCoderException ignored) {
-       // Inference failed for this parameter: its slot stays null by contract.
-       coders[pos] = null;
-     }
-   }
-   return coders;
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
-  * Thrown when a {@link Coder} cannot possibly encode a type, yet has been proposed as a
-  * {@link Coder} for that type. Carries the offending coder and type for inspection.
-  */
- @VisibleForTesting static class IncompatibleCoderException extends RuntimeException {
-   // Set once at construction and never reassigned.
-   private final Coder<?> coder;
-   private final Type type;
-
-   /**
-    * @param message description of the incompatibility
-    * @param coder the coder that was proposed
-    * @param type the type the coder cannot encode
-    */
-   public IncompatibleCoderException(String message, Coder<?> coder, Type type) {
-     super(message);
-     this.coder = coder;
-     this.type = type;
-   }
-
-   /**
-    * @param message description of the incompatibility
-    * @param coder the coder that was proposed
-    * @param type the type the coder cannot encode
-    * @param cause the underlying failure, typically an incompatibility of a component coder
-    */
-   public IncompatibleCoderException(String message, Coder<?> coder, Type type, Throwable cause) {
-     super(message, cause);
-     this.coder = coder;
-     this.type = type;
-   }
-
-   /** Returns the coder that was found to be incompatible. */
-   public Coder<?> getCoder() {
-     return coder;
-   }
-
-   /** Returns the type that the coder cannot encode. */
-   public Type getType() {
-     return type;
-   }
- }
-
- /**
- * Checks whether the given {@link Coder} can possibly encode elements of the given type.
- * Returns normally when compatibility cannot be ruled out; this method has no return value.
- *
- * <p>A candidate that is a {@link TypeVariable} is always accepted. Otherwise the coder is
- * rejected when the raw class it encodes is not assignable from the candidate's raw class, when
- * the candidate's generic supertype has fewer type arguments than the coder has coder
- * arguments, or when any coder argument is itself incompatible with the corresponding type
- * argument (checked recursively).
- *
- * @param coder the proposed coder
- * @param candidateType the type of the elements to be encoded
- * @throws IncompatibleCoderException if the coder certainly cannot encode {@code candidateType}
- */
- @VisibleForTesting static <T, CoderT extends Coder<T>, CandidateT>
- void verifyCompatible(CoderT coder, Type candidateType) throws IncompatibleCoderException {
-
- // Various representations of the coder's class
- @SuppressWarnings("unchecked")
- Class<CoderT> coderClass = (Class<CoderT>) coder.getClass();
- TypeDescriptor<CoderT> coderDescriptor = TypeDescriptor.of(coderClass);
-
- // Various representations of the actual coded type
- @SuppressWarnings("unchecked")
- TypeDescriptor<T> codedDescriptor = CoderUtils.getCodedType(coderDescriptor);
- @SuppressWarnings("unchecked")
- Class<T> codedClass = (Class<T>) codedDescriptor.getRawType();
- Type codedType = codedDescriptor.getType();
-
- // Various representations of the candidate type
- @SuppressWarnings("unchecked")
- TypeDescriptor<CandidateT> candidateDescriptor =
- (TypeDescriptor<CandidateT>) TypeDescriptor.of(candidateType);
- @SuppressWarnings("unchecked")
- Class<CandidateT> candidateClass = (Class<CandidateT>) candidateDescriptor.getRawType();
-
- // If coder has type Coder<T> where the actual value of T is lost
- // to erasure, then we cannot rule it out.
- if (candidateType instanceof TypeVariable) {
- return;
- }
-
- // If the raw types are not compatible, we can certainly rule out
- // coder compatibility
- if (!codedClass.isAssignableFrom(candidateClass)) {
- throw new IncompatibleCoderException(
- String.format("Cannot encode elements of type %s with coder %s because the"
- + " coded type %s is not assignable from %s",
- candidateType, coder, codedClass, candidateType),
- coder, candidateType);
- }
- // we have established that this is a covariant upcast... though
- // coders are invariant, we are just checking one direction
- @SuppressWarnings("unchecked")
- TypeDescriptor<T> candidateOkDescriptor = (TypeDescriptor<T>) candidateDescriptor;
-
- // If the coded type is a parameterized type where any of the actual
- // type parameters are not compatible, then the whole thing is certainly not
- // compatible.
- if ((codedType instanceof ParameterizedType) && !isNullOrEmpty(coder.getCoderArguments())) {
- ParameterizedType parameterizedSupertype = ((ParameterizedType)
- candidateOkDescriptor.getSupertype(codedClass).getType());
- Type[] typeArguments = parameterizedSupertype.getActualTypeArguments();
- List<? extends Coder<?>> typeArgumentCoders = coder.getCoderArguments();
- if (typeArguments.length < typeArgumentCoders.size()) {
- throw new IncompatibleCoderException(
- String.format("Cannot encode elements of type %s with coder %s:"
- + " the generic supertype %s has %s type parameters, which is less than the"
- + " number of coder arguments %s has (%s).",
- candidateOkDescriptor, coder,
- parameterizedSupertype, typeArguments.length,
- coder, typeArgumentCoders.size()),
- coder, candidateOkDescriptor.getType());
- }
- for (int i = 0; i < typeArgumentCoders.size(); i++) {
- try {
- verifyCompatible(
- typeArgumentCoders.get(i),
- candidateDescriptor.resolveType(typeArguments[i]).getType());
- } catch (IncompatibleCoderException exn) {
- throw new IncompatibleCoderException(
- String.format("Cannot encode elements of type %s with coder %s"
- + " because some component coder is incompatible",
- candidateType, coder),
- coder, candidateType, exn);
- }
- }
- }
- }
-
- /** Returns {@code true} when {@code c} is {@code null} or reports a size of zero. */
- private static boolean isNullOrEmpty(Collection<?> c) {
-   if (c == null) {
-     return true;
-   }
-   return c.size() == 0;
- }
-
- /**
- * The map of classes to the CoderFactories to use to create their
- * default Coders.
- */
- private Map<Class<?>, CoderFactory> coderFactoryMap = new HashMap<>();
-
- /**
- * A provider of coders for types where no coder is registered.
- */
- private CoderProvider fallbackCoderProvider;
-
- /**
-  * Looks up the {@link CoderFactory} registered for the given class.
-  *
-  * @return the registered factory; never {@code null}
-  * @throws CannotProvideCoderException if no {@link CoderFactory} is registered for
-  *     {@code clazz}
-  */
- private CoderFactory getDefaultCoderFactory(Class<?> clazz) throws CannotProvideCoderException {
-   CoderFactory registered = coderFactoryMap.get(clazz);
-   if (registered == null) {
-     throw new CannotProvideCoderException(
-         String.format("Cannot provide coder based on value with class %s: No CoderFactory has "
-             + "been registered for the class.", clazz.getCanonicalName()));
-   }
-   return registered;
- }
-
- /**
-  * Builds a {@link Coder} for {@code clazz} from its {@link DefaultCoder} annotation, using a
-  * {@link CoderProvider} derived from the annotated coder class's static methods.
-  *
-  * @throws CannotProvideCoderException if {@code clazz} has no {@link DefaultCoder} annotation
-  *     or the derived provider cannot supply a coder
-  */
- private <T> Coder<T> getDefaultCoderFromAnnotation(Class<T> clazz)
-     throws CannotProvideCoderException {
-   DefaultCoder annotation = clazz.getAnnotation(DefaultCoder.class);
-   if (annotation == null) {
-     throw new CannotProvideCoderException(
-         String.format("Class %s does not have a @DefaultCoder annotation.",
-             clazz.getCanonicalName()));
-   }
-
-   LOG.debug("DefaultCoder annotation found for {} with value {}",
-       clazz, annotation.value());
-   CoderProvider annotatedProvider = CoderProviders.fromStaticMethods(annotation.value());
-   TypeDescriptor<T> describedClass = TypeDescriptor.of(clazz);
-   return annotatedProvider.getCoder(describedClass);
- }
-
- /**
-  * Typed wrapper around {@link #getDefaultCoder(Type, Map)}: resolves the default {@link Coder}
-  * for the described type, in a context where the given types use the given coders, and logs
-  * the result at debug level.
-  *
-  * @throws CannotProvideCoderException if a coder cannot be provided
-  */
- private <T> Coder<T> getDefaultCoder(
-     TypeDescriptor<T> typeDescriptor,
-     Map<Type, Coder<?>> typeCoderBindings)
-     throws CannotProvideCoderException {
-
-   Type describedType = typeDescriptor.getType();
-   Coder<?> untypedCoder = getDefaultCoder(describedType, typeCoderBindings);
-   LOG.debug("Default coder for {}: {}", typeDescriptor, untypedCoder);
-
-   @SuppressWarnings("unchecked")
-   Coder<T> typedCoder = (Coder<T>) untypedCoder;
-   return typedCoder;
- }
-
- /**
-  * Returns the {@link Coder} to use by default for values of the given type,
-  * in a context where the given types use the given coders.
-  *
-  * <p>A binding for {@code type} in {@code typeCoderBindings} takes precedence. Otherwise the
-  * lookup dispatches on the kind of {@link Type}: classes and parameterized types are resolved
-  * through the registry, while type variables and wildcards cannot be resolved and are
-  * reported with {@link ReasonCode#TYPE_ERASURE}.
-  *
-  * @throws CannotProvideCoderException if a coder cannot be provided
-  * @throws RuntimeException if {@code type} is of a kind not handled here
-  */
- private Coder<?> getDefaultCoder(Type type, Map<Type, Coder<?>> typeCoderBindings)
-     throws CannotProvideCoderException {
-   Coder<?> coder = typeCoderBindings.get(type);
-   if (coder != null) {
-     return coder;
-   }
-   if (type instanceof Class<?>) {
-     Class<?> clazz = (Class<?>) type;
-     return getDefaultCoder(clazz);
-   } else if (type instanceof ParameterizedType) {
-     return getDefaultCoder((ParameterizedType) type, typeCoderBindings);
-   } else if (type instanceof TypeVariable) {
-     // No default coder for an unknown generic type.
-     throw new CannotProvideCoderException(
-         String.format("Cannot provide a coder for type variable %s"
-             + " (declared by %s) because the actual type is unknown due to erasure.",
-             type,
-             ((TypeVariable<?>) type).getGenericDeclaration()),
-         ReasonCode.TYPE_ERASURE);
-   } else if (type instanceof WildcardType) {
-     // A wildcard is not a TypeVariable and has no generic declaration, so it needs its own
-     // message; casting it to TypeVariable would fail with a ClassCastException.
-     throw new CannotProvideCoderException(
-         String.format("Cannot provide a coder for wildcard type %s"
-             + " because the actual type is unknown due to erasure.",
-             type),
-         ReasonCode.TYPE_ERASURE);
-   } else {
-     throw new RuntimeException(
-         "Internal error: unexpected kind of Type: " + type);
-   }
- }
-
- /**
-  * Returns the {@link Coder} to use by default for values of the given
-  * parameterized type, in a context where the given types use the
-  * given {@link Coder Coders}.
-  *
-  * <p>The {@link CoderFactory} registered for the raw class is tried first, then the
-  * {@link DefaultCoder} annotation on the raw class. If both fail, the thrown exception's
-  * message lists each failure, and the individual exceptions are attached to it as suppressed
-  * exceptions so their stack traces are not lost.
-  *
-  * @throws CannotProvideCoderException if no coder can be provided
-  */
- private Coder<?> getDefaultCoder(
-     ParameterizedType type,
-     Map<Type, Coder<?>> typeCoderBindings)
-     throws CannotProvideCoderException {
-
-   CannotProvideCoderException factoryException;
-   try {
-     return getDefaultCoderFromFactory(type, typeCoderBindings);
-   } catch (CannotProvideCoderException exc) {
-     factoryException = exc;
-   }
-
-   CannotProvideCoderException annotationException;
-   try {
-     Class<?> rawClazz = (Class<?>) type.getRawType();
-     return getDefaultCoderFromAnnotation(rawClazz);
-   } catch (CannotProvideCoderException exc) {
-     annotationException = exc;
-   }
-
-   // Build up the error message and list of causes.
-   StringBuilder messageBuilder = new StringBuilder()
-       .append("Unable to provide a default Coder for ").append(type)
-       .append(". Correct one of the following root causes:");
-
-   messageBuilder
-       .append("\n Building a Coder using a registered CoderFactory failed: ")
-       .append(factoryException.getMessage());
-
-   messageBuilder
-       .append("\n Building a Coder from the @DefaultCoder annotation failed: ")
-       .append(annotationException.getMessage());
-
-   // Keep the underlying failures reachable from the aggregate exception.
-   CannotProvideCoderException aggregate =
-       new CannotProvideCoderException(messageBuilder.toString());
-   aggregate.addSuppressed(factoryException);
-   aggregate.addSuppressed(annotationException);
-   throw aggregate;
- }
-
- /**
-  * Builds a {@link Coder} for a parameterized type using the {@link CoderFactory} registered
-  * for its raw class, after resolving a coder for each actual type argument in order.
-  *
-  * @throws CannotProvideCoderException if no factory is registered for the raw class or a
-  *     coder cannot be provided for one of the type arguments
-  */
- private Coder<?> getDefaultCoderFromFactory(
-     ParameterizedType type,
-     Map<Type, Coder<?>> typeCoderBindings)
-     throws CannotProvideCoderException {
-   // Look the factory up first so a missing registration is reported before any argument work.
-   CoderFactory rawFactory = getDefaultCoderFactory((Class<?>) type.getRawType());
-
-   List<Coder<?>> argumentCoders = new ArrayList<>();
-   for (Type argument : type.getActualTypeArguments()) {
-     Coder<?> argumentCoder;
-     try {
-       argumentCoder = getDefaultCoder(argument, typeCoderBindings);
-     } catch (CannotProvideCoderException exc) {
-       throw new CannotProvideCoderException(
-           String.format("Cannot provide coder for parameterized type %s: %s",
-               type,
-               exc.getMessage()),
-           exc);
-     }
-     argumentCoders.add(argumentCoder);
-   }
-   return rawFactory.create(argumentCoders);
- }
-
- /**
-  * Returns an immutable {@code Map} binding the given type, and any types embedded in it, to
-  * the corresponding parts of the given {@link Coder}. A type variable or class is bound
-  * directly to {@code coder}; a parameterized type is handled by
-  * {@link #getTypeToCoderBindings(ParameterizedType, Coder)}; any other kind of type yields an
-  * empty map.
-  */
- private Map<Type, Coder<?>> getTypeToCoderBindings(Type type, Coder<?> coder) {
-   boolean bindsDirectly = type instanceof TypeVariable || type instanceof Class;
-   if (bindsDirectly) {
-     return ImmutableMap.<Type, Coder<?>>of(type, coder);
-   }
-   if (type instanceof ParameterizedType) {
-     return getTypeToCoderBindings((ParameterizedType) type, coder);
-   }
-   return ImmutableMap.of();
- }
-
- /**
-  * Returns an immutable {@code Map} binding the parameterized type itself to {@code coder} and,
-  * recursively, each of its type arguments to the corresponding coder argument. The map is
-  * empty when the coder reports no coder arguments or a different number of them than the type
-  * has type arguments.
-  *
-  * <p>This is the most elaborate case of {@link #getTypeToCoderBindings(Type, Coder)}, split
-  * out into its own method.
-  */
- private Map<Type, Coder<?>> getTypeToCoderBindings(ParameterizedType type, Coder<?> coder) {
-   Type[] typeArguments = type.getActualTypeArguments();
-   List<? extends Coder<?>> coderArguments = coder.getCoderArguments();
-
-   if (coderArguments == null || coderArguments.size() != typeArguments.length) {
-     return ImmutableMap.of();
-   }
-
-   // Accumulate in a mutable map first; later bindings for the same type overwrite earlier ones.
-   Map<Type, Coder<?>> bindings = Maps.newHashMap();
-   bindings.put(type, coder);
-   for (int pos = 0; pos < typeArguments.length; pos++) {
-     bindings.putAll(getTypeToCoderBindings(typeArguments[pos], coderArguments.get(pos)));
-   }
-   return ImmutableMap.copyOf(bindings);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CollectionCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CollectionCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CollectionCoder.java
deleted file mode 100644
index a028317..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CollectionCoder.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.common.base.Preconditions;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.util.Collection;
-import java.util.List;
-
-/**
- * A {@link CollectionCoder} encodes {@link Collection Collections} in the format
- * of {@link IterableLikeCoder}.
- */
-public class CollectionCoder<T> extends IterableLikeCoder<T, Collection<T>> {
-
- public static <T> CollectionCoder<T> of(Coder<T> elemCoder) {
- return new CollectionCoder<>(elemCoder);
- }
-
- /////////////////////////////////////////////////////////////////////////////
- // Internal operations below here.
-
- /**
- * {@inheritDoc}
- *
- * @return the decoded elements directly, since {@link List} is a subtype of
- * {@link Collection}.
- */
- @Override
- protected final Collection<T> decodeToIterable(List<T> decodedElements) {
- return decodedElements;
- }
-
- @JsonCreator
- public static CollectionCoder<?> of(
- @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
- List<Object> components) {
- Preconditions.checkArgument(components.size() == 1,
- "Expecting 1 component, got " + components.size());
- return of((Coder<?>) components.get(0));
- }
-
- /**
- * Returns the first element in this collection if it is non-empty,
- * otherwise returns {@code null}.
- */
- public static <T> List<Object> getInstanceComponents(
- Collection<T> exampleValue) {
- return getInstanceComponentsHelper(exampleValue);
- }
-
- protected CollectionCoder(Coder<T> elemCoder) {
- super(elemCoder, "Collection");
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CustomCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CustomCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CustomCoder.java
deleted file mode 100644
index b34ef8c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/CustomCoder.java
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import static com.google.cloud.dataflow.sdk.util.Structs.addString;
-import static com.google.cloud.dataflow.sdk.util.Structs.addStringList;
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.util.SerializableUtils;
-import com.google.cloud.dataflow.sdk.util.StringUtils;
-import com.google.common.collect.Lists;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.io.Serializable;
-import java.util.Collection;
-
-/**
- * An abstract base class for writing a {@link Coder} class that encodes itself via Java
- * serialization.
- *
- * <p>To complete an implementation, subclasses must implement {@link Coder#encode}
- * and {@link Coder#decode} methods. Anonymous subclasses must furthermore override
- * {@link #getEncodingId}.
- *
- * <p>Not to be confused with {@link SerializableCoder} that encodes objects that implement the
- * {@link Serializable} interface.
- *
- * @param <T> the type of elements handled by this coder
- */
-public abstract class CustomCoder<T> extends AtomicCoder<T>
- implements Serializable {
- @JsonCreator
- public static CustomCoder<?> of(
- // N.B. typeId is a required parameter here, since a field named "@type"
- // is presented to the deserializer as an input.
- //
- // If this method did not consume the field, Jackson2 would observe an
- // unconsumed field and a returned value of a derived type. So Jackson2
- // would attempt to update the returned value with the unconsumed field
- // data, The standard JsonDeserializer does not implement a mechanism for
- // updating constructed values, so it would throw an exception, causing
- // deserialization to fail.
- @JsonProperty(value = "@type", required = false) String typeId,
- @JsonProperty(value = "encoding_id", required = false) String encodingId,
- @JsonProperty("type") String type,
- @JsonProperty("serialized_coder") String serializedCoder) {
- return (CustomCoder<?>) SerializableUtils.deserializeFromByteArray(
- StringUtils.jsonStringToByteArray(serializedCoder),
- type);
- }
-
- /**
- * {@inheritDoc}
- *
- * @return A thin {@link CloudObject} wrapping of the Java serialization of {@code this}.
- */
- @Override
- public CloudObject asCloudObject() {
- // N.B. We use the CustomCoder class, not the derived class, since during
- // deserialization we will be using the CustomCoder's static factory method
- // to construct an instance of the derived class.
- CloudObject result = CloudObject.forClass(CustomCoder.class);
- addString(result, "type", getClass().getName());
- addString(result, "serialized_coder",
- StringUtils.byteArrayToJsonString(
- SerializableUtils.serializeToByteArray(this)));
-
- String encodingId = getEncodingId();
- checkNotNull(encodingId, "Coder.getEncodingId() must not return null.");
- if (!encodingId.isEmpty()) {
- addString(result, PropertyNames.ENCODING_ID, encodingId);
- }
-
- Collection<String> allowedEncodings = getAllowedEncodings();
- if (!allowedEncodings.isEmpty()) {
- addStringList(result, PropertyNames.ALLOWED_ENCODINGS, Lists.newArrayList(allowedEncodings));
- }
-
- return result;
- }
-
- /**
- * {@inheritDoc}
- *
- * @throws NonDeterministicException a {@link CustomCoder} is presumed
- * nondeterministic.
- */
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- throw new NonDeterministicException(this,
- "CustomCoder implementations must override verifyDeterministic,"
- + " or they are presumed nondeterministic.");
- }
-
- /**
- * {@inheritDoc}
- *
- * @return The canonical class name for this coder. For stable data formats that are independent
- * of class name, it is recommended to override this method.
- *
- * @throws UnsupportedOperationException when an anonymous class is used, since they do not have
- * a stable canonical class name.
- */
- @Override
- public String getEncodingId() {
- if (getClass().isAnonymousClass()) {
- throw new UnsupportedOperationException(
- String.format("Anonymous CustomCoder subclass %s must override getEncodingId()."
- + " Otherwise, convert to a named class and getEncodingId() will be automatically"
- + " generated from the fully qualified class name.",
- getClass()));
- }
- return getClass().getCanonicalName();
- }
-
- // This coder inherits isRegisterByteSizeObserverCheap,
- // getEncodedElementByteSize and registerByteSizeObserver
- // from StandardCoder. Override if we can do better.
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DefaultCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DefaultCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DefaultCoder.java
deleted file mode 100644
index 110579b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DefaultCoder.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import java.lang.annotation.Documented;
-import java.lang.annotation.ElementType;
-import java.lang.annotation.Retention;
-import java.lang.annotation.RetentionPolicy;
-import java.lang.annotation.Target;
-
-/**
- * The {@link DefaultCoder} annotation
- * specifies a default {@link Coder} class to handle encoding and decoding
- * instances of the annotated class.
- *
- * <p>The specified {@link Coder} must satisfy the requirements of
- * {@link CoderProviders#fromStaticMethods}. Two classes provided by the SDK that
- * are intended for use with this annotation include {@link SerializableCoder}
- * and {@link AvroCoder}.
- *
- * <p>To configure the use of Java serialization as the default
- * for a class, annotate the class to use
- * {@link SerializableCoder} as follows:
- *
- * <pre><code>{@literal @}DefaultCoder(SerializableCoder.class)
- * public class MyCustomDataType implements Serializable {
- * // ...
- * }</code></pre>
- *
- * <p>Similarly, to configure the use of
- * {@link AvroCoder} as the default:
- * <pre><code>{@literal @}DefaultCoder(AvroCoder.class)
- * public class MyCustomDataType {
- * public MyCustomDataType() {} // Avro requires an empty constructor.
- * // ...
- * }</code></pre>
- *
- * <p>Coders specified explicitly via
- * {@link PCollection#setCoder}
- * take precedence, followed by Coders registered at runtime via
- * {@link CoderRegistry#registerCoder}. See {@link CoderRegistry} for a more detailed discussion
- * of the precedence rules.
- */
-@Documented
-@Retention(RetentionPolicy.RUNTIME)
-@Target(ElementType.TYPE)
-@SuppressWarnings("rawtypes")
-public @interface DefaultCoder {
- Class<? extends Coder> value();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DelegateCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DelegateCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DelegateCoder.java
deleted file mode 100644
index cdd882b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DelegateCoder.java
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.common.collect.Lists;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.Serializable;
-import java.util.Collection;
-import java.util.List;
-
-/**
- * A {@code DelegateCoder<T, IntermediateT>} wraps a {@link Coder} for {@code IntermediateT} and
- * encodes/decodes values of type {@code T} by converting
- * to/from {@code IntermediateT} and then encoding/decoding using the underlying
- * {@code Coder<IntermediateT>}.
- *
- * <p>The conversions from {@code T} to {@code IntermediateT} and vice versa
- * must be supplied as {@link CodingFunction}, a serializable
- * function that may throw any {@code Exception}. If a thrown
- * exception is an instance of {@link CoderException} or
- * {@link IOException}, it will be re-thrown, otherwise it will be wrapped as
- * a {@link CoderException}.
- *
- * @param <T> The type of objects coded by this Coder.
- * @param <IntermediateT> The type of objects a {@code T} will be converted to for coding.
- */
-public class DelegateCoder<T, IntermediateT> extends CustomCoder<T> {
- /**
- * A {@link DelegateCoder.CodingFunction CodingFunction<InputT, OutputT>} is a serializable
- * function from {@code InputT} to {@code OutputT} that may throw any {@link Exception}.
- */
- public static interface CodingFunction<InputT, OutputT> extends Serializable {
- public abstract OutputT apply(InputT input) throws Exception;
- }
-
- public static <T, IntermediateT> DelegateCoder<T, IntermediateT> of(Coder<IntermediateT> coder,
- CodingFunction<T, IntermediateT> toFn,
- CodingFunction<IntermediateT, T> fromFn) {
- return new DelegateCoder<T, IntermediateT>(coder, toFn, fromFn);
- }
-
- @Override
- public void encode(T value, OutputStream outStream, Context context)
- throws CoderException, IOException {
- coder.encode(applyAndWrapExceptions(toFn, value), outStream, context);
- }
-
- @Override
- public T decode(InputStream inStream, Context context) throws CoderException, IOException {
- return applyAndWrapExceptions(fromFn, coder.decode(inStream, context));
- }
-
- /**
- * Returns the coder used to encode/decode the intermediate values produced/consumed by the
- * coding functions of this {@code DelegateCoder}.
- */
- public Coder<IntermediateT> getCoder() {
- return coder;
- }
-
- /**
- * {@inheritDoc}
- *
- * @throws NonDeterministicException when the underlying coder's {@code verifyDeterministic()}
- * throws a {@link Coder.NonDeterministicException}. For this to be safe, the
- * intermediate {@code CodingFunction<T, IntermediateT>} must also be deterministic.
- */
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- coder.verifyDeterministic();
- }
-
- /**
- * {@inheritDoc}
- *
- * @return a structural for a value of type {@code T} obtained by first converting to
- * {@code IntermediateT} and then obtaining a structural value according to the underlying
- * coder.
- */
- @Override
- public Object structuralValue(T value) throws Exception {
- return coder.structuralValue(toFn.apply(value));
- }
-
- @Override
- public String toString() {
- return "DelegateCoder(" + coder + ")";
- }
-
- /**
- * {@inheritDoc}
- *
- * @return a {@link String} composed from the underlying coder class name and its encoding id.
- * Note that this omits any description of the coding functions. These should be modified
- * with care.
- */
- @Override
- public String getEncodingId() {
- return delegateEncodingId(coder.getClass(), coder.getEncodingId());
- }
-
- /**
- * {@inheritDoc}
- *
- * @return allowed encodings which are composed from the underlying coder class and its allowed
- * encoding ids. Note that this omits any description of the coding functions. These
- * should be modified with care.
- */
- @Override
- public Collection<String> getAllowedEncodings() {
- List<String> allowedEncodings = Lists.newArrayList();
- for (String allowedEncoding : coder.getAllowedEncodings()) {
- allowedEncodings.add(delegateEncodingId(coder.getClass(), allowedEncoding));
- }
- return allowedEncodings;
- }
-
- private String delegateEncodingId(Class<?> delegateClass, String encodingId) {
- return String.format("%s:%s", delegateClass.getName(), encodingId);
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private <InputT, OutputT> OutputT applyAndWrapExceptions(
- CodingFunction<InputT, OutputT> fn,
- InputT input) throws CoderException, IOException {
- try {
- return fn.apply(input);
- } catch (IOException exc) {
- throw exc;
- } catch (Exception exc) {
- throw new CoderException(exc);
- }
- }
-
- private final Coder<IntermediateT> coder;
- private final CodingFunction<T, IntermediateT> toFn;
- private final CodingFunction<IntermediateT, T> fromFn;
-
- protected DelegateCoder(Coder<IntermediateT> coder,
- CodingFunction<T, IntermediateT> toFn,
- CodingFunction<IntermediateT, T> fromFn) {
- this.coder = coder;
- this.fromFn = fromFn;
- this.toFn = toFn;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DeterministicStandardCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DeterministicStandardCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DeterministicStandardCoder.java
deleted file mode 100644
index 0e0018a..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DeterministicStandardCoder.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-/**
- * A {@link DeterministicStandardCoder} is a {@link StandardCoder} that is
- * deterministic, in the sense that for objects considered equal
- * according to {@link Object#equals(Object)}, the encoded bytes are
- * also equal.
- *
- * @param <T> the type of the values being transcoded
- */
-public abstract class DeterministicStandardCoder<T> extends StandardCoder<T> {
- protected DeterministicStandardCoder() {}
-
- /**
- * {@inheritDoc}
- *
- * @throws NonDeterministicException never, unless overridden. A
- * {@link DeterministicStandardCoder} is presumed deterministic.
- */
- @Override
- public void verifyDeterministic() throws NonDeterministicException { }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DoubleCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DoubleCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DoubleCoder.java
deleted file mode 100644
index 68d58df..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DoubleCoder.java
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.UTFDataFormatException;
-
-/**
- * A {@link DoubleCoder} encodes {@link Double} values in 8 bytes using Java serialization.
- */
-public class DoubleCoder extends AtomicCoder<Double> {
-
- @JsonCreator
- public static DoubleCoder of() {
- return INSTANCE;
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private static final DoubleCoder INSTANCE = new DoubleCoder();
-
- private DoubleCoder() {}
-
- @Override
- public void encode(Double value, OutputStream outStream, Context context)
- throws IOException, CoderException {
- if (value == null) {
- throw new CoderException("cannot encode a null Double");
- }
- new DataOutputStream(outStream).writeDouble(value);
- }
-
- @Override
- public Double decode(InputStream inStream, Context context)
- throws IOException, CoderException {
- try {
- return new DataInputStream(inStream).readDouble();
- } catch (EOFException | UTFDataFormatException exn) {
- // These exceptions correspond to decoding problems, so change
- // what kind of exception they're branded as.
- throw new CoderException(exn);
- }
- }
-
- /**
- * {@inheritDoc}
- *
- * @throws NonDeterministicException always.
- * Floating-point operations are not guaranteed to be deterministic, even
- * if the storage format might be, so floating point representations are not
- * recommended for use in operations that require deterministic inputs.
- */
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- throw new NonDeterministicException(this,
- "Floating point encodings are not guaranteed to be deterministic.");
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. This coder is injective.
- */
- @Override
- public boolean consistentWithEquals() {
- return true;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. {@link DoubleCoder#getEncodedElementByteSize} returns a constant.
- */
- @Override
- public boolean isRegisterByteSizeObserverCheap(Double value, Context context) {
- return true;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code 8}, the byte size of a {@link Double} encoded using Java serialization.
- */
- @Override
- protected long getEncodedElementByteSize(Double value, Context context)
- throws Exception {
- if (value == null) {
- throw new CoderException("cannot encode a null Double");
- }
- return 8;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DurationCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DurationCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DurationCoder.java
deleted file mode 100644
index 25527f0..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/DurationCoder.java
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObserver;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-
-import org.joda.time.Duration;
-import org.joda.time.ReadableDuration;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- * A {@link Coder} that encodes a joda {@link Duration} as a {@link Long} using the format of
- * {@link VarLongCoder}.
- */
-public class DurationCoder extends AtomicCoder<ReadableDuration> {
-
- @JsonCreator
- public static DurationCoder of() {
- return INSTANCE;
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private static final DurationCoder INSTANCE = new DurationCoder();
-
- private final VarLongCoder longCoder = VarLongCoder.of();
-
- private DurationCoder() {}
-
- private Long toLong(ReadableDuration value) {
- return value.getMillis();
- }
-
- private ReadableDuration fromLong(Long decoded) {
- return Duration.millis(decoded);
- }
-
- @Override
- public void encode(ReadableDuration value, OutputStream outStream, Context context)
- throws CoderException, IOException {
- if (value == null) {
- throw new CoderException("cannot encode a null ReadableDuration");
- }
- longCoder.encode(toLong(value), outStream, context);
- }
-
- @Override
- public ReadableDuration decode(InputStream inStream, Context context)
- throws CoderException, IOException {
- return fromLong(longCoder.decode(inStream, context));
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. This coder is injective.
- */
- @Override
- public boolean consistentWithEquals() {
- return true;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}, because it is cheap to ascertain the byte size of a long.
- */
- @Override
- public boolean isRegisterByteSizeObserverCheap(ReadableDuration value, Context context) {
- return longCoder.isRegisterByteSizeObserverCheap(toLong(value), context);
- }
-
- @Override
- public void registerByteSizeObserver(
- ReadableDuration value, ElementByteSizeObserver observer, Context context) throws Exception {
- longCoder.registerByteSizeObserver(toLong(value), observer, context);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/EntityCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/EntityCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/EntityCoder.java
deleted file mode 100644
index 3ae857f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/EntityCoder.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.api.services.datastore.DatastoreV1.Entity;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- * A {@link Coder} for {@link Entity} objects based on their encoded Protocol Buffer form.
- */
-public class EntityCoder extends AtomicCoder<Entity> {
-
- @JsonCreator
- public static EntityCoder of() {
- return INSTANCE;
- }
-
- /***************************/
-
- private static final EntityCoder INSTANCE = new EntityCoder();
-
- private EntityCoder() {}
-
- @Override
- public void encode(Entity value, OutputStream outStream, Context context)
- throws IOException, CoderException {
- if (value == null) {
- throw new CoderException("cannot encode a null Entity");
- }
-
- // Since Entity implements com.google.protobuf.MessageLite,
- // we could directly use writeTo to write to a OutputStream object
- outStream.write(java.nio.ByteBuffer.allocate(4).putInt(value.getSerializedSize()).array());
- value.writeTo(outStream);
- outStream.flush();
- }
-
- @Override
- public Entity decode(InputStream inStream, Context context)
- throws IOException {
- byte[] entitySize = new byte[4];
- inStream.read(entitySize, 0, 4);
- int size = java.nio.ByteBuffer.wrap(entitySize).getInt();
- byte[] data = new byte[size];
- inStream.read(data, 0, size);
- return Entity.parseFrom(data);
- }
-
- @Override
- protected long getEncodedElementByteSize(Entity value, Context context)
- throws Exception {
- return value.getSerializedSize();
- }
-
- /**
- * {@inheritDoc}
- *
- * @throws NonDeterministicException always.
- * A datastore kind can hold arbitrary {@link Object} instances, which
- * makes the encoding non-deterministic.
- */
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- throw new NonDeterministicException(this,
- "Datastore encodings can hold arbitrary Object instances");
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/InstantCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/InstantCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/InstantCoder.java
deleted file mode 100644
index 99b58ce..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/InstantCoder.java
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObserver;
-import com.google.common.base.Converter;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-
-import org.joda.time.Instant;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- * A {@link Coder} for joda {@link Instant} that encodes it as a big endian {@link Long}
- * shifted such that lexicographic ordering of the bytes corresponds to chronological order.
- */
-public class InstantCoder extends AtomicCoder<Instant> {
-
- @JsonCreator
- public static InstantCoder of() {
- return INSTANCE;
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private static final InstantCoder INSTANCE = new InstantCoder();
-
- private final BigEndianLongCoder longCoder = BigEndianLongCoder.of();
-
- private InstantCoder() {}
-
- /**
- * Converts {@link Instant} to a {@code Long} representing its millis-since-epoch,
- * but shifted so that the byte representation of negative values are lexicographically
- * ordered before the byte representation of positive values.
- *
- * <p>This deliberately utilizes the well-defined overflow for {@code Long} values.
- * See http://docs.oracle.com/javase/specs/jls/se7/html/jls-15.html#jls-15.18.2
- */
- private static final Converter<Instant, Long> ORDER_PRESERVING_CONVERTER =
- new Converter<Instant, Long>() {
-
- @Override
- protected Long doForward(Instant instant) {
- return instant.getMillis() - Long.MIN_VALUE;
- }
-
- @Override
- protected Instant doBackward(Long shiftedMillis) {
- return new Instant(shiftedMillis + Long.MIN_VALUE);
- }
- };
-
- @Override
- public void encode(Instant value, OutputStream outStream, Context context)
- throws CoderException, IOException {
- if (value == null) {
- throw new CoderException("cannot encode a null Instant");
- }
- longCoder.encode(ORDER_PRESERVING_CONVERTER.convert(value), outStream, context);
- }
-
- @Override
- public Instant decode(InputStream inStream, Context context)
- throws CoderException, IOException {
- return ORDER_PRESERVING_CONVERTER.reverse().convert(longCoder.decode(inStream, context));
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. This coder is injective.
- */
- @Override
- public boolean consistentWithEquals() {
- return true;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. The byte size for a big endian long is a constant.
- */
- @Override
- public boolean isRegisterByteSizeObserverCheap(Instant value, Context context) {
- return longCoder.isRegisterByteSizeObserverCheap(
- ORDER_PRESERVING_CONVERTER.convert(value), context);
- }
-
- @Override
- public void registerByteSizeObserver(
- Instant value, ElementByteSizeObserver observer, Context context) throws Exception {
- longCoder.registerByteSizeObserver(
- ORDER_PRESERVING_CONVERTER.convert(value), observer, context);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/IterableCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/IterableCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/IterableCoder.java
deleted file mode 100644
index 70dcd84..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/IterableCoder.java
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import static com.google.cloud.dataflow.sdk.util.Structs.addBoolean;
-
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.common.base.Preconditions;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.util.List;
-
-/**
- * An {@link IterableCoder} encodes any {@link Iterable} in the format
- * of {@link IterableLikeCoder}.
- *
- * @param <T> the type of the elements of the iterables being transcoded
- */
-public class IterableCoder<T> extends IterableLikeCoder<T, Iterable<T>> {
-
- public static <T> IterableCoder<T> of(Coder<T> elemCoder) {
- return new IterableCoder<>(elemCoder);
- }
-
- /////////////////////////////////////////////////////////////////////////////
- // Internal operations below here.
-
- @Override
- protected final Iterable<T> decodeToIterable(List<T> decodedElements) {
- return decodedElements;
- }
-
- @JsonCreator
- public static IterableCoder<?> of(
- @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
- List<Coder<?>> components) {
- Preconditions.checkArgument(components.size() == 1,
- "Expecting 1 component, got " + components.size());
- return of(components.get(0));
- }
-
- /**
- * Returns a single-element list holding the first element of {@code exampleValue} if it is
- * non-empty, otherwise returns {@code null}.
- */
- public static <T> List<Object> getInstanceComponents(
- Iterable<T> exampleValue) {
- return getInstanceComponentsHelper(exampleValue);
- }
-
- protected IterableCoder(Coder<T> elemCoder) {
- super(elemCoder, "Iterable");
- }
-
- @Override
- public CloudObject asCloudObject() {
- CloudObject result = super.asCloudObject();
- addBoolean(result, PropertyNames.IS_STREAM_LIKE, true);
- return result;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/IterableLikeCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/IterableLikeCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/IterableLikeCoder.java
deleted file mode 100644
index 7fb573a..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/IterableLikeCoder.java
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.util.BufferedElementCountingOutputStream;
-import com.google.cloud.dataflow.sdk.util.VarInt;
-import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObservableIterable;
-import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObserver;
-import com.google.common.base.Preconditions;
-
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-import java.util.Observable;
-import java.util.Observer;
-
-/**
- * An abstract base class with functionality for assembling a
- * {@link Coder} for a class that implements {@code Iterable}.
- *
- * <p>To complete a subclass, implement the {@link #decodeToIterable} method. This superclass
- * will decode the elements in the input stream into a {@link List} and then pass them to that
- * method to be converted into the appropriate iterable type. Note that this means the input
- * iterables must fit into memory.
- *
- * <p>The format of this coder is as follows:
- *
- * <ul>
- * <li>If the input {@link Iterable} is a {@link Collection}, its size is written to the
- * output stream as a four-byte big endian integer, followed by all of the encoded elements.</li>
- * <li>Otherwise, {@code -1} is written as a four-byte big endian integer, followed by blocks
- * of encoded elements. Each block is preceded by its element count encoded as a variable-length
- * integer, and a count of zero terminates the sequence.</li>
- * </ul>
- *
- * @param <T> the type of the elements of the {@code Iterable}s being transcoded
- * @param <IterableT> the type of the Iterables being transcoded
- */
-public abstract class IterableLikeCoder<T, IterableT extends Iterable<T>>
- extends StandardCoder<IterableT> {
- public Coder<T> getElemCoder() {
- return elementCoder;
- }
-
- /**
- * Builds an instance of {@code IterableT}, this coder's associated {@link Iterable}-like
- * subtype, from a list of decoded elements.
- */
- protected abstract IterableT decodeToIterable(List<T> decodedElements);
-
- /////////////////////////////////////////////////////////////////////////////
- // Internal operations below here.
-
- private final Coder<T> elementCoder;
- private final String iterableName;
-
- /**
- * Returns a single-element list holding the first element of the iterable-like
- * {@code exampleValue} if it is non-empty, otherwise returns {@code null}.
- */
- protected static <T, IterableT extends Iterable<T>>
- List<Object> getInstanceComponentsHelper(IterableT exampleValue) {
- for (T value : exampleValue) {
- return Arrays.<Object>asList(value);
- }
- return null;
- }
-
- protected IterableLikeCoder(Coder<T> elementCoder, String iterableName) {
- Preconditions.checkArgument(elementCoder != null,
- "element Coder for IterableLikeCoder must not be null");
- Preconditions.checkArgument(iterableName != null,
- "iterable name for IterableLikeCoder must not be null");
- this.elementCoder = elementCoder;
- this.iterableName = iterableName;
- }
-
- @Override
- public void encode(
- IterableT iterable, OutputStream outStream, Context context)
- throws IOException, CoderException {
- if (iterable == null) {
- throw new CoderException("cannot encode a null " + iterableName);
- }
- Context nestedContext = context.nested();
- DataOutputStream dataOutStream = new DataOutputStream(outStream);
- if (iterable instanceof Collection) {
- // We can know the size of the Iterable. Use an encoding with a
- // leading size field, followed by that many elements.
- Collection<T> collection = (Collection<T>) iterable;
- dataOutStream.writeInt(collection.size());
- for (T elem : collection) {
- elementCoder.encode(elem, dataOutStream, nestedContext);
- }
- } else {
- // We don't know the size without traversing it so use a fixed size buffer
- // and encode as many elements as possible into it before outputting the size followed
- // by the elements.
- dataOutStream.writeInt(-1);
- BufferedElementCountingOutputStream countingOutputStream =
- new BufferedElementCountingOutputStream(dataOutStream);
- for (T elem : iterable) {
- countingOutputStream.markElementStart();
- elementCoder.encode(elem, countingOutputStream, nestedContext);
- }
- countingOutputStream.finish();
- }
- // Make sure all our output gets pushed to the underlying outStream.
- dataOutStream.flush();
- }
-
- @Override
- public IterableT decode(InputStream inStream, Context context)
- throws IOException, CoderException {
- Context nestedContext = context.nested();
- DataInputStream dataInStream = new DataInputStream(inStream);
- int size = dataInStream.readInt();
- if (size >= 0) {
- List<T> elements = new ArrayList<>(size);
- for (int i = 0; i < size; i++) {
- elements.add(elementCoder.decode(dataInStream, nestedContext));
- }
- return decodeToIterable(elements);
- } else {
- List<T> elements = new ArrayList<>();
- long count;
- // We don't know the size a priori. Check if we're done with
- // each block of elements.
- while ((count = VarInt.decodeLong(dataInStream)) > 0) {
- while (count > 0) {
- elements.add(elementCoder.decode(dataInStream, nestedContext));
- count -= 1;
- }
- }
- return decodeToIterable(elements);
- }
- }
-
- @Override
- public List<? extends Coder<?>> getCoderArguments() {
- return Arrays.asList(elementCoder);
- }
-
- /**
- * {@inheritDoc}
- *
- * @throws NonDeterministicException always.
- * Encoding is not deterministic for the general {@link Iterable} case, as it depends
- * upon the type of iterable. This may allow two objects to compare as equal
- * while the encoding differs.
- */
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- throw new NonDeterministicException(this,
- "IterableLikeCoder can not guarantee deterministic ordering.");
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true} if the iterable is of a known class that supports lazy counting
- * of byte size, since that requires minimal extra computation.
- */
- @Override
- public boolean isRegisterByteSizeObserverCheap(
- IterableT iterable, Context context) {
- return iterable instanceof ElementByteSizeObservableIterable;
- }
-
- @Override
- public void registerByteSizeObserver(
- IterableT iterable, ElementByteSizeObserver observer, Context context)
- throws Exception {
- if (iterable == null) {
- throw new CoderException("cannot encode a null Iterable");
- }
- Context nestedContext = context.nested();
-
- if (iterable instanceof ElementByteSizeObservableIterable) {
- observer.setLazy();
- ElementByteSizeObservableIterable<?, ?> observableIterable =
- (ElementByteSizeObservableIterable<?, ?>) iterable;
- observableIterable.addObserver(
- new IteratorObserver(observer, iterable instanceof Collection));
- } else {
- if (iterable instanceof Collection) {
- // We can know the size of the Iterable. Use an encoding with a
- // leading size field, followed by that many elements.
- Collection<T> collection = (Collection<T>) iterable;
- observer.update(4L);
- for (T elem : collection) {
- elementCoder.registerByteSizeObserver(elem, observer, nestedContext);
- }
- } else {
- // TODO: Update to use an accurate count depending on size and count, currently we
- // are underestimating the size by up to 10 bytes per block of data since we are
- // not encoding the count prefix which occurs at most once per 64k of data and is up to
- // 10 bytes long. Since we include the total count we can upper bound the underestimate
- // to be 10 / 65536 ~= 0.0153% of the actual size.
- observer.update(4L);
- long count = 0;
- for (T elem : iterable) {
- count += 1;
- elementCoder.registerByteSizeObserver(elem, observer, nestedContext);
- }
- if (count > 0) {
- // Update the length based upon the number of counted elements, this helps
- // eliminate the case where all the elements are encoded in the first block and
- // it is quite short (e.g. Long.MAX_VALUE nulls encoded with VoidCoder).
- observer.update(VarInt.getLength(count));
- }
- // Update with the terminator byte.
- observer.update(1L);
- }
- }
- }
-
- /**
- * An observer that gets notified when an observable iterator
- * returns a new value. This observer just notifies an outerObserver
- * about this event. Additionally, the outerObserver is notified
- * about additional separators that are transparently added by this
- * coder.
- */
- private class IteratorObserver implements Observer {
- private final ElementByteSizeObserver outerObserver;
- private final boolean countable;
-
- public IteratorObserver(ElementByteSizeObserver outerObserver,
- boolean countable) {
- this.outerObserver = outerObserver;
- this.countable = countable;
-
- if (countable) {
- // Additional 4 bytes are due to size.
- outerObserver.update(4L);
- } else {
- // Additional 5 bytes are due to size = -1 (4 bytes) and
- // hasNext = false (1 byte).
- outerObserver.update(5L);
- }
- }
-
- @Override
- public void update(Observable obs, Object obj) {
- if (!(obj instanceof Long)) {
- throw new AssertionError("unexpected parameter object");
- }
-
- if (countable) {
- outerObserver.update(obs, obj);
- } else {
- // Additional 1 byte is due to hasNext = true flag.
- outerObserver.update(obs, 1 + (long) obj);
- }
- }
- }
-}
[24/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/CombineFnBase.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/CombineFnBase.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/CombineFnBase.java
deleted file mode 100644
index a0b06cf..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/CombineFnBase.java
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.transforms.Combine.KeyedCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.CombineFnWithContext;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.KeyedCombineFnWithContext;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-import com.google.common.collect.ImmutableMap;
-
-import java.io.Serializable;
-import java.lang.reflect.Type;
-import java.lang.reflect.TypeVariable;
-
-/**
- * This class contains the shared interfaces and abstract classes for different types of combine
- * functions.
- *
- * <p>Users should not implement or extend them directly.
- */
-public class CombineFnBase {
- /**
- * A {@code GlobalCombineFn<InputT, AccumT, OutputT>} specifies how to combine a
- * collection of input values of type {@code InputT} into a single
- * output value of type {@code OutputT}. It does this via one or more
- * intermediate mutable accumulator values of type {@code AccumT}.
- *
- * <p>Do not implement this interface directly.
- * Extend {@link CombineFn} or {@link CombineFnWithContext} instead.
- *
- * @param <InputT> type of input values
- * @param <AccumT> type of mutable accumulator values
- * @param <OutputT> type of output values
- */
- public interface GlobalCombineFn<InputT, AccumT, OutputT> extends Serializable {
-
- /**
- * Returns the {@code Coder} to use for accumulator {@code AccumT}
- * values, or null if it is not able to be inferred.
- *
- * <p>By default, uses the knowledge of the {@code Coder} being used
- * for {@code InputT} values and the enclosing {@code Pipeline}'s
- * {@code CoderRegistry} to try to infer the Coder for {@code AccumT}
- * values.
- *
- * <p>This is the Coder used to send data through a communication-intensive
- * shuffle step, so a compact and efficient representation may have
- * significant performance benefits.
- */
- public Coder<AccumT> getAccumulatorCoder(CoderRegistry registry, Coder<InputT> inputCoder)
- throws CannotProvideCoderException;
-
- /**
- * Returns the {@code Coder} to use by default for output
- * {@code OutputT} values, or null if it is not able to be inferred.
- *
- * <p>By default, uses the knowledge of the {@code Coder} being
- * used for input {@code InputT} values and the enclosing
- * {@code Pipeline}'s {@code CoderRegistry} to try to infer the
- * Coder for {@code OutputT} values.
- */
- public Coder<OutputT> getDefaultOutputCoder(CoderRegistry registry, Coder<InputT> inputCoder)
- throws CannotProvideCoderException;
-
- /**
- * Returns the error message for not supported default values in Combine.globally().
- */
- public String getIncompatibleGlobalWindowErrorMessage();
-
- /**
- * Returns the default value when there are no values added to the accumulator.
- */
- public OutputT defaultValue();
-
- /**
- * Converts this {@code GlobalCombineFn} into an equivalent
- * {@link PerKeyCombineFn} that ignores the keys passed to it and
- * combines the values according to this {@code GlobalCombineFn}.
- *
- * @param <K> the type of the (ignored) keys
- */
- public <K> PerKeyCombineFn<K, InputT, AccumT, OutputT> asKeyedFn();
- }
-
- /**
- * A {@code PerKeyCombineFn<K, InputT, AccumT, OutputT>} specifies how to combine
- * a collection of input values of type {@code InputT}, associated with
- * a key of type {@code K}, into a single output value of type
- * {@code OutputT}. It does this via one or more intermediate mutable
- * accumulator values of type {@code AccumT}.
- *
- * <p>Do not implement this interface directly.
- * Extend {@link KeyedCombineFn} or {@link KeyedCombineFnWithContext} instead.
- *
- * @param <K> type of keys
- * @param <InputT> type of input values
- * @param <AccumT> type of mutable accumulator values
- * @param <OutputT> type of output values
- */
- public interface PerKeyCombineFn<K, InputT, AccumT, OutputT> extends Serializable {
- /**
- * Returns the {@code Coder} to use for accumulator {@code AccumT}
- * values, or null if it is not able to be inferred.
- *
- * <p>By default, uses the knowledge of the {@code Coder} being
- * used for {@code K} keys and input {@code InputT} values and the
- * enclosing {@code Pipeline}'s {@code CoderRegistry} to try to
- * infer the Coder for {@code AccumT} values.
- *
- * <p>This is the Coder used to send data through a communication-intensive
- * shuffle step, so a compact and efficient representation may have
- * significant performance benefits.
- */
- public Coder<AccumT> getAccumulatorCoder(CoderRegistry registry, Coder<K> keyCoder,
- Coder<InputT> inputCoder) throws CannotProvideCoderException;
-
- /**
- * Returns the {@code Coder} to use by default for output
- * {@code OutputT} values, or null if it is not able to be inferred.
- *
- * <p>By default, uses the knowledge of the {@code Coder} being
- * used for {@code K} keys and input {@code InputT} values and the
- * enclosing {@code Pipeline}'s {@code CoderRegistry} to try to
- * infer the Coder for {@code OutputT} values.
- */
- public Coder<OutputT> getDefaultOutputCoder(CoderRegistry registry, Coder<K> keyCoder,
- Coder<InputT> inputCoder) throws CannotProvideCoderException;
-
- /**
- * Returns a regular {@link GlobalCombineFn} that operates on a specific key.
- */
- public abstract GlobalCombineFn<InputT, AccumT, OutputT> forKey(
- final K key, final Coder<K> keyCoder);
- }
-
- /**
- * An abstract {@link GlobalCombineFn} base class shared by
- * {@link CombineFn} and {@link CombineFnWithContext}.
- *
- * <p>Do not extend this class directly.
- * Extend {@link CombineFn} or {@link CombineFnWithContext} instead.
- *
- * @param <InputT> type of input values
- * @param <AccumT> type of mutable accumulator values
- * @param <OutputT> type of output values
- */
- abstract static class AbstractGlobalCombineFn<InputT, AccumT, OutputT>
- implements GlobalCombineFn<InputT, AccumT, OutputT>, Serializable {
- private static final String INCOMPATIBLE_GLOBAL_WINDOW_ERROR_MESSAGE =
- "Default values are not supported in Combine.globally() if the output "
- + "PCollection is not windowed by GlobalWindows. Instead, use "
- + "Combine.globally().withoutDefaults() to output an empty PCollection if the input "
- + "PCollection is empty, or Combine.globally().asSingletonView() to get the default "
- + "output of the CombineFn if the input PCollection is empty.";
-
- @Override
- public Coder<AccumT> getAccumulatorCoder(CoderRegistry registry, Coder<InputT> inputCoder)
- throws CannotProvideCoderException {
- return registry.getDefaultCoder(getClass(), AbstractGlobalCombineFn.class,
- ImmutableMap.<Type, Coder<?>>of(getInputTVariable(), inputCoder), getAccumTVariable());
- }
-
- @Override
- public Coder<OutputT> getDefaultOutputCoder(CoderRegistry registry, Coder<InputT> inputCoder)
- throws CannotProvideCoderException {
- return registry.getDefaultCoder(getClass(), AbstractGlobalCombineFn.class,
- ImmutableMap.<Type, Coder<?>>of(getInputTVariable(), inputCoder, getAccumTVariable(),
- this.getAccumulatorCoder(registry, inputCoder)),
- getOutputTVariable());
- }
-
- @Override
- public String getIncompatibleGlobalWindowErrorMessage() {
- return INCOMPATIBLE_GLOBAL_WINDOW_ERROR_MESSAGE;
- }
-
- /**
- * Returns the {@link TypeVariable} of {@code InputT}.
- */
- public TypeVariable<?> getInputTVariable() {
- return (TypeVariable<?>)
- new TypeDescriptor<InputT>(AbstractGlobalCombineFn.class) {}.getType();
- }
-
- /**
- * Returns the {@link TypeVariable} of {@code AccumT}.
- */
- public TypeVariable<?> getAccumTVariable() {
- return (TypeVariable<?>)
- new TypeDescriptor<AccumT>(AbstractGlobalCombineFn.class) {}.getType();
- }
-
- /**
- * Returns the {@link TypeVariable} of {@code OutputT}.
- */
- public TypeVariable<?> getOutputTVariable() {
- return (TypeVariable<?>)
- new TypeDescriptor<OutputT>(AbstractGlobalCombineFn.class) {}.getType();
- }
- }
-
- /**
- * An abstract {@link PerKeyCombineFn} base class shared by
- * {@link KeyedCombineFn} and {@link KeyedCombineFnWithContext}.
- *
- * <p>Do not extend this class directly.
- * Extend {@link KeyedCombineFn} or {@link KeyedCombineFnWithContext} instead.
- *
- * @param <K> type of keys
- * @param <InputT> type of input values
- * @param <AccumT> type of mutable accumulator values
- * @param <OutputT> type of output values
- */
- abstract static class AbstractPerKeyCombineFn<K, InputT, AccumT, OutputT>
- implements PerKeyCombineFn<K, InputT, AccumT, OutputT> {
- @Override
- public Coder<AccumT> getAccumulatorCoder(CoderRegistry registry, Coder<K> keyCoder,
- Coder<InputT> inputCoder) throws CannotProvideCoderException {
- return registry.getDefaultCoder(getClass(), AbstractPerKeyCombineFn.class,
- ImmutableMap.<Type, Coder<?>>of(
- getKTypeVariable(), keyCoder, getInputTVariable(), inputCoder),
- getAccumTVariable());
- }
-
- @Override
- public Coder<OutputT> getDefaultOutputCoder(CoderRegistry registry, Coder<K> keyCoder,
- Coder<InputT> inputCoder) throws CannotProvideCoderException {
- return registry.getDefaultCoder(getClass(), AbstractPerKeyCombineFn.class,
- ImmutableMap.<Type, Coder<?>>of(getKTypeVariable(), keyCoder, getInputTVariable(),
- inputCoder, getAccumTVariable(),
- this.getAccumulatorCoder(registry, keyCoder, inputCoder)),
- getOutputTVariable());
- }
-
- /**
- * Returns the {@link TypeVariable} of {@code K}.
- */
- public TypeVariable<?> getKTypeVariable() {
- return (TypeVariable<?>) new TypeDescriptor<K>(AbstractPerKeyCombineFn.class) {}.getType();
- }
-
- /**
- * Returns the {@link TypeVariable} of {@code InputT}.
- */
- public TypeVariable<?> getInputTVariable() {
- return (TypeVariable<?>)
- new TypeDescriptor<InputT>(AbstractPerKeyCombineFn.class) {}.getType();
- }
-
- /**
- * Returns the {@link TypeVariable} of {@code AccumT}.
- */
- public TypeVariable<?> getAccumTVariable() {
- return (TypeVariable<?>)
- new TypeDescriptor<AccumT>(AbstractPerKeyCombineFn.class) {}.getType();
- }
-
- /**
- * Returns the {@link TypeVariable} of {@code OutputT}.
- */
- public TypeVariable<?> getOutputTVariable() {
- return (TypeVariable<?>)
- new TypeDescriptor<OutputT>(AbstractPerKeyCombineFn.class) {}.getType();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/CombineFns.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/CombineFns.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/CombineFns.java
deleted file mode 100644
index 656c010..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/CombineFns.java
+++ /dev/null
@@ -1,1100 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.transforms;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.coders.StandardCoder;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.transforms.Combine.KeyedCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineFnBase.GlobalCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineFnBase.PerKeyCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.CombineFnWithContext;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.Context;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.KeyedCombineFnWithContext;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.Serializable;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-
-/**
- * Static utility methods that create combine function instances.
- */
-public class CombineFns {
-
- /**
- * Returns a {@link ComposeKeyedCombineFnBuilder} to construct a composed
- * {@link PerKeyCombineFn}.
- *
- * <p>The same {@link TupleTag} cannot be used in a composition multiple times.
- *
- * <p>Example:
- * <pre>{ @code
- * PCollection<KV<K, Integer>> latencies = ...;
- *
- * TupleTag<Integer> maxLatencyTag = new TupleTag<Integer>();
- * TupleTag<Double> meanLatencyTag = new TupleTag<Double>();
- *
- * SimpleFunction<Integer, Integer> identityFn =
- * new SimpleFunction<Integer, Integer>() {
- * @Override
- * public Integer apply(Integer input) {
- * return input;
- * }};
- * PCollection<KV<K, CoCombineResult>> maxAndMean = latencies.apply(
- * Combine.perKey(
- * CombineFns.composeKeyed()
- * .with(identityFn, new MaxIntegerFn(), maxLatencyTag)
- * .with(identityFn, new MeanFn<Integer>(), meanLatencyTag)));
- *
- * PCollection<T> finalResultCollection = maxAndMean
- * .apply(ParDo.of(
- * new DoFn<KV<K, CoCombineResult>, T>() {
- * @Override
- * public void processElement(ProcessContext c) throws Exception {
- * KV<K, CoCombineResult> e = c.element();
- * Integer maxLatency = e.getValue().get(maxLatencyTag);
- * Double meanLatency = e.getValue().get(meanLatencyTag);
- * .... Do Something ....
- * c.output(...some T...);
- * }
- * }));
- * } </pre>
- */
- public static ComposeKeyedCombineFnBuilder composeKeyed() {
- return new ComposeKeyedCombineFnBuilder();
- }
-
- /**
- * Returns a {@link ComposeCombineFnBuilder} to construct a composed
- * {@link GlobalCombineFn}.
- *
- * <p>The same {@link TupleTag} cannot be used in a composition multiple times.
- *
- * <p>Example:
- * <pre>{ @code
- * PCollection<Integer> globalLatencies = ...;
- *
- * TupleTag<Integer> maxLatencyTag = new TupleTag<Integer>();
- * TupleTag<Double> meanLatencyTag = new TupleTag<Double>();
- *
- * SimpleFunction<Integer, Integer> identityFn =
- * new SimpleFunction<Integer, Integer>() {
- * @Override
- * public Integer apply(Integer input) {
- * return input;
- * }};
- * PCollection<CoCombineResult> maxAndMean = globalLatencies.apply(
- * Combine.globally(
- * CombineFns.compose()
- * .with(identityFn, new MaxIntegerFn(), maxLatencyTag)
- * .with(identityFn, new MeanFn<Integer>(), meanLatencyTag)));
- *
- * PCollection<T> finalResultCollection = maxAndMean
- * .apply(ParDo.of(
- * new DoFn<CoCombineResult, T>() {
- * @Override
- * public void processElement(ProcessContext c) throws Exception {
- * CoCombineResult e = c.element();
- * Integer maxLatency = e.get(maxLatencyTag);
- * Double meanLatency = e.get(meanLatencyTag);
- * .... Do Something ....
- * c.output(...some T...);
- * }
- * }));
- * } </pre>
- */
- public static ComposeCombineFnBuilder compose() {
- return new ComposeCombineFnBuilder();
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A builder class to construct a composed {@link PerKeyCombineFn}.
- */
- public static class ComposeKeyedCombineFnBuilder {
- /**
- * Returns a {@link ComposedKeyedCombineFn} that can take additional
- * {@link PerKeyCombineFn PerKeyCombineFns} and apply them as a single combine function.
- *
- * <p>The {@link ComposedKeyedCombineFn} extracts inputs from {@code DataT} with
- * the {@code extractInputFn} and combines them with the {@code keyedCombineFn},
- * and then it outputs each combined value with a {@link TupleTag} to a
- * {@link CoCombineResult}.
- */
- public <K, DataT, InputT, OutputT> ComposedKeyedCombineFn<DataT, K> with(
- SimpleFunction<DataT, InputT> extractInputFn,
- KeyedCombineFn<K, InputT, ?, OutputT> keyedCombineFn,
- TupleTag<OutputT> outputTag) {
- return new ComposedKeyedCombineFn<DataT, K>()
- .with(extractInputFn, keyedCombineFn, outputTag);
- }
-
- /**
- * Returns a {@link ComposedKeyedCombineFnWithContext} that can take additional
- * {@link PerKeyCombineFn PerKeyCombineFns} and apply them as a single combine function.
- *
- * <p>The {@link ComposedKeyedCombineFnWithContext} extracts inputs from {@code DataT} with
- * the {@code extractInputFn} and combines them with the {@code keyedCombineFnWithContext},
- * and then it outputs each combined value with a {@link TupleTag} to a
- * {@link CoCombineResult}.
- */
- public <K, DataT, InputT, OutputT> ComposedKeyedCombineFnWithContext<DataT, K> with(
- SimpleFunction<DataT, InputT> extractInputFn,
- KeyedCombineFnWithContext<K, InputT, ?, OutputT> keyedCombineFnWithContext,
- TupleTag<OutputT> outputTag) {
- return new ComposedKeyedCombineFnWithContext<DataT, K>()
- .with(extractInputFn, keyedCombineFnWithContext, outputTag);
- }
-
- /**
- * Returns a {@link ComposedKeyedCombineFn} that can take additional
- * {@link PerKeyCombineFn PerKeyCombineFns} and apply them as a single combine function.
- */
- public <K, DataT, InputT, OutputT> ComposedKeyedCombineFn<DataT, K> with(
- SimpleFunction<DataT, InputT> extractInputFn,
- CombineFn<InputT, ?, OutputT> combineFn,
- TupleTag<OutputT> outputTag) {
- return with(extractInputFn, combineFn.<K>asKeyedFn(), outputTag);
- }
-
- /**
- * Returns a {@link ComposedKeyedCombineFnWithContext} that can take additional
- * {@link PerKeyCombineFn PerKeyCombineFns} and apply them as a single combine function.
- */
- public <K, DataT, InputT, OutputT> ComposedKeyedCombineFnWithContext<DataT, K> with(
- SimpleFunction<DataT, InputT> extractInputFn,
- CombineFnWithContext<InputT, ?, OutputT> combineFnWithContext,
- TupleTag<OutputT> outputTag) {
- return with(extractInputFn, combineFnWithContext.<K>asKeyedFn(), outputTag);
- }
- }
-
- /**
- * A builder class to construct a composed {@link GlobalCombineFn}.
- */
- public static class ComposeCombineFnBuilder {
- /**
- * Returns a {@link ComposedCombineFn} that can take additional
- * {@link GlobalCombineFn GlobalCombineFns} and apply them as a single combine function.
- *
- * <p>The {@link ComposedCombineFn} extracts inputs from {@code DataT} with
- * the {@code extractInputFn} and combines them with the {@code combineFn},
- * and then it outputs each combined value with a {@link TupleTag} to a
- * {@link CoCombineResult}.
- */
- public <DataT, InputT, OutputT> ComposedCombineFn<DataT> with(
- SimpleFunction<DataT, InputT> extractInputFn,
- CombineFn<InputT, ?, OutputT> combineFn,
- TupleTag<OutputT> outputTag) {
- return new ComposedCombineFn<DataT>()
- .with(extractInputFn, combineFn, outputTag);
- }
-
- /**
- * Returns a {@link ComposedCombineFnWithContext} that can take additional
- * {@link GlobalCombineFn GlobalCombineFns} and apply them as a single combine function.
- *
- * <p>The {@link ComposedCombineFnWithContext} extracts inputs from {@code DataT} with
- * the {@code extractInputFn} and combines them with the {@code combineFnWithContext},
- * and then it outputs each combined value with a {@link TupleTag} to a
- * {@link CoCombineResult}.
- */
- public <DataT, InputT, OutputT> ComposedCombineFnWithContext<DataT> with(
- SimpleFunction<DataT, InputT> extractInputFn,
- CombineFnWithContext<InputT, ?, OutputT> combineFnWithContext,
- TupleTag<OutputT> outputTag) {
- return new ComposedCombineFnWithContext<DataT>()
- .with(extractInputFn, combineFnWithContext, outputTag);
- }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A tuple of outputs produced by a composed combine function.
- *
- * <p>See {@link #compose()} or {@link #composeKeyed()} for details.
- */
- public static class CoCombineResult implements Serializable {
-
- private enum NullValue {
- INSTANCE;
- }
-
- private final Map<TupleTag<?>, Object> valuesMap;
-
- /**
- * The constructor of {@link CoCombineResult}.
- *
- * <p>Null values in {@code valuesMap} are kept: each one is stored as an internal sentinel,
- * so its {@link TupleTag} remains in the key set and {@link #get} returns {@code null}
- * for it.
- *
- * @throws NullPointerException if any key in {@code valuesMap} is null
- */
- CoCombineResult(Map<TupleTag<?>, Object> valuesMap) {
- ImmutableMap.Builder<TupleTag<?>, Object> builder = ImmutableMap.builder();
- for (Entry<TupleTag<?>, Object> entry : valuesMap.entrySet()) {
- if (entry.getValue() != null) {
- builder.put(entry);
- } else {
- builder.put(entry.getKey(), NullValue.INSTANCE);
- }
- }
- this.valuesMap = builder.build();
- }
-
- /**
- * Returns the value represented by the given {@link TupleTag}.
- *
- * <p>It is an error to request a non-existent tuple tag from the {@link CoCombineResult}.
- */
- @SuppressWarnings("unchecked")
- public <V> V get(TupleTag<V> tag) {
- checkArgument(
- valuesMap.keySet().contains(tag), "TupleTag " + tag + " is not in the CoCombineResult");
- Object value = valuesMap.get(tag);
- if (value == NullValue.INSTANCE) {
- return null;
- } else {
- return (V) value;
- }
- }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A composed {@link CombineFn} that applies multiple {@link CombineFn CombineFns}.
- *
- * <p>For each {@link CombineFn} it extracts inputs from {@code DataT} with
- * the {@code extractInputFn} and combines them,
- * and then it outputs each combined value with a {@link TupleTag} to a
- * {@link CoCombineResult}.
- */
- public static class ComposedCombineFn<DataT> extends CombineFn<DataT, Object[], CoCombineResult> {
-
- private final List<CombineFn<Object, Object, Object>> combineFns;
- private final List<SerializableFunction<DataT, Object>> extractInputFns;
- private final List<TupleTag<?>> outputTags;
- private final int combineFnCount;
-
- private ComposedCombineFn() {
- this.extractInputFns = ImmutableList.of();
- this.combineFns = ImmutableList.of();
- this.outputTags = ImmutableList.of();
- this.combineFnCount = 0;
- }
-
- private ComposedCombineFn(
- ImmutableList<SerializableFunction<DataT, ?>> extractInputFns,
- ImmutableList<CombineFn<?, ?, ?>> combineFns,
- ImmutableList<TupleTag<?>> outputTags) {
- @SuppressWarnings({"unchecked", "rawtypes"})
- List<SerializableFunction<DataT, Object>> castedExtractInputFns = (List) extractInputFns;
- this.extractInputFns = castedExtractInputFns;
-
- @SuppressWarnings({"unchecked", "rawtypes"})
- List<CombineFn<Object, Object, Object>> castedCombineFns = (List) combineFns;
- this.combineFns = castedCombineFns;
-
- this.outputTags = outputTags;
- this.combineFnCount = this.combineFns.size();
- }
-
- /**
- * Returns a {@link ComposedCombineFn} with an additional {@link CombineFn}.
- */
- public <InputT, OutputT> ComposedCombineFn<DataT> with(
- SimpleFunction<DataT, InputT> extractInputFn,
- CombineFn<InputT, ?, OutputT> combineFn,
- TupleTag<OutputT> outputTag) {
- checkUniqueness(outputTags, outputTag);
- return new ComposedCombineFn<>(
- ImmutableList.<SerializableFunction<DataT, ?>>builder()
- .addAll(extractInputFns)
- .add(extractInputFn)
- .build(),
- ImmutableList.<CombineFn<?, ?, ?>>builder()
- .addAll(combineFns)
- .add(combineFn)
- .build(),
- ImmutableList.<TupleTag<?>>builder()
- .addAll(outputTags)
- .add(outputTag)
- .build());
- }
-
- /**
- * Returns a {@link ComposedCombineFnWithContext} with an additional
- * {@link CombineFnWithContext}.
- */
- public <InputT, OutputT> ComposedCombineFnWithContext<DataT> with(
- SimpleFunction<DataT, InputT> extractInputFn,
- CombineFnWithContext<InputT, ?, OutputT> combineFn,
- TupleTag<OutputT> outputTag) {
- checkUniqueness(outputTags, outputTag);
- List<CombineFnWithContext<Object, Object, Object>> fnsWithContext = Lists.newArrayList();
- for (CombineFn<Object, Object, Object> fn : combineFns) {
- fnsWithContext.add(toFnWithContext(fn));
- }
- return new ComposedCombineFnWithContext<>(
- ImmutableList.<SerializableFunction<DataT, ?>>builder()
- .addAll(extractInputFns)
- .add(extractInputFn)
- .build(),
- ImmutableList.<CombineFnWithContext<?, ?, ?>>builder()
- .addAll(fnsWithContext)
- .add(combineFn)
- .build(),
- ImmutableList.<TupleTag<?>>builder()
- .addAll(outputTags)
- .add(outputTag)
- .build());
- }
-
- @Override
- public Object[] createAccumulator() {
- Object[] accumsArray = new Object[combineFnCount];
- for (int i = 0; i < combineFnCount; ++i) {
- accumsArray[i] = combineFns.get(i).createAccumulator();
- }
- return accumsArray;
- }
-
- @Override
- public Object[] addInput(Object[] accumulator, DataT value) {
- for (int i = 0; i < combineFnCount; ++i) {
- Object input = extractInputFns.get(i).apply(value);
- accumulator[i] = combineFns.get(i).addInput(accumulator[i], input);
- }
- return accumulator;
- }
-
- @Override
- public Object[] mergeAccumulators(Iterable<Object[]> accumulators) {
- Iterator<Object[]> iter = accumulators.iterator();
- if (!iter.hasNext()) {
- return createAccumulator();
- } else {
- // Reuses the first accumulator, and overwrites its values.
- // It is safe because {@code accum[i]} only depends on
- // the i-th component of each accumulator.
- Object[] accum = iter.next();
- for (int i = 0; i < combineFnCount; ++i) {
- accum[i] = combineFns.get(i).mergeAccumulators(new ProjectionIterable(accumulators, i));
- }
- return accum;
- }
- }
-
- @Override
- public CoCombineResult extractOutput(Object[] accumulator) {
- Map<TupleTag<?>, Object> valuesMap = Maps.newHashMap();
- for (int i = 0; i < combineFnCount; ++i) {
- valuesMap.put(
- outputTags.get(i),
- combineFns.get(i).extractOutput(accumulator[i]));
- }
- return new CoCombineResult(valuesMap);
- }
-
- @Override
- public Object[] compact(Object[] accumulator) {
- for (int i = 0; i < combineFnCount; ++i) {
- accumulator[i] = combineFns.get(i).compact(accumulator[i]);
- }
- return accumulator;
- }
-
- @Override
- public Coder<Object[]> getAccumulatorCoder(CoderRegistry registry, Coder<DataT> dataCoder)
- throws CannotProvideCoderException {
- List<Coder<Object>> coders = Lists.newArrayList();
- for (int i = 0; i < combineFnCount; ++i) {
- Coder<Object> inputCoder =
- registry.getDefaultOutputCoder(extractInputFns.get(i), dataCoder);
- coders.add(combineFns.get(i).getAccumulatorCoder(registry, inputCoder));
- }
- return new ComposedAccumulatorCoder(coders);
- }
- }
-
- /**
- * A composed {@link CombineFnWithContext} that applies multiple
- * {@link CombineFnWithContext CombineFnWithContexts}.
- *
- * <p>For each {@link CombineFnWithContext} it extracts inputs from {@code DataT} with
- * the {@code extractInputFn} and combines them,
- * and then it outputs each combined value with a {@link TupleTag} to a
- * {@link CoCombineResult}.
- */
- public static class ComposedCombineFnWithContext<DataT>
- extends CombineFnWithContext<DataT, Object[], CoCombineResult> {
-
- private final List<SerializableFunction<DataT, Object>> extractInputFns;
- private final List<CombineFnWithContext<Object, Object, Object>> combineFnWithContexts;
- private final List<TupleTag<?>> outputTags;
- private final int combineFnCount;
-
- private ComposedCombineFnWithContext() {
- this.extractInputFns = ImmutableList.of();
- this.combineFnWithContexts = ImmutableList.of();
- this.outputTags = ImmutableList.of();
- this.combineFnCount = 0;
- }
-
- private ComposedCombineFnWithContext(
- ImmutableList<SerializableFunction<DataT, ?>> extractInputFns,
- ImmutableList<CombineFnWithContext<?, ?, ?>> combineFnWithContexts,
- ImmutableList<TupleTag<?>> outputTags) {
- @SuppressWarnings({"unchecked", "rawtypes"})
- List<SerializableFunction<DataT, Object>> castedExtractInputFns =
- (List) extractInputFns;
- this.extractInputFns = castedExtractInputFns;
-
- @SuppressWarnings({"rawtypes", "unchecked"})
- List<CombineFnWithContext<Object, Object, Object>> castedCombineFnWithContexts
- = (List) combineFnWithContexts;
- this.combineFnWithContexts = castedCombineFnWithContexts;
-
- this.outputTags = outputTags;
- this.combineFnCount = this.combineFnWithContexts.size();
- }
-
- /**
- * Returns a {@link ComposedCombineFnWithContext} with an additional {@link GlobalCombineFn}.
- */
- public <InputT, OutputT> ComposedCombineFnWithContext<DataT> with(
- SimpleFunction<DataT, InputT> extractInputFn,
- GlobalCombineFn<InputT, ?, OutputT> globalCombineFn,
- TupleTag<OutputT> outputTag) {
- checkUniqueness(outputTags, outputTag);
- return new ComposedCombineFnWithContext<>(
- ImmutableList.<SerializableFunction<DataT, ?>>builder()
- .addAll(extractInputFns)
- .add(extractInputFn)
- .build(),
- ImmutableList.<CombineFnWithContext<?, ?, ?>>builder()
- .addAll(combineFnWithContexts)
- .add(toFnWithContext(globalCombineFn))
- .build(),
- ImmutableList.<TupleTag<?>>builder()
- .addAll(outputTags)
- .add(outputTag)
- .build());
- }
-
- @Override
- public Object[] createAccumulator(Context c) {
- Object[] accumsArray = new Object[combineFnCount];
- for (int i = 0; i < combineFnCount; ++i) {
- accumsArray[i] = combineFnWithContexts.get(i).createAccumulator(c);
- }
- return accumsArray;
- }
-
- @Override
- public Object[] addInput(Object[] accumulator, DataT value, Context c) {
- for (int i = 0; i < combineFnCount; ++i) {
- Object input = extractInputFns.get(i).apply(value);
- accumulator[i] = combineFnWithContexts.get(i).addInput(accumulator[i], input, c);
- }
- return accumulator;
- }
-
- @Override
- public Object[] mergeAccumulators(Iterable<Object[]> accumulators, Context c) {
- Iterator<Object[]> iter = accumulators.iterator();
- if (!iter.hasNext()) {
- return createAccumulator(c);
- } else {
- // Reuses the first accumulator, and overwrites its values.
- // It is safe because {@code accum[i]} only depends on
- // the i-th component of each accumulator.
- Object[] accum = iter.next();
- for (int i = 0; i < combineFnCount; ++i) {
- accum[i] = combineFnWithContexts.get(i).mergeAccumulators(
- new ProjectionIterable(accumulators, i), c);
- }
- return accum;
- }
- }
-
- @Override
- public CoCombineResult extractOutput(Object[] accumulator, Context c) {
- Map<TupleTag<?>, Object> valuesMap = Maps.newHashMap();
- for (int i = 0; i < combineFnCount; ++i) {
- valuesMap.put(
- outputTags.get(i),
- combineFnWithContexts.get(i).extractOutput(accumulator[i], c));
- }
- return new CoCombineResult(valuesMap);
- }
-
- @Override
- public Object[] compact(Object[] accumulator, Context c) {
- for (int i = 0; i < combineFnCount; ++i) {
- accumulator[i] = combineFnWithContexts.get(i).compact(accumulator[i], c);
- }
- return accumulator;
- }
-
- @Override
- public Coder<Object[]> getAccumulatorCoder(CoderRegistry registry, Coder<DataT> dataCoder)
- throws CannotProvideCoderException {
- List<Coder<Object>> coders = Lists.newArrayList();
- for (int i = 0; i < combineFnCount; ++i) {
- Coder<Object> inputCoder =
- registry.getDefaultOutputCoder(extractInputFns.get(i), dataCoder);
- coders.add(combineFnWithContexts.get(i).getAccumulatorCoder(registry, inputCoder));
- }
- return new ComposedAccumulatorCoder(coders);
- }
- }
-
- /**
- * A composed {@link KeyedCombineFn} that applies multiple {@link KeyedCombineFn KeyedCombineFns}.
- *
- * <p>For each {@link KeyedCombineFn} it extracts inputs from {@code DataT} with
- * the {@code extractInputFn} and combines them,
- * and then it outputs each combined value with a {@link TupleTag} to a
- * {@link CoCombineResult}.
- */
- public static class ComposedKeyedCombineFn<DataT, K>
- extends KeyedCombineFn<K, DataT, Object[], CoCombineResult> {
-
- private final List<SerializableFunction<DataT, Object>> extractInputFns;
- private final List<KeyedCombineFn<K, Object, Object, Object>> keyedCombineFns;
- private final List<TupleTag<?>> outputTags;
- private final int combineFnCount;
-
- private ComposedKeyedCombineFn() {
- this.extractInputFns = ImmutableList.of();
- this.keyedCombineFns = ImmutableList.of();
- this.outputTags = ImmutableList.of();
- this.combineFnCount = 0;
- }
-
- private ComposedKeyedCombineFn(
- ImmutableList<SerializableFunction<DataT, ?>> extractInputFns,
- ImmutableList<KeyedCombineFn<K, ?, ?, ?>> keyedCombineFns,
- ImmutableList<TupleTag<?>> outputTags) {
- @SuppressWarnings({"unchecked", "rawtypes"})
- List<SerializableFunction<DataT, Object>> castedExtractInputFns = (List) extractInputFns;
- this.extractInputFns = castedExtractInputFns;
-
- @SuppressWarnings({"unchecked", "rawtypes"})
- List<KeyedCombineFn<K, Object, Object, Object>> castedKeyedCombineFns =
- (List) keyedCombineFns;
- this.keyedCombineFns = castedKeyedCombineFns;
- this.outputTags = outputTags;
- this.combineFnCount = this.keyedCombineFns.size();
- }
-
- /**
- * Returns a {@link ComposedKeyedCombineFn} with an additional {@link KeyedCombineFn}.
- */
- public <InputT, OutputT> ComposedKeyedCombineFn<DataT, K> with(
- SimpleFunction<DataT, InputT> extractInputFn,
- KeyedCombineFn<K, InputT, ?, OutputT> keyedCombineFn,
- TupleTag<OutputT> outputTag) {
- checkUniqueness(outputTags, outputTag);
- return new ComposedKeyedCombineFn<>(
- ImmutableList.<SerializableFunction<DataT, ?>>builder()
- .addAll(extractInputFns)
- .add(extractInputFn)
- .build(),
- ImmutableList.<KeyedCombineFn<K, ?, ?, ?>>builder()
- .addAll(keyedCombineFns)
- .add(keyedCombineFn)
- .build(),
- ImmutableList.<TupleTag<?>>builder()
- .addAll(outputTags)
- .add(outputTag)
- .build());
- }
-
- /**
- * Returns a {@link ComposedKeyedCombineFnWithContext} with an additional
- * {@link KeyedCombineFnWithContext}.
- */
- public <InputT, OutputT> ComposedKeyedCombineFnWithContext<DataT, K> with(
- SimpleFunction<DataT, InputT> extractInputFn,
- KeyedCombineFnWithContext<K, InputT, ?, OutputT> keyedCombineFn,
- TupleTag<OutputT> outputTag) {
- checkUniqueness(outputTags, outputTag);
- List<KeyedCombineFnWithContext<K, Object, Object, Object>> fnsWithContext =
- Lists.newArrayList();
- for (KeyedCombineFn<K, Object, Object, Object> fn : keyedCombineFns) {
- fnsWithContext.add(toFnWithContext(fn));
- }
- return new ComposedKeyedCombineFnWithContext<>(
- ImmutableList.<SerializableFunction<DataT, ?>>builder()
- .addAll(extractInputFns)
- .add(extractInputFn)
- .build(),
- ImmutableList.<KeyedCombineFnWithContext<K, ?, ?, ?>>builder()
- .addAll(fnsWithContext)
- .add(keyedCombineFn)
- .build(),
- ImmutableList.<TupleTag<?>>builder()
- .addAll(outputTags)
- .add(outputTag)
- .build());
- }
-
- /**
- * Returns a {@link ComposedKeyedCombineFn} with an additional {@link CombineFn}.
- */
- public <InputT, OutputT> ComposedKeyedCombineFn<DataT, K> with(
- SimpleFunction<DataT, InputT> extractInputFn,
- CombineFn<InputT, ?, OutputT> keyedCombineFn,
- TupleTag<OutputT> outputTag) {
- return with(extractInputFn, keyedCombineFn.<K>asKeyedFn(), outputTag);
- }
-
- /**
- * Returns a {@link ComposedKeyedCombineFnWithContext} with an additional
- * {@link CombineFnWithContext}.
- */
- public <InputT, OutputT> ComposedKeyedCombineFnWithContext<DataT, K> with(
- SimpleFunction<DataT, InputT> extractInputFn,
- CombineFnWithContext<InputT, ?, OutputT> keyedCombineFn,
- TupleTag<OutputT> outputTag) {
- return with(extractInputFn, keyedCombineFn.<K>asKeyedFn(), outputTag);
- }
-
- @Override
- public Object[] createAccumulator(K key) {
- Object[] accumsArray = new Object[combineFnCount];
- for (int i = 0; i < combineFnCount; ++i) {
- accumsArray[i] = keyedCombineFns.get(i).createAccumulator(key);
- }
- return accumsArray;
- }
-
- @Override
- public Object[] addInput(K key, Object[] accumulator, DataT value) {
- for (int i = 0; i < combineFnCount; ++i) {
- Object input = extractInputFns.get(i).apply(value);
- accumulator[i] = keyedCombineFns.get(i).addInput(key, accumulator[i], input);
- }
- return accumulator;
- }
-
- @Override
- public Object[] mergeAccumulators(K key, final Iterable<Object[]> accumulators) {
- Iterator<Object[]> iter = accumulators.iterator();
- if (!iter.hasNext()) {
- return createAccumulator(key);
- } else {
- // Reuses the first accumulator, and overwrites its values.
- // It is safe because {@code accum[i]} only depends on
- // the i-th component of each accumulator.
- Object[] accum = iter.next();
- for (int i = 0; i < combineFnCount; ++i) {
- accum[i] = keyedCombineFns.get(i).mergeAccumulators(
- key, new ProjectionIterable(accumulators, i));
- }
- return accum;
- }
- }
-
- @Override
- public CoCombineResult extractOutput(K key, Object[] accumulator) {
- Map<TupleTag<?>, Object> valuesMap = Maps.newHashMap();
- for (int i = 0; i < combineFnCount; ++i) {
- valuesMap.put(
- outputTags.get(i),
- keyedCombineFns.get(i).extractOutput(key, accumulator[i]));
- }
- return new CoCombineResult(valuesMap);
- }
-
- @Override
- public Object[] compact(K key, Object[] accumulator) {
- for (int i = 0; i < combineFnCount; ++i) {
- accumulator[i] = keyedCombineFns.get(i).compact(key, accumulator[i]);
- }
- return accumulator;
- }
-
- @Override
- public Coder<Object[]> getAccumulatorCoder(
- CoderRegistry registry, Coder<K> keyCoder, Coder<DataT> dataCoder)
- throws CannotProvideCoderException {
- List<Coder<Object>> coders = Lists.newArrayList();
- for (int i = 0; i < combineFnCount; ++i) {
- Coder<Object> inputCoder =
- registry.getDefaultOutputCoder(extractInputFns.get(i), dataCoder);
- coders.add(keyedCombineFns.get(i).getAccumulatorCoder(registry, keyCoder, inputCoder));
- }
- return new ComposedAccumulatorCoder(coders);
- }
- }
-
- /**
- * A composed {@link KeyedCombineFnWithContext} that applies multiple
- * {@link KeyedCombineFnWithContext KeyedCombineFnWithContexts}.
- *
- * <p>For each {@link KeyedCombineFnWithContext} it extracts inputs from {@code DataT} with
- * the {@code extractInputFn} and combines them,
- * and then it outputs each combined value with a {@link TupleTag} to a
- * {@link CoCombineResult}.
- */
- public static class ComposedKeyedCombineFnWithContext<DataT, K>
- extends KeyedCombineFnWithContext<K, DataT, Object[], CoCombineResult> {
-
- private final List<SerializableFunction<DataT, Object>> extractInputFns;
- private final List<KeyedCombineFnWithContext<K, Object, Object, Object>> keyedCombineFns;
- private final List<TupleTag<?>> outputTags;
- private final int combineFnCount;
-
- private ComposedKeyedCombineFnWithContext() {
- this.extractInputFns = ImmutableList.of();
- this.keyedCombineFns = ImmutableList.of();
- this.outputTags = ImmutableList.of();
- this.combineFnCount = 0;
- }
-
- private ComposedKeyedCombineFnWithContext(
- ImmutableList<SerializableFunction<DataT, ?>> extractInputFns,
- ImmutableList<KeyedCombineFnWithContext<K, ?, ?, ?>> keyedCombineFns,
- ImmutableList<TupleTag<?>> outputTags) {
- @SuppressWarnings({"unchecked", "rawtypes"})
- List<SerializableFunction<DataT, Object>> castedExtractInputFns =
- (List) extractInputFns;
- this.extractInputFns = castedExtractInputFns;
-
- @SuppressWarnings({"unchecked", "rawtypes"})
- List<KeyedCombineFnWithContext<K, Object, Object, Object>> castedKeyedCombineFns =
- (List) keyedCombineFns;
- this.keyedCombineFns = castedKeyedCombineFns;
- this.outputTags = outputTags;
- this.combineFnCount = this.keyedCombineFns.size();
- }
-
- /**
- * Returns a {@link ComposedKeyedCombineFnWithContext} with an additional
- * {@link PerKeyCombineFn}.
- */
- public <InputT, OutputT> ComposedKeyedCombineFnWithContext<DataT, K> with(
- SimpleFunction<DataT, InputT> extractInputFn,
- PerKeyCombineFn<K, InputT, ?, OutputT> perKeyCombineFn,
- TupleTag<OutputT> outputTag) {
- checkUniqueness(outputTags, outputTag);
- return new ComposedKeyedCombineFnWithContext<>(
- ImmutableList.<SerializableFunction<DataT, ?>>builder()
- .addAll(extractInputFns)
- .add(extractInputFn)
- .build(),
- ImmutableList.<KeyedCombineFnWithContext<K, ?, ?, ?>>builder()
- .addAll(keyedCombineFns)
- .add(toFnWithContext(perKeyCombineFn))
- .build(),
- ImmutableList.<TupleTag<?>>builder()
- .addAll(outputTags)
- .add(outputTag)
- .build());
- }
-
- /**
- * Returns a {@link ComposedKeyedCombineFnWithContext} with an additional
- * {@link GlobalCombineFn}.
- */
- public <InputT, OutputT> ComposedKeyedCombineFnWithContext<DataT, K> with(
- SimpleFunction<DataT, InputT> extractInputFn,
- GlobalCombineFn<InputT, ?, OutputT> perKeyCombineFn,
- TupleTag<OutputT> outputTag) {
- return with(extractInputFn, perKeyCombineFn.<K>asKeyedFn(), outputTag);
- }
-
- @Override
- public Object[] createAccumulator(K key, Context c) {
- Object[] accumsArray = new Object[combineFnCount];
- for (int i = 0; i < combineFnCount; ++i) {
- accumsArray[i] = keyedCombineFns.get(i).createAccumulator(key, c);
- }
- return accumsArray;
- }
-
- @Override
- public Object[] addInput(K key, Object[] accumulator, DataT value, Context c) {
- for (int i = 0; i < combineFnCount; ++i) {
- Object input = extractInputFns.get(i).apply(value);
- accumulator[i] = keyedCombineFns.get(i).addInput(key, accumulator[i], input, c);
- }
- return accumulator;
- }
-
- @Override
- public Object[] mergeAccumulators(K key, Iterable<Object[]> accumulators, Context c) {
- Iterator<Object[]> iter = accumulators.iterator();
- if (!iter.hasNext()) {
- return createAccumulator(key, c);
- } else {
- // Reuses the first accumulator, and overwrites its values.
- // It is safe because {@code accum[i]} only depends on
- // the i-th component of each accumulator.
- Object[] accum = iter.next();
- for (int i = 0; i < combineFnCount; ++i) {
- accum[i] = keyedCombineFns.get(i).mergeAccumulators(
- key, new ProjectionIterable(accumulators, i), c);
- }
- return accum;
- }
- }
-
- @Override
- public CoCombineResult extractOutput(K key, Object[] accumulator, Context c) {
- Map<TupleTag<?>, Object> valuesMap = Maps.newHashMap();
- for (int i = 0; i < combineFnCount; ++i) {
- valuesMap.put(
- outputTags.get(i),
- keyedCombineFns.get(i).extractOutput(key, accumulator[i], c));
- }
- return new CoCombineResult(valuesMap);
- }
-
- @Override
- public Object[] compact(K key, Object[] accumulator, Context c) {
- for (int i = 0; i < combineFnCount; ++i) {
- accumulator[i] = keyedCombineFns.get(i).compact(key, accumulator[i], c);
- }
- return accumulator;
- }
-
- @Override
- public Coder<Object[]> getAccumulatorCoder(
- CoderRegistry registry, Coder<K> keyCoder, Coder<DataT> dataCoder)
- throws CannotProvideCoderException {
- List<Coder<Object>> coders = Lists.newArrayList();
- for (int i = 0; i < combineFnCount; ++i) {
- Coder<Object> inputCoder =
- registry.getDefaultOutputCoder(extractInputFns.get(i), dataCoder);
- coders.add(keyedCombineFns.get(i).getAccumulatorCoder(
- registry, keyCoder, inputCoder));
- }
- return new ComposedAccumulatorCoder(coders);
- }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private static class ProjectionIterable implements Iterable<Object> {
- private final Iterable<Object[]> iterable;
- private final int column;
-
- private ProjectionIterable(Iterable<Object[]> iterable, int column) {
- this.iterable = iterable;
- this.column = column;
- }
-
- @Override
- public Iterator<Object> iterator() {
- final Iterator<Object[]> iter = iterable.iterator();
- return new Iterator<Object>() {
- @Override
- public boolean hasNext() {
- return iter.hasNext();
- }
-
- @Override
- public Object next() {
- return iter.next()[column];
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
- }
- };
- }
- }
-
- private static class ComposedAccumulatorCoder extends StandardCoder<Object[]> {
- private List<Coder<Object>> coders;
- private int codersCount;
-
- public ComposedAccumulatorCoder(List<Coder<Object>> coders) {
- this.coders = ImmutableList.copyOf(coders);
- this.codersCount = coders.size();
- }
-
- @SuppressWarnings({"rawtypes", "unchecked"})
- @JsonCreator
- public static ComposedAccumulatorCoder of(
- @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
- List<Coder<?>> components) {
- return new ComposedAccumulatorCoder((List) components);
- }
-
- @Override
- public void encode(Object[] value, OutputStream outStream, Context context)
- throws CoderException, IOException {
- checkArgument(value.length == codersCount);
- Context nestedContext = context.nested();
- for (int i = 0; i < codersCount; ++i) {
- coders.get(i).encode(value[i], outStream, nestedContext);
- }
- }
-
- @Override
- public Object[] decode(InputStream inStream, Context context)
- throws CoderException, IOException {
- Object[] ret = new Object[codersCount];
- Context nestedContext = context.nested();
- for (int i = 0; i < codersCount; ++i) {
- ret[i] = coders.get(i).decode(inStream, nestedContext);
- }
- return ret;
- }
-
- @Override
- public List<? extends Coder<?>> getCoderArguments() {
- return coders;
- }
-
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- for (int i = 0; i < codersCount; ++i) {
- coders.get(i).verifyDeterministic();
- }
- }
- }
-
- @SuppressWarnings("unchecked")
- private static <InputT, AccumT, OutputT> CombineFnWithContext<InputT, AccumT, OutputT>
- toFnWithContext(GlobalCombineFn<InputT, AccumT, OutputT> globalCombineFn) {
- if (globalCombineFn instanceof CombineFnWithContext) {
- return (CombineFnWithContext<InputT, AccumT, OutputT>) globalCombineFn;
- } else {
- final CombineFn<InputT, AccumT, OutputT> combineFn =
- (CombineFn<InputT, AccumT, OutputT>) globalCombineFn;
- return new CombineFnWithContext<InputT, AccumT, OutputT>() {
- @Override
- public AccumT createAccumulator(Context c) {
- return combineFn.createAccumulator();
- }
- @Override
- public AccumT addInput(AccumT accumulator, InputT input, Context c) {
- return combineFn.addInput(accumulator, input);
- }
- @Override
- public AccumT mergeAccumulators(Iterable<AccumT> accumulators, Context c) {
- return combineFn.mergeAccumulators(accumulators);
- }
- @Override
- public OutputT extractOutput(AccumT accumulator, Context c) {
- return combineFn.extractOutput(accumulator);
- }
- @Override
- public AccumT compact(AccumT accumulator, Context c) {
- return combineFn.compact(accumulator);
- }
- @Override
- public OutputT defaultValue() {
- return combineFn.defaultValue();
- }
- @Override
- public Coder<AccumT> getAccumulatorCoder(CoderRegistry registry, Coder<InputT> inputCoder)
- throws CannotProvideCoderException {
- return combineFn.getAccumulatorCoder(registry, inputCoder);
- }
- @Override
- public Coder<OutputT> getDefaultOutputCoder(
- CoderRegistry registry, Coder<InputT> inputCoder) throws CannotProvideCoderException {
- return combineFn.getDefaultOutputCoder(registry, inputCoder);
- }
- };
- }
- }
-
- private static <K, InputT, AccumT, OutputT> KeyedCombineFnWithContext<K, InputT, AccumT, OutputT>
- toFnWithContext(PerKeyCombineFn<K, InputT, AccumT, OutputT> perKeyCombineFn) {
- if (perKeyCombineFn instanceof KeyedCombineFnWithContext) {
- @SuppressWarnings("unchecked")
- KeyedCombineFnWithContext<K, InputT, AccumT, OutputT> keyedCombineFnWithContext =
- (KeyedCombineFnWithContext<K, InputT, AccumT, OutputT>) perKeyCombineFn;
- return keyedCombineFnWithContext;
- } else {
- @SuppressWarnings("unchecked")
- final KeyedCombineFn<K, InputT, AccumT, OutputT> keyedCombineFn =
- (KeyedCombineFn<K, InputT, AccumT, OutputT>) perKeyCombineFn;
- return new KeyedCombineFnWithContext<K, InputT, AccumT, OutputT>() {
- @Override
- public AccumT createAccumulator(K key, Context c) {
- return keyedCombineFn.createAccumulator(key);
- }
- @Override
- public AccumT addInput(K key, AccumT accumulator, InputT value, Context c) {
- return keyedCombineFn.addInput(key, accumulator, value);
- }
- @Override
- public AccumT mergeAccumulators(K key, Iterable<AccumT> accumulators, Context c) {
- return keyedCombineFn.mergeAccumulators(key, accumulators);
- }
- @Override
- public OutputT extractOutput(K key, AccumT accumulator, Context c) {
- return keyedCombineFn.extractOutput(key, accumulator);
- }
- @Override
- public AccumT compact(K key, AccumT accumulator, Context c) {
- return keyedCombineFn.compact(key, accumulator);
- }
- @Override
- public Coder<AccumT> getAccumulatorCoder(CoderRegistry registry, Coder<K> keyCoder,
- Coder<InputT> inputCoder) throws CannotProvideCoderException {
- return keyedCombineFn.getAccumulatorCoder(registry, keyCoder, inputCoder);
- }
- @Override
- public Coder<OutputT> getDefaultOutputCoder(CoderRegistry registry, Coder<K> keyCoder,
- Coder<InputT> inputCoder) throws CannotProvideCoderException {
- return keyedCombineFn.getDefaultOutputCoder(registry, keyCoder, inputCoder);
- }
- };
- }
- }
-
- private static <OutputT> void checkUniqueness(
- List<TupleTag<?>> registeredTags, TupleTag<OutputT> outputTag) {
- checkArgument(
- !registeredTags.contains(outputTag),
- "Cannot compose with tuple tag %s because it is already present in the composition.",
- outputTag);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/CombineWithContext.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/CombineWithContext.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/CombineWithContext.java
deleted file mode 100644
index fdf56e3..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/CombineWithContext.java
+++ /dev/null
@@ -1,277 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.transforms.Combine.KeyedCombineFn;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-
-/**
- * This class contains combine functions that have access to {@code PipelineOptions} and side inputs
- * through {@code CombineWithContext.Context}.
- *
- * <p>{@link CombineFnWithContext} and {@link KeyedCombineFnWithContext} are for users to extend.
- */
-public class CombineWithContext {
-
- /**
- * Information accessible to all methods in {@code CombineFnWithContext}
- * and {@code KeyedCombineFnWithContext}.
- */
- public abstract static class Context {
- /**
- * Returns the {@code PipelineOptions} specified with the
- * {@link com.google.cloud.dataflow.sdk.runners.PipelineRunner}
- * invoking this {@code KeyedCombineFn}.
- */
- public abstract PipelineOptions getPipelineOptions();
-
- /**
- * Returns the value of the side input for the window corresponding to the
- * window of the main input element.
- */
- public abstract <T> T sideInput(PCollectionView<T> view);
- }
-
- /**
- * An internal interface for signaling that a {@code GloballyCombineFn}
- * or a {@code PerKeyCombineFn} needs to access {@code CombineWithContext.Context}.
- *
- * <p>For internal use only.
- */
- public interface RequiresContextInternal {}
-
- /**
- * A combine function that has access to {@code PipelineOptions} and side inputs through
- * {@code CombineWithContext.Context}.
- *
- * See the equivalent {@link CombineFn} for details about combine functions.
- */
- public abstract static class CombineFnWithContext<InputT, AccumT, OutputT>
- extends CombineFnBase.AbstractGlobalCombineFn<InputT, AccumT, OutputT>
- implements RequiresContextInternal {
- /**
- * Returns a new, mutable accumulator value, representing the accumulation of zero input values.
- *
- * <p>It is equivalent to {@link CombineFn#createAccumulator}, but it has additional access to
- * {@code CombineWithContext.Context}.
- */
- public abstract AccumT createAccumulator(Context c);
-
- /**
- * Adds the given input value to the given accumulator, returning the
- * new accumulator value.
- *
- * <p>It is equivalent to {@link CombineFn#addInput}, but it has additional access to
- * {@code CombineWithContext.Context}.
- */
- public abstract AccumT addInput(AccumT accumulator, InputT input, Context c);
-
- /**
- * Returns an accumulator representing the accumulation of all the
- * input values accumulated in the merging accumulators.
- *
- * <p>It is equivalent to {@link CombineFn#mergeAccumulators}, but it has additional access to
- * {@code CombineWithContext.Context}.
- */
- public abstract AccumT mergeAccumulators(Iterable<AccumT> accumulators, Context c);
-
- /**
- * Returns the output value that is the result of combining all
- * the input values represented by the given accumulator.
- *
- * <p>It is equivalent to {@link CombineFn#extractOutput}, but it has additional access to
- * {@code CombineWithContext.Context}.
- */
- public abstract OutputT extractOutput(AccumT accumulator, Context c);
-
- /**
- * Returns an accumulator that represents the same logical value as the
- * input accumulator, but may have a more compact representation.
- *
- * <p>It is equivalent to {@link CombineFn#compact}, but it has additional access to
- * {@code CombineWithContext.Context}.
- */
- public AccumT compact(AccumT accumulator, Context c) {
- return accumulator;
- }
-
- @Override
- public OutputT defaultValue() {
- throw new UnsupportedOperationException(
- "Override this function to provide the default value.");
- }
-
- @SuppressWarnings({"unchecked", "rawtypes"})
- @Override
- public <K> KeyedCombineFnWithContext<K, InputT, AccumT, OutputT> asKeyedFn() {
- // The key, an object, is never even looked at.
- return new KeyedCombineFnWithContext<K, InputT, AccumT, OutputT>() {
- @Override
- public AccumT createAccumulator(K key, Context c) {
- return CombineFnWithContext.this.createAccumulator(c);
- }
-
- @Override
- public AccumT addInput(K key, AccumT accumulator, InputT input, Context c) {
- return CombineFnWithContext.this.addInput(accumulator, input, c);
- }
-
- @Override
- public AccumT mergeAccumulators(K key, Iterable<AccumT> accumulators, Context c) {
- return CombineFnWithContext.this.mergeAccumulators(accumulators, c);
- }
-
- @Override
- public OutputT extractOutput(K key, AccumT accumulator, Context c) {
- return CombineFnWithContext.this.extractOutput(accumulator, c);
- }
-
- @Override
- public AccumT compact(K key, AccumT accumulator, Context c) {
- return CombineFnWithContext.this.compact(accumulator, c);
- }
-
- @Override
- public Coder<AccumT> getAccumulatorCoder(CoderRegistry registry, Coder<K> keyCoder,
- Coder<InputT> inputCoder) throws CannotProvideCoderException {
- return CombineFnWithContext.this.getAccumulatorCoder(registry, inputCoder);
- }
-
- @Override
- public Coder<OutputT> getDefaultOutputCoder(CoderRegistry registry, Coder<K> keyCoder,
- Coder<InputT> inputCoder) throws CannotProvideCoderException {
- return CombineFnWithContext.this.getDefaultOutputCoder(registry, inputCoder);
- }
-
- @Override
- public CombineFnWithContext<InputT, AccumT, OutputT> forKey(K key, Coder<K> keyCoder) {
- return CombineFnWithContext.this;
- }
- };
- }
- }
-
- /**
- * A keyed combine function that has access to {@code PipelineOptions} and side inputs through
- * {@code CombineWithContext.Context}.
- *
- * See the equivalent {@link KeyedCombineFn} for details about keyed combine functions.
- */
- public abstract static class KeyedCombineFnWithContext<K, InputT, AccumT, OutputT>
- extends CombineFnBase.AbstractPerKeyCombineFn<K, InputT, AccumT, OutputT>
- implements RequiresContextInternal {
- /**
- * Returns a new, mutable accumulator value representing the accumulation of zero input values.
- *
- * <p>It is equivalent to {@link KeyedCombineFn#createAccumulator},
- * but it has additional access to {@code CombineWithContext.Context}.
- */
- public abstract AccumT createAccumulator(K key, Context c);
-
- /**
- * Adds the given input value to the given accumulator, returning the new accumulator value.
- *
- * <p>It is equivalent to {@link KeyedCombineFn#addInput}, but it has additional access to
- * {@code CombineWithContext.Context}.
- */
- public abstract AccumT addInput(K key, AccumT accumulator, InputT value, Context c);
-
- /**
- * Returns an accumulator representing the accumulation of all the
- * input values accumulated in the merging accumulators.
- *
- * <p>It is equivalent to {@link KeyedCombineFn#mergeAccumulators},
- * but it has additional access to {@code CombineWithContext.Context}..
- */
- public abstract AccumT mergeAccumulators(K key, Iterable<AccumT> accumulators, Context c);
-
- /**
- * Returns the output value that is the result of combining all
- * the input values represented by the given accumulator.
- *
- * <p>It is equivalent to {@link KeyedCombineFn#extractOutput}, but it has additional access to
- * {@code CombineWithContext.Context}.
- */
- public abstract OutputT extractOutput(K key, AccumT accumulator, Context c);
-
- /**
- * Returns an accumulator that represents the same logical value as the
- * input accumulator, but may have a more compact representation.
- *
- * <p>It is equivalent to {@link KeyedCombineFn#compact}, but it has additional access to
- * {@code CombineWithContext.Context}.
- */
- public AccumT compact(K key, AccumT accumulator, Context c) {
- return accumulator;
- }
-
- /**
- * Applies this {@code KeyedCombineFnWithContext} to a key and a collection
- * of input values to produce a combined output value.
- */
- public OutputT apply(K key, Iterable<? extends InputT> inputs, Context c) {
- AccumT accum = createAccumulator(key, c);
- for (InputT input : inputs) {
- accum = addInput(key, accum, input, c);
- }
- return extractOutput(key, accum, c);
- }
-
- @Override
- public CombineFnWithContext<InputT, AccumT, OutputT> forKey(
- final K key, final Coder<K> keyCoder) {
- return new CombineFnWithContext<InputT, AccumT, OutputT>() {
- @Override
- public AccumT createAccumulator(Context c) {
- return KeyedCombineFnWithContext.this.createAccumulator(key, c);
- }
-
- @Override
- public AccumT addInput(AccumT accumulator, InputT input, Context c) {
- return KeyedCombineFnWithContext.this.addInput(key, accumulator, input, c);
- }
-
- @Override
- public AccumT mergeAccumulators(Iterable<AccumT> accumulators, Context c) {
- return KeyedCombineFnWithContext.this.mergeAccumulators(key, accumulators, c);
- }
-
- @Override
- public OutputT extractOutput(AccumT accumulator, Context c) {
- return KeyedCombineFnWithContext.this.extractOutput(key, accumulator, c);
- }
-
- @Override
- public Coder<AccumT> getAccumulatorCoder(CoderRegistry registry, Coder<InputT> inputCoder)
- throws CannotProvideCoderException {
- return KeyedCombineFnWithContext.this.getAccumulatorCoder(registry, keyCoder, inputCoder);
- }
-
- @Override
- public Coder<OutputT> getDefaultOutputCoder(
- CoderRegistry registry, Coder<InputT> inputCoder) throws CannotProvideCoderException {
- return KeyedCombineFnWithContext.this.getDefaultOutputCoder(
- registry, keyCoder, inputCoder);
- }
- };
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Count.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Count.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Count.java
deleted file mode 100644
index ffa11d1..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Count.java
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-/**
- * {@code PTransorm}s to count the elements in a {@link PCollection}.
- *
- * <p>{@link Count#perElement()} can be used to count the number of occurrences of each
- * distinct element in the PCollection, {@link Count#perKey()} can be used to count the
- * number of values per key, and {@link Count#globally()} can be used to count the total
- * number of elements in a PCollection.
- */
-public class Count {
- private Count() {
- // do not instantiate
- }
-
- /**
- * Returns a {@link Combine.Globally} {@link PTransform} that counts the number of elements in
- * its input {@link PCollection}.
- */
- public static <T> Combine.Globally<T, Long> globally() {
- return Combine.globally(new CountFn<T>()).named("Count.Globally");
- }
-
- /**
- * Returns a {@link Combine.PerKey} {@link PTransform} that counts the number of elements
- * associated with each key of its input {@link PCollection}.
- */
- public static <K, V> Combine.PerKey<K, V, Long> perKey() {
- return Combine.<K, V, Long>perKey(new CountFn<V>()).named("Count.PerKey");
- }
-
- /**
- * Returns a {@link PerElement Count.PerElement} {@link PTransform} that counts the number of
- * occurrences of each element in its input {@link PCollection}.
- *
- * <p>See {@link PerElement Count.PerElement} for more details.
- */
- public static <T> PerElement<T> perElement() {
- return new PerElement<>();
- }
-
- /**
- * {@code Count.PerElement<T>} takes a {@code PCollection<T>} and returns a
- * {@code PCollection<KV<T, Long>>} representing a map from each distinct element of the input
- * {@code PCollection} to the number of times that element occurs in the input. Each key in the
- * output {@code PCollection} is unique.
- *
- * <p>This transform compares two values of type {@code T} by first encoding each element using
- * the input {@code PCollection}'s {@code Coder}, then comparing the encoded bytes. Because of
- * this, the input coder must be deterministic.
- * (See {@link com.google.cloud.dataflow.sdk.coders.Coder#verifyDeterministic()} for more detail).
- * Performing the comparison in this manner admits efficient parallel evaluation.
- *
- * <p>By default, the {@code Coder} of the keys of the output {@code PCollection} is the same as
- * the {@code Coder} of the elements of the input {@code PCollection}.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<String> words = ...;
- * PCollection<KV<String, Long>> wordCounts =
- * words.apply(Count.<String>perElement());
- * } </pre>
- *
- * @param <T> the type of the elements of the input {@code PCollection}, and the type of the keys
- * of the output {@code PCollection}
- */
- public static class PerElement<T>
- extends PTransform<PCollection<T>, PCollection<KV<T, Long>>> {
-
- public PerElement() { }
-
- @Override
- public PCollection<KV<T, Long>> apply(PCollection<T> input) {
- return
- input
- .apply(ParDo.named("Init").of(new DoFn<T, KV<T, Void>>() {
- @Override
- public void processElement(ProcessContext c) {
- c.output(KV.of(c.element(), (Void) null));
- }
- }))
- .apply(Count.<T, Void>perKey());
- }
- }
-
- /**
- * A {@link CombineFn} that counts elements.
- */
- private static class CountFn<T> extends CombineFn<T, Long, Long> {
-
- @Override
- public Long createAccumulator() {
- return 0L;
- }
-
- @Override
- public Long addInput(Long accumulator, T input) {
- return accumulator + 1;
- }
-
- @Override
- public Long mergeAccumulators(Iterable<Long> accumulators) {
- long result = 0L;
- for (Long accum : accumulators) {
- result += accum;
- }
- return result;
- }
-
- @Override
- public Long extractOutput(Long accumulator) {
- return accumulator;
- }
- }
-}
[16/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/PaneInfo.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/PaneInfo.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/PaneInfo.java
deleted file mode 100644
index 18f7a97..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/PaneInfo.java
+++ /dev/null
@@ -1,384 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.coders.AtomicCoder;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
-import com.google.cloud.dataflow.sdk.util.VarInt;
-import com.google.common.base.MoreObjects;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.ImmutableMap;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Objects;
-
-/**
- * Provides information about the pane an element belongs to. Every pane is implicitly associated
- * with a window. Panes are observable only via the
- * {@link com.google.cloud.dataflow.sdk.transforms.DoFn.ProcessContext#pane} method of the context
- * passed to a {@link DoFn#processElement} overridden method.
- *
- * <p>Note: This does not uniquely identify a pane, and should not be used for comparisons.
- */
-public final class PaneInfo {
- /**
- * Enumerates the possibilities for the timing of this pane firing related to the
- * input and output watermarks for its computation.
- *
- * <p>A window may fire multiple panes, and the timing of those panes generally follows the
- * regular expression {@code EARLY* ON_TIME? LATE*}. Generally a pane is considered:
- * <ol>
- * <li>{@code EARLY} if the system cannot be sure it has seen all data which may contribute
- * to the pane's window.
- * <li>{@code ON_TIME} if the system predicts it has seen all the data which may contribute
- * to the pane's window.
- * <li>{@code LATE} if the system has encountered new data after predicting no more could arrive.
- * It is possible an {@code ON_TIME} pane has already been emitted, in which case any
- * following panes are considered {@code LATE}.
- * </ol>
- *
- * <p>Only an
- * {@link AfterWatermark#pastEndOfWindow} trigger may produce an {@code ON_TIME} pane.
- * With merging {@link WindowFn}'s, windows may be merged to produce new windows that satisfy
- * their own instance of the above regular expression. The only guarantee is that once a window
- * produces a final pane, it will not be merged into any new windows.
- *
- * <p>The predictions above are made using the mechanism of watermarks.
- * See {@link com.google.cloud.dataflow.sdk.util.TimerInternals} for more information
- * about watermarks.
- *
- * <p>We can state some properties of {@code LATE} and {@code ON_TIME} panes, but first need some
- * definitions:
- * <ol>
- * <li>We'll call a pipeline 'simple' if it does not use
- * {@link com.google.cloud.dataflow.sdk.transforms.DoFn.Context#outputWithTimestamp} in
- * any {@code DoFn}, and it uses the same
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.Window.Bound#withAllowedLateness}
- * argument value on all windows (or uses the default of {@link org.joda.time.Duration#ZERO}).
- * <li>We'll call an element 'locally late', from the point of view of a computation on a
- * worker, if the element's timestamp is before the input watermark for that computation
- * on that worker. The element is otherwise 'locally on-time'.
- * <li>We'll say 'the pane's timestamp' to mean the timestamp of the element produced to
- * represent the pane's contents.
- * </ol>
- *
- * <p>Then in simple pipelines:
- * <ol>
- * <li> (Soundness) An {@code ON_TIME} pane can never cause a later computation to generate a
- * {@code LATE} pane. (If it did, it would imply a later computation's input watermark progressed
- * ahead of an earlier stage's output watermark, which by design is not possible.)
- * <li> (Liveness) An {@code ON_TIME} pane is emitted as soon as possible after the input
- * watermark passes the end of the pane's window.
- * <li> (Consistency) A pane with only locally on-time elements will always be {@code ON_TIME}.
- * And a {@code LATE} pane cannot contain locally on-time elements.
- * </ol>
- *
- * However, note that:
- * <ol>
- * <li> An {@code ON_TIME} pane may contain locally late elements. It may even contain only
- * locally late elements. Provided a locally late element finds its way into an {@code ON_TIME}
- * pane its lateness becomes unobservable.
- * <li> A {@code LATE} pane does not necessarily cause any following computation panes to be
- * marked as {@code LATE}.
- * </ol>
- */
- public enum Timing {
- /**
- * Pane was fired before the input watermark had progressed after the end of the window.
- */
- EARLY,
- /**
- * Pane was fired by a {@link AfterWatermark#pastEndOfWindow} trigger because the input
- * watermark progressed after the end of the window. However the output watermark has not
- * yet progressed after the end of the window. Thus it is still possible to assign a timestamp
- * to the element representing this pane which cannot be considered locally late by any
- * following computation.
- */
- ON_TIME,
- /**
- * Pane was fired after the output watermark had progressed past the end of the window.
- */
- LATE,
- /**
- * This element was not produced in a triggered pane and its relation to input and
- * output watermarks is unknown.
- */
- UNKNOWN;
-
- // NOTE: Do not add fields or re-order them. The ordinal is used as part of
- // the encoding.
- }
-
- private static byte encodedByte(boolean isFirst, boolean isLast, Timing timing) {
- byte result = 0x0;
- if (isFirst) {
- result |= 1;
- }
- if (isLast) {
- result |= 2;
- }
- result |= timing.ordinal() << 2;
- return result;
- }
-
- private static final ImmutableMap<Byte, PaneInfo> BYTE_TO_PANE_INFO;
- static {
- ImmutableMap.Builder<Byte, PaneInfo> decodingBuilder = ImmutableMap.builder();
- for (Timing timing : Timing.values()) {
- long onTimeIndex = timing == Timing.EARLY ? -1 : 0;
- register(decodingBuilder, new PaneInfo(true, true, timing, 0, onTimeIndex));
- register(decodingBuilder, new PaneInfo(true, false, timing, 0, onTimeIndex));
- register(decodingBuilder, new PaneInfo(false, true, timing, -1, onTimeIndex));
- register(decodingBuilder, new PaneInfo(false, false, timing, -1, onTimeIndex));
- }
- BYTE_TO_PANE_INFO = decodingBuilder.build();
- }
-
- private static void register(ImmutableMap.Builder<Byte, PaneInfo> builder, PaneInfo info) {
- builder.put(info.encodedByte, info);
- }
-
- private final byte encodedByte;
-
- private final boolean isFirst;
- private final boolean isLast;
- private final Timing timing;
- private final long index;
- private final long nonSpeculativeIndex;
-
- /**
- * {@code PaneInfo} to use for elements on (and before) initial window assignemnt (including
- * elements read from sources) before they have passed through a {@link GroupByKey} and are
- * associated with a particular trigger firing.
- */
- public static final PaneInfo NO_FIRING =
- PaneInfo.createPane(true, true, Timing.UNKNOWN, 0, 0);
-
- /**
- * {@code PaneInfo} to use when there will be exactly one firing and it is on time.
- */
- public static final PaneInfo ON_TIME_AND_ONLY_FIRING =
- PaneInfo.createPane(true, true, Timing.ON_TIME, 0, 0);
-
- private PaneInfo(boolean isFirst, boolean isLast, Timing timing, long index, long onTimeIndex) {
- this.encodedByte = encodedByte(isFirst, isLast, timing);
- this.isFirst = isFirst;
- this.isLast = isLast;
- this.timing = timing;
- this.index = index;
- this.nonSpeculativeIndex = onTimeIndex;
- }
-
- public static PaneInfo createPane(boolean isFirst, boolean isLast, Timing timing) {
- Preconditions.checkArgument(isFirst, "Indices must be provided for non-first pane info.");
- return createPane(isFirst, isLast, timing, 0, timing == Timing.EARLY ? -1 : 0);
- }
-
- /**
- * Factory method to create a {@link PaneInfo} with the specified parameters.
- */
- public static PaneInfo createPane(
- boolean isFirst, boolean isLast, Timing timing, long index, long onTimeIndex) {
- if (isFirst || timing == Timing.UNKNOWN) {
- return Preconditions.checkNotNull(
- BYTE_TO_PANE_INFO.get(encodedByte(isFirst, isLast, timing)));
- } else {
- return new PaneInfo(isFirst, isLast, timing, index, onTimeIndex);
- }
- }
-
- public static PaneInfo decodePane(byte encodedPane) {
- return Preconditions.checkNotNull(BYTE_TO_PANE_INFO.get(encodedPane));
- }
-
- /**
- * Return true if there is no timing information for the current {@link PaneInfo}.
- * This typically indicates that the current element has not been assigned to
- * windows or passed through an operation that executes triggers yet.
- */
- public boolean isUnknown() {
- return Timing.UNKNOWN.equals(timing);
- }
-
- /**
- * Return true if this is the first pane produced for the associated window.
- */
- public boolean isFirst() {
- return isFirst;
- }
-
- /**
- * Return true if this is the last pane that will be produced in the associated window.
- */
- public boolean isLast() {
- return isLast;
- }
-
- /**
- * Return true if this is the last pane that will be produced in the associated window.
- */
- public Timing getTiming() {
- return timing;
- }
-
- /**
- * The zero-based index of this trigger firing that produced this pane.
- *
- * <p>This will return 0 for the first time the timer fires, 1 for the next time, etc.
- *
- * <p>A given (key, window, pane-index) is guaranteed to be unique in the
- * output of a group-by-key operation.
- */
- public long getIndex() {
- return index;
- }
-
- /**
- * The zero-based index of this trigger firing among non-speculative panes.
- *
- * <p> This will return 0 for the first non-{@link Timing#EARLY} timer firing, 1 for the next one,
- * etc.
- *
- * <p>Always -1 for speculative data.
- */
- public long getNonSpeculativeIndex() {
- return nonSpeculativeIndex;
- }
-
- int getEncodedByte() {
- return encodedByte;
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(encodedByte, index, nonSpeculativeIndex);
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj) {
- // Simple PaneInfos are interned.
- return true;
- } else if (obj instanceof PaneInfo) {
- PaneInfo that = (PaneInfo) obj;
- return this.encodedByte == that.encodedByte
- && this.index == that.index
- && this.nonSpeculativeIndex == that.nonSpeculativeIndex;
- } else {
- return false;
- }
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(getClass())
- .omitNullValues()
- .add("isFirst", isFirst ? true : null)
- .add("isLast", isLast ? true : null)
- .add("timing", timing)
- .add("index", index)
- .add("onTimeIndex", nonSpeculativeIndex != -1 ? nonSpeculativeIndex : null)
- .toString();
- }
-
- /**
- * A Coder for encoding PaneInfo instances.
- */
- public static class PaneInfoCoder extends AtomicCoder<PaneInfo> {
- private static enum Encoding {
- FIRST,
- ONE_INDEX,
- TWO_INDICES;
-
- // NOTE: Do not reorder fields. The ordinal is used as part of
- // the encoding.
-
- public final byte tag;
-
- private Encoding() {
- assert ordinal() < 16;
- tag = (byte) (ordinal() << 4);
- }
-
- public static Encoding fromTag(byte b) {
- return Encoding.values()[b >> 4];
- }
- }
-
- private Encoding chooseEncoding(PaneInfo value) {
- if (value.index == 0 && value.nonSpeculativeIndex == 0 || value.timing == Timing.UNKNOWN) {
- return Encoding.FIRST;
- } else if (value.index == value.nonSpeculativeIndex || value.timing == Timing.EARLY) {
- return Encoding.ONE_INDEX;
- } else {
- return Encoding.TWO_INDICES;
- }
- }
-
- public static final PaneInfoCoder INSTANCE = new PaneInfoCoder();
-
- @Override
- public void encode(PaneInfo value, final OutputStream outStream, Coder.Context context)
- throws CoderException, IOException {
- Encoding encoding = chooseEncoding(value);
- switch (chooseEncoding(value)) {
- case FIRST:
- outStream.write(value.encodedByte);
- break;
- case ONE_INDEX:
- outStream.write(value.encodedByte | encoding.tag);
- VarInt.encode(value.index, outStream);
- break;
- case TWO_INDICES:
- outStream.write(value.encodedByte | encoding.tag);
- VarInt.encode(value.index, outStream);
- VarInt.encode(value.nonSpeculativeIndex, outStream);
- break;
- default:
- throw new CoderException("Unknown encoding " + encoding);
- }
- }
-
- @Override
- public PaneInfo decode(final InputStream inStream, Coder.Context context)
- throws CoderException, IOException {
- byte keyAndTag = (byte) inStream.read();
- PaneInfo base = BYTE_TO_PANE_INFO.get((byte) (keyAndTag & 0x0F));
- long index, onTimeIndex;
- switch (Encoding.fromTag(keyAndTag)) {
- case FIRST:
- return base;
- case ONE_INDEX:
- index = VarInt.decodeLong(inStream);
- onTimeIndex = base.timing == Timing.EARLY ? -1 : index;
- break;
- case TWO_INDICES:
- index = VarInt.decodeLong(inStream);
- onTimeIndex = VarInt.decodeLong(inStream);
- break;
- default:
- throw new CoderException("Unknown encoding " + (keyAndTag & 0xF0));
- }
- return new PaneInfo(base.isFirst, base.isLast, base.timing, index, onTimeIndex);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/PartitioningWindowFn.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/PartitioningWindowFn.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/PartitioningWindowFn.java
deleted file mode 100644
index bea0285..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/PartitioningWindowFn.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import org.joda.time.Instant;
-
-import java.util.Arrays;
-import java.util.Collection;
-
-/**
- * A {@link WindowFn} that places each value into exactly one window based on its timestamp and
- * never merges windows.
- *
- * @param <T> type of elements being windowed
- * @param <W> window type
- */
-public abstract class PartitioningWindowFn<T, W extends BoundedWindow>
- extends NonMergingWindowFn<T, W> {
- /**
- * Returns the single window to which elements with this timestamp belong.
- */
- public abstract W assignWindow(Instant timestamp);
-
- @Override
- public final Collection<W> assignWindows(AssignContext c) {
- return Arrays.asList(assignWindow(c.timestamp()));
- }
-
- @Override
- public W getSideInputWindow(final BoundedWindow window) {
- if (window instanceof GlobalWindow) {
- throw new IllegalArgumentException(
- "Attempted to get side input window for GlobalWindow from non-global WindowFn");
- }
- return assignWindow(window.maxTimestamp());
- }
-
- @Override
- public boolean assignsToSingleWindow() {
- return true;
- }
-
- @Override
- public Instant getOutputTime(Instant inputTimestamp, W window) {
- return inputTimestamp;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/Repeatedly.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/Repeatedly.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/Repeatedly.java
deleted file mode 100644
index e77e2a1..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/Repeatedly.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.util.ExecutableTrigger;
-
-import org.joda.time.Instant;
-
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * Repeat a trigger, either until some condition is met or forever.
- *
- * <p>For example, to fire after the end of the window, and every time late data arrives:
- * <pre> {@code
- * Repeatedly.forever(AfterWatermark.pastEndOfWindow());
- * } </pre>
- *
- * <p>{@code Repeatedly.forever(someTrigger)} behaves like an infinite
- * {@code AfterEach.inOrder(someTrigger, someTrigger, someTrigger, ...)}.
- *
- * @param <W> {@link BoundedWindow} subclass used to represent the windows used by this
- * {@code Trigger}
- */
-public class Repeatedly<W extends BoundedWindow> extends Trigger<W> {
-
- private static final int REPEATED = 0;
-
- /**
- * Create a composite trigger that repeatedly executes the trigger {@code repeated}, firing each
- * time it fires and ignoring any indications to finish.
- *
- * <p>Unless used with {@link Trigger#orFinally} the composite trigger will never finish.
- *
- * @param repeated the trigger to execute repeatedly.
- */
- public static <W extends BoundedWindow> Repeatedly<W> forever(Trigger<W> repeated) {
- return new Repeatedly<W>(repeated);
- }
-
- private Repeatedly(Trigger<W> repeated) {
- super(Arrays.asList(repeated));
- }
-
-
- @Override
- public void onElement(OnElementContext c) throws Exception {
- getRepeated(c).invokeOnElement(c);
- }
-
- @Override
- public void onMerge(OnMergeContext c) throws Exception {
- getRepeated(c).invokeOnMerge(c);
- }
-
- @Override
- public Instant getWatermarkThatGuaranteesFiring(W window) {
- // This trigger fires once the repeated trigger fires.
- return subTriggers.get(REPEATED).getWatermarkThatGuaranteesFiring(window);
- }
-
- @Override
- public Trigger<W> getContinuationTrigger(List<Trigger<W>> continuationTriggers) {
- return new Repeatedly<W>(continuationTriggers.get(REPEATED));
- }
-
- @Override
- public boolean shouldFire(Trigger<W>.TriggerContext context) throws Exception {
- return getRepeated(context).invokeShouldFire(context);
- }
-
- @Override
- public void onFire(TriggerContext context) throws Exception {
- getRepeated(context).invokeOnFire(context);
-
- if (context.trigger().isFinished(REPEATED)) {
- context.trigger().setFinished(false, REPEATED);
- getRepeated(context).invokeClear(context);
- }
- }
-
- private ExecutableTrigger<W> getRepeated(TriggerContext context) {
- return context.trigger().subTrigger(REPEATED);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/Sessions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/Sessions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/Sessions.java
deleted file mode 100644
index da137c1..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/Sessions.java
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-
-import org.joda.time.Duration;
-
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Objects;
-
-/**
- * A {@link WindowFn} windowing values into sessions separated by {@link #gapDuration}-long
- * periods with no elements.
- *
- * <p>For example, in order to window data into sessions with at least 10 minute
- * gaps in between them:
- * <pre> {@code
- * PCollection<Integer> pc = ...;
- * PCollection<Integer> windowed_pc = pc.apply(
- * Window.<Integer>into(Sessions.withGapDuration(Duration.standardMinutes(10))));
- * } </pre>
- */
-public class Sessions extends WindowFn<Object, IntervalWindow> {
- /**
- * Duration of the gaps between sessions.
- */
- private final Duration gapDuration;
-
- /**
- * Creates a {@code Sessions} {@link WindowFn} with the specified gap duration.
- */
- public static Sessions withGapDuration(Duration gapDuration) {
- return new Sessions(gapDuration);
- }
-
- /**
- * Creates a {@code Sessions} {@link WindowFn} with the specified gap duration.
- */
- private Sessions(Duration gapDuration) {
- this.gapDuration = gapDuration;
- }
-
- @Override
- public Collection<IntervalWindow> assignWindows(AssignContext c) {
- // Assign each element into a window from its timestamp until gapDuration in the
- // future. Overlapping windows (representing elements within gapDuration of
- // each other) will be merged.
- return Arrays.asList(new IntervalWindow(c.timestamp(), gapDuration));
- }
-
- @Override
- public void mergeWindows(MergeContext c) throws Exception {
- MergeOverlappingIntervalWindows.mergeWindows(c);
- }
-
- @Override
- public Coder<IntervalWindow> windowCoder() {
- return IntervalWindow.getCoder();
- }
-
- @Override
- public boolean isCompatible(WindowFn<?, ?> other) {
- return other instanceof Sessions;
- }
-
- @Override
- public IntervalWindow getSideInputWindow(BoundedWindow window) {
- throw new UnsupportedOperationException("Sessions is not allowed in side inputs");
- }
-
- @Experimental(Kind.OUTPUT_TIME)
- @Override
- public OutputTimeFn<? super IntervalWindow> getOutputTimeFn() {
- return OutputTimeFns.outputAtEarliestInputTimestamp();
- }
-
- public Duration getGapDuration() {
- return gapDuration;
- }
-
- @Override
- public boolean equals(Object object) {
- if (!(object instanceof Sessions)) {
- return false;
- }
- Sessions other = (Sessions) object;
- return getGapDuration().equals(other.getGapDuration());
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(gapDuration);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/SlidingWindows.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/SlidingWindows.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/SlidingWindows.java
deleted file mode 100644
index b0066d6..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/SlidingWindows.java
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.Objects;
-
-/**
- * A {@link WindowFn} that windows values into possibly overlapping fixed-size
- * timestamp-based windows.
- *
- * <p>For example, in order to window data into 10 minute windows that
- * update every minute:
- * <pre> {@code
- * PCollection<Integer> items = ...;
- * PCollection<Integer> windowedItems = items.apply(
- * Window.<Integer>into(SlidingWindows.of(Duration.standardMinutes(10))));
- * } </pre>
- */
-public class SlidingWindows extends NonMergingWindowFn<Object, IntervalWindow> {
-
- /**
- * Amount of time between generated windows.
- */
- private final Duration period;
-
- /**
- * Size of the generated windows.
- */
- private final Duration size;
-
- /**
- * Offset of the generated windows.
- * Windows start at time N * period + offset, where 0 is the epoch.
- */
- private final Duration offset;
-
- /**
- * Assigns timestamps into half-open intervals of the form
- * [N * period, N * period + size), where 0 is the epoch.
- *
- * <p>If {@link SlidingWindows#every} is not called, the period defaults
- * to the largest time unit smaller than the given duration. For example,
- * specifying a size of 5 seconds will result in a default period of 1 second.
- */
- public static SlidingWindows of(Duration size) {
- return new SlidingWindows(getDefaultPeriod(size), size, Duration.ZERO);
- }
-
- /**
- * Returns a new {@code SlidingWindows} with the original size, that assigns
- * timestamps into half-open intervals of the form
- * [N * period, N * period + size), where 0 is the epoch.
- */
- public SlidingWindows every(Duration period) {
- return new SlidingWindows(period, size, offset);
- }
-
- /**
- * Assigns timestamps into half-open intervals of the form
- * [N * period + offset, N * period + offset + size).
- *
- * @throws IllegalArgumentException if offset is not in [0, period)
- */
- public SlidingWindows withOffset(Duration offset) {
- return new SlidingWindows(period, size, offset);
- }
-
- private SlidingWindows(Duration period, Duration size, Duration offset) {
- if (offset.isShorterThan(Duration.ZERO)
- || !offset.isShorterThan(period)
- || !size.isLongerThan(Duration.ZERO)) {
- throw new IllegalArgumentException(
- "SlidingWindows WindowingStrategies must have 0 <= offset < period and 0 < size");
- }
- this.period = period;
- this.size = size;
- this.offset = offset;
- }
-
- @Override
- public Coder<IntervalWindow> windowCoder() {
- return IntervalWindow.getCoder();
- }
-
- @Override
- public Collection<IntervalWindow> assignWindows(AssignContext c) {
- List<IntervalWindow> windows =
- new ArrayList<>((int) (size.getMillis() / period.getMillis()));
- Instant timestamp = c.timestamp();
- long lastStart = lastStartFor(timestamp);
- for (long start = lastStart;
- start > timestamp.minus(size).getMillis();
- start -= period.getMillis()) {
- windows.add(new IntervalWindow(new Instant(start), size));
- }
- return windows;
- }
-
- /**
- * Return the earliest window that contains the end of the main-input window.
- */
- @Override
- public IntervalWindow getSideInputWindow(final BoundedWindow window) {
- if (window instanceof GlobalWindow) {
- throw new IllegalArgumentException(
- "Attempted to get side input window for GlobalWindow from non-global WindowFn");
- }
- long lastStart = lastStartFor(window.maxTimestamp().minus(size));
- return new IntervalWindow(new Instant(lastStart + period.getMillis()), size);
- }
-
- @Override
- public boolean isCompatible(WindowFn<?, ?> other) {
- return equals(other);
- }
-
- /**
- * Return the last start of a sliding window that contains the timestamp.
- */
- private long lastStartFor(Instant timestamp) {
- return timestamp.getMillis()
- - timestamp.plus(period).minus(offset).getMillis() % period.getMillis();
- }
-
- static Duration getDefaultPeriod(Duration size) {
- if (size.isLongerThan(Duration.standardHours(1))) {
- return Duration.standardHours(1);
- }
- if (size.isLongerThan(Duration.standardMinutes(1))) {
- return Duration.standardMinutes(1);
- }
- if (size.isLongerThan(Duration.standardSeconds(1))) {
- return Duration.standardSeconds(1);
- }
- return Duration.millis(1);
- }
-
- public Duration getPeriod() {
- return period;
- }
-
- public Duration getSize() {
- return size;
- }
-
- public Duration getOffset() {
- return offset;
- }
-
- /**
- * Ensures that later sliding windows have an output time that is past the end of earlier windows.
- *
- * <p>If this is the earliest sliding window containing {@code inputTimestamp}, that's fine.
- * Otherwise, we pick the earliest time that doesn't overlap with earlier windows.
- */
- @Experimental(Kind.OUTPUT_TIME)
- @Override
- public OutputTimeFn<? super IntervalWindow> getOutputTimeFn() {
- return new OutputTimeFn.Defaults<BoundedWindow>() {
- @Override
- public Instant assignOutputTime(Instant inputTimestamp, BoundedWindow window) {
- Instant startOfLastSegment = window.maxTimestamp().minus(period);
- return startOfLastSegment.isBefore(inputTimestamp)
- ? inputTimestamp
- : startOfLastSegment.plus(1);
- }
-
- @Override
- public boolean dependsOnlyOnEarliestInputTimestamp() {
- return true;
- }
- };
- }
-
- @Override
- public boolean equals(Object object) {
- if (!(object instanceof SlidingWindows)) {
- return false;
- }
- SlidingWindows other = (SlidingWindows) object;
- return getOffset().equals(other.getOffset())
- && getSize().equals(other.getSize())
- && getPeriod().equals(other.getPeriod());
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(size, offset, period);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/Trigger.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/Trigger.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/Trigger.java
deleted file mode 100644
index 4471563..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/Trigger.java
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.util.ExecutableTrigger;
-import com.google.cloud.dataflow.sdk.util.TimeDomain;
-import com.google.cloud.dataflow.sdk.util.state.MergingStateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.StateAccessor;
-import com.google.common.base.Joiner;
-
-import org.joda.time.Instant;
-
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Objects;
-
-import javax.annotation.Nullable;
-
-/**
- * {@code Trigger}s control when the elements for a specific key and window are output. As elements
- * arrive, they are put into one or more windows by a {@link Window} transform and its associated
- * {@link WindowFn}, and then passed to the associated {@code Trigger} to determine if the
- * {@code Window}'s contents should be output.
- *
- * <p>See {@link com.google.cloud.dataflow.sdk.transforms.GroupByKey} and {@link Window}
- * for more information about how grouping with windows works.
- *
- * <p>The elements that are assigned to a window since the last time it was fired (or since the
- * window was created) are placed into the current window pane. Triggers are evaluated against the
- * elements as they are added. When the root trigger fires, the elements in the current pane will be
- * output. When the root trigger finishes (indicating it will never fire again), the window is
- * closed and any new elements assigned to that window are discarded.
- *
- * <p>Several predefined {@code Trigger}s are provided:
- * <ul>
- * <li> {@link AfterWatermark} for firing when the watermark passes a timestamp determined from
- * either the end of the window or the arrival of the first element in a pane.
- * <li> {@link AfterProcessingTime} for firing after some amount of processing time has elapsed
- * (typically since the first element in a pane).
- * <li> {@link AfterPane} for firing off a property of the elements in the current pane, such as
- * the number of elements that have been assigned to the current pane.
- * </ul>
- *
- * <p>In addition, {@code Trigger}s can be combined in a variety of ways:
- * <ul>
- * <li> {@link Repeatedly#forever} to create a trigger that executes forever. Any time its
- * argument finishes it gets reset and starts over. Can be combined with
- * {@link Trigger#orFinally} to specify a condition that causes the repetition to stop.
- * <li> {@link AfterEach#inOrder} to execute each trigger in sequence, firing each (and every)
- * time that a trigger fires, and advancing to the next trigger in the sequence when it finishes.
- * <li> {@link AfterFirst#of} to create a trigger that fires after at least one of its arguments
- * fires. An {@link AfterFirst} trigger finishes after it fires once.
- * <li> {@link AfterAll#of} to create a trigger that fires after all of its arguments
- * have fired at least once. An {@link AfterAll} trigger finishes after it fires once.
- * </ul>
- *
- * <p>Each trigger tree is instantiated per-key and per-window. Every trigger in the tree is in one
- * of the following states:
- * <ul>
- * <li> Never Existed - before the trigger has started executing, there is no state associated
- * with it anywhere in the system. A trigger moves to the executing state as soon as it
- * processes in the current pane.
- * <li> Executing - while the trigger is receiving items and may fire. While it is in this state,
- * it may persist book-keeping information to persisted state, set timers, etc.
- * <li> Finished - after a trigger finishes, all of its book-keeping data is cleaned up, and the
- * system remembers only that it is finished. Entering this state causes us to discard any
- * elements in the buffer for that window, as well.
- * </ul>
- *
- * <p>Once finished, a trigger cannot return itself back to an earlier state; however, a composite
- * trigger could reset its sub-triggers.
- *
- * <p>Triggers should not build up any state internally since they may be recreated
- * between invocations of the callbacks. All important values should be persisted using
- * state before the callback returns.
- *
- * @param <W> {@link BoundedWindow} subclass used to represent the windows used by this
- * {@code Trigger}
- */
-@Experimental(Experimental.Kind.TRIGGER)
-public abstract class Trigger<W extends BoundedWindow> implements Serializable, TriggerBuilder<W> {
-
- /**
- * Interface for accessing information about the trigger being executed and other triggers in the
- * same tree.
- */
- public interface TriggerInfo<W extends BoundedWindow> {
-
- /**
- * Returns true if the windowing strategy of the current {@code PCollection} is a merging
- * WindowFn. If true, the trigger execution needs to keep enough information to support the
- * possibility of {@link Trigger#onMerge} being called. If false, {@link Trigger#onMerge} will
- * never be called.
- */
- boolean isMerging();
-
- /**
- * Access the executable versions of the sub-triggers of the current trigger.
- */
- Iterable<ExecutableTrigger<W>> subTriggers();
-
- /**
- * Access the executable version of the specified sub-trigger.
- */
- ExecutableTrigger<W> subTrigger(int subtriggerIndex);
-
- /**
- * Returns true if the current trigger is marked finished.
- */
- boolean isFinished();
-
- /**
- * Return true if the given subtrigger is marked finished.
- */
- boolean isFinished(int subtriggerIndex);
-
- /**
- * Returns true if all the sub-triggers of the current trigger are marked finished.
- */
- boolean areAllSubtriggersFinished();
-
- /**
- * Returns an iterable over the unfinished sub-triggers of the current trigger.
- */
- Iterable<ExecutableTrigger<W>> unfinishedSubTriggers();
-
- /**
- * Returns the first unfinished sub-trigger.
- */
- ExecutableTrigger<W> firstUnfinishedSubTrigger();
-
- /**
- * Clears all keyed state for triggers in the current sub-tree and unsets all the associated
- * finished bits.
- */
- void resetTree() throws Exception;
-
- /**
- * Sets the finished bit for the current trigger.
- */
- void setFinished(boolean finished);
-
- /**
- * Sets the finished bit for the given sub-trigger.
- */
- void setFinished(boolean finished, int subTriggerIndex);
- }
-
- /**
- * Interact with properties of the trigger being executed, with extensions to deal with the
- * merging windows.
- */
- public interface MergingTriggerInfo<W extends BoundedWindow> extends TriggerInfo<W> {
-
- /** Return true if the trigger is finished in any window being merged. */
- public abstract boolean finishedInAnyMergingWindow();
-
- /** Return true if the trigger is finished in all windows being merged. */
- public abstract boolean finishedInAllMergingWindows();
-
- /** Return the merging windows in which the trigger is finished. */
- public abstract Iterable<W> getFinishedMergingWindows();
- }
-
- /**
- * Information accessible to all operational hooks in this {@code Trigger}.
- *
- * <p>Used directly in {@link Trigger#shouldFire} and {@link Trigger#clear}, and
- * extended with additional information in other methods.
- */
- public abstract class TriggerContext {
-
- /** Returns the interface for accessing trigger info. */
- public abstract TriggerInfo<W> trigger();
-
- /** Returns the interface for accessing persistent state. */
- public abstract StateAccessor<?> state();
-
- /** The window that the current context is executing in. */
- public abstract W window();
-
- /** Create a sub-context for the given sub-trigger. */
- public abstract TriggerContext forTrigger(ExecutableTrigger<W> trigger);
-
- /**
- * Removes the timer set in this trigger context for the given {@link Instant}
- * and {@link TimeDomain}.
- */
- public abstract void deleteTimer(Instant timestamp, TimeDomain domain);
-
- /** The current processing time. */
- public abstract Instant currentProcessingTime();
-
- /** The current synchronized upstream processing time or {@code null} if unknown. */
- @Nullable
- public abstract Instant currentSynchronizedProcessingTime();
-
- /** The current event time for the input or {@code null} if unknown. */
- @Nullable
- public abstract Instant currentEventTime();
- }
-
- /**
- * Extended {@link TriggerContext} containing information accessible to the {@link #onElement}
- * operational hook.
- */
- public abstract class OnElementContext extends TriggerContext {
- /** The event timestamp of the element currently being processed. */
- public abstract Instant eventTimestamp();
-
- /**
- * Sets a timer to fire when the watermark or processing time is beyond the given timestamp.
- * Timers are not guaranteed to fire immediately, but will be delivered at some time afterwards.
- *
- * <p>As with {@link #state}, timers are implicitly scoped to the current window. All
- * timer firings for a window will be received, but the implementation should choose to ignore
- * those that are not applicable.
- *
- * @param timestamp the time at which the trigger should be re-evaluated
- * @param domain the domain that the {@code timestamp} applies to
- */
- public abstract void setTimer(Instant timestamp, TimeDomain domain);
-
- /** Create an {@code OnElementContext} for executing the given trigger. */
- @Override
- public abstract OnElementContext forTrigger(ExecutableTrigger<W> trigger);
- }
-
- /**
- * Extended {@link TriggerContext} containing information accessible to the {@link #onMerge}
- * operational hook.
- */
- public abstract class OnMergeContext extends TriggerContext {
- /**
- * Sets a timer to fire when the watermark or processing time is beyond the given timestamp.
- * Timers are not guaranteed to fire immediately, but will be delivered at some time afterwards.
- *
- * <p>As with {@link #state}, timers are implicitly scoped to the current window. All
- * timer firings for a window will be received, but the implementation should choose to ignore
- * those that are not applicable.
- *
- * @param timestamp the time at which the trigger should be re-evaluated
- * @param domain the domain that the {@code timestamp} applies to
- */
- public abstract void setTimer(Instant timestamp, TimeDomain domain);
-
- /** Create an {@code OnMergeContext} for executing the given trigger. */
- @Override
- public abstract OnMergeContext forTrigger(ExecutableTrigger<W> trigger);
-
- @Override
- public abstract MergingStateAccessor<?, W> state();
-
- @Override
- public abstract MergingTriggerInfo<W> trigger();
- }
-
- @Nullable
- protected final List<Trigger<W>> subTriggers;
-
- protected Trigger(@Nullable List<Trigger<W>> subTriggers) {
- this.subTriggers = subTriggers;
- }
-
-
- /**
- * Called immediately after an element is first incorporated into a window.
- */
- public abstract void onElement(OnElementContext c) throws Exception;
-
- /**
- * Called immediately after windows have been merged.
- *
- * <p>Leaf triggers should update their state by inspecting their status and any state
- * in the merging windows. Composite triggers should update their state by calling
- * {@link ExecutableTrigger#invokeOnMerge} on their sub-triggers, and applying appropriate logic.
- *
- * <p>A trigger such as {@link AfterWatermark#pastEndOfWindow} may no longer be finished;
- * it is the responsibility of the trigger itself to record this fact. It is forbidden for
- * a trigger to become finished due to {@link #onMerge}, as it has not yet fired the pending
- * elements that led to it being ready to fire.
- *
- * <p>The implementation does not need to clear out any state associated with the old windows.
- */
- public abstract void onMerge(OnMergeContext c) throws Exception;
-
- /**
- * Returns {@code true} if the current state of the trigger indicates that its condition
- * is satisfied and it is ready to fire.
- */
- public abstract boolean shouldFire(TriggerContext context) throws Exception;
-
- /**
- * Adjusts the state of the trigger to be ready for the next pane. For example, a
- * {@link Repeatedly} trigger will reset its inner trigger, since it has fired.
- *
- * <p>If the trigger is finished, it is the responsibility of the trigger itself to
- * record that fact via the {@code context}.
- */
- public abstract void onFire(TriggerContext context) throws Exception;
-
- /**
- * Called to allow the trigger to prefetch any state it will likely need to read from during
- * an {@link #onElement} call.
- */
- public void prefetchOnElement(StateAccessor<?> state) {
- if (subTriggers != null) {
- for (Trigger<W> trigger : subTriggers) {
- trigger.prefetchOnElement(state);
- }
- }
- }
-
- /**
- * Called to allow the trigger to prefetch any state it will likely need to read from during
- * an {@link #onMerge} call.
- */
- public void prefetchOnMerge(MergingStateAccessor<?, W> state) {
- if (subTriggers != null) {
- for (Trigger<W> trigger : subTriggers) {
- trigger.prefetchOnMerge(state);
- }
- }
- }
-
- /**
- * Called to allow the trigger to prefetch any state it will likely need to read from during
- * an {@link #shouldFire} call.
- */
- public void prefetchShouldFire(StateAccessor<?> state) {
- if (subTriggers != null) {
- for (Trigger<W> trigger : subTriggers) {
- trigger.prefetchShouldFire(state);
- }
- }
- }
-
- /**
- * Called to allow the trigger to prefetch any state it will likely need to read from during
- * an {@link #onFire} call.
- */
- public void prefetchOnFire(StateAccessor<?> state) {
- if (subTriggers != null) {
- for (Trigger<W> trigger : subTriggers) {
- trigger.prefetchOnFire(state);
- }
- }
- }
-
- /**
- * Clear any state associated with this trigger in the given window.
- *
- * <p>This is called after a trigger has indicated it will never fire again. The trigger system
- * keeps enough information to know that the trigger is finished, so this trigger should clear all
- * of its state.
- */
- public void clear(TriggerContext c) throws Exception {
- if (subTriggers != null) {
- for (ExecutableTrigger<W> trigger : c.trigger().subTriggers()) {
- trigger.invokeClear(c);
- }
- }
- }
-
- public Iterable<Trigger<W>> subTriggers() {
- return subTriggers;
- }
-
- /**
- * Return a trigger to use after a {@code GroupByKey} to preserve the
- * intention of this trigger. Specifically, triggers that are time based
- * and intended to provide speculative results should continue providing
- * speculative results. Triggers that fire once (or multiple times) should
- * continue firing once (or multiple times).
- */
- public Trigger<W> getContinuationTrigger() {
- if (subTriggers == null) {
- return getContinuationTrigger(null);
- }
-
- List<Trigger<W>> subTriggerContinuations = new ArrayList<>();
- for (Trigger<W> subTrigger : subTriggers) {
- subTriggerContinuations.add(subTrigger.getContinuationTrigger());
- }
- return getContinuationTrigger(subTriggerContinuations);
- }
-
- /**
- * Return the {@link #getContinuationTrigger} of this {@code Trigger}. For convenience, this
- * is provided the continuation trigger of each of the sub-triggers.
- */
- protected abstract Trigger<W> getContinuationTrigger(List<Trigger<W>> continuationTriggers);
-
- /**
- * Returns a bound in watermark time by which this trigger would have fired at least once
- * for a given window had there been input data. This is a static property of a trigger
- * that does not depend on its state.
- *
- * <p>For triggers that do not fire based on the watermark advancing, returns
- * {@link BoundedWindow#TIMESTAMP_MAX_VALUE}.
- *
- * <p>This estimate is used to determine that there are no elements in a side-input window, which
- * causes the default value to be used instead.
- */
- public abstract Instant getWatermarkThatGuaranteesFiring(W window);
-
- /**
- * Returns whether this performs the same triggering as the given {@code Trigger}.
- */
- public boolean isCompatible(Trigger<?> other) {
- if (!getClass().equals(other.getClass())) {
- return false;
- }
-
- if (subTriggers == null) {
- return other.subTriggers == null;
- } else if (other.subTriggers == null) {
- return false;
- } else if (subTriggers.size() != other.subTriggers.size()) {
- return false;
- }
-
- for (int i = 0; i < subTriggers.size(); i++) {
- if (!subTriggers.get(i).isCompatible(other.subTriggers.get(i))) {
- return false;
- }
- }
-
- return true;
- }
-
- @Override
- public String toString() {
- String simpleName = getClass().getSimpleName();
- if (getClass().getEnclosingClass() != null) {
- simpleName = getClass().getEnclosingClass().getSimpleName() + "." + simpleName;
- }
- if (subTriggers == null || subTriggers.size() == 0) {
- return simpleName;
- } else {
- return simpleName + "(" + Joiner.on(", ").join(subTriggers) + ")";
- }
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj) {
- return true;
- }
- if (!(obj instanceof Trigger)) {
- return false;
- }
- @SuppressWarnings("unchecked")
- Trigger<W> that = (Trigger<W>) obj;
- return Objects.equals(getClass(), that.getClass())
- && Objects.equals(subTriggers, that.subTriggers);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(getClass(), subTriggers);
- }
-
- /**
- * Specify an ending condition for this trigger. If the {@code until} fires then the combination
- * fires.
- *
- * <p>The expression {@code t1.orFinally(t2)} fires every time {@code t1} fires, and finishes
- * as soon as either {@code t1} finishes or {@code t2} fires, in which case it fires one last time
- * for {@code t2}. Both {@code t1} and {@code t2} are executed in parallel. This means that
- * {@code t1} may have fired since {@code t2} started, so not all of the elements that {@code t2}
- * has seen are necessarily in the current pane.
- *
- * <p>For example the final firing of the following trigger may only have 1 element:
- * <pre> {@code
- * Repeatedly.forever(AfterPane.elementCountAtLeast(2))
- * .orFinally(AfterPane.elementCountAtLeast(5))
- * } </pre>
- *
- * <p>Note that if {@code t1} is {@link OnceTrigger}, then {@code t1.orFinally(t2)} is the same
- * as {@code AfterFirst.of(t1, t2)}.
- */
- public Trigger<W> orFinally(OnceTrigger<W> until) {
- return new OrFinallyTrigger<W>(this, until);
- }
-
- @Override
- public Trigger<W> buildTrigger() {
- return this;
- }
-
- /**
- * {@link Trigger}s that are guaranteed to fire at most once should extend from this, rather
- * than the general {@link Trigger} class to indicate that behavior.
- *
- * @param <W> {@link BoundedWindow} subclass used to represent the windows used by this
- * {@code OnceTrigger}
- */
- public abstract static class OnceTrigger<W extends BoundedWindow> extends Trigger<W> {
- protected OnceTrigger(List<Trigger<W>> subTriggers) {
- super(subTriggers);
- }
-
- @Override
- public final OnceTrigger<W> getContinuationTrigger() {
- Trigger<W> continuation = super.getContinuationTrigger();
- if (!(continuation instanceof OnceTrigger)) {
- throw new IllegalStateException("Continuation of a OnceTrigger must be a OnceTrigger");
- }
- return (OnceTrigger<W>) continuation;
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public final void onFire(TriggerContext context) throws Exception {
- onOnlyFiring(context);
- context.trigger().setFinished(true);
- }
-
- /**
- * Called exactly once by {@link #onFire} when the trigger is fired. By default,
- * invokes {@link #onFire} on all subtriggers for which {@link #shouldFire} is {@code true}.
- */
- protected abstract void onOnlyFiring(TriggerContext context) throws Exception;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/TriggerBuilder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/TriggerBuilder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/TriggerBuilder.java
deleted file mode 100644
index cc817ba..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/TriggerBuilder.java
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-/**
- * Anything that can be used to create an instance of a {@code Trigger} implements this interface.
- *
- * <p>This includes {@code Trigger}s (which can return themselves) and any "enhanced" syntax for
- * constructing a trigger.
- *
- * @param <W> The type of windows the built trigger will operate on.
- */
-public interface TriggerBuilder<W extends BoundedWindow> {
- /** Return the {@code Trigger} built by this builder. */
- Trigger<W> buildTrigger();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/Window.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/Window.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/Window.java
deleted file mode 100644
index 6793e76..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/Window.java
+++ /dev/null
@@ -1,662 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.Coder.NonDeterministicException;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.util.AssignWindowsDoFn;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy.AccumulationMode;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.joda.time.Duration;
-
-import javax.annotation.Nullable;
-
-/**
- * {@code Window} logically divides up or groups the elements of a
- * {@link PCollection} into finite windows according to a {@link WindowFn}.
- * The output of {@code Window} contains the same elements as input, but they
- * have been logically assigned to windows. The next
- * {@link com.google.cloud.dataflow.sdk.transforms.GroupByKey GroupByKeys},
- * including one within composite transforms, will group by the combination of
- * keys and windows.
-
- * <p>See {@link com.google.cloud.dataflow.sdk.transforms.GroupByKey}
- * for more information about how grouping with windows works.
- *
- * <h2> Windowing </h2>
- *
- * <p>Windowing a {@code PCollection} divides the elements into windows based
- * on the associated event time for each element. This is especially useful
- * for {@code PCollection}s with unbounded size, since it allows operating on
- * a sub-group of the elements placed into a related window. For {@code PCollection}s
- * with a bounded size (aka. conventional batch mode), by default, all data is
- * implicitly in a single window, unless {@code Window} is applied.
- *
- * <p>For example, a simple form of windowing divides up the data into
- * fixed-width time intervals, using {@link FixedWindows}.
- * The following example demonstrates how to use {@code Window} in a pipeline
- * that counts the number of occurrences of strings each minute:
- *
- * <pre> {@code
- * PCollection<String> items = ...;
- * PCollection<String> windowed_items = items.apply(
- * Window.<String>into(FixedWindows.of(Duration.standardMinutes(1))));
- * PCollection<KV<String, Long>> windowed_counts = windowed_items.apply(
- * Count.<String>perElement());
- * } </pre>
- *
- * <p>Let (data, timestamp) denote a data element along with its timestamp.
- * Then, if the input to this pipeline consists of
- * {("foo", 15s), ("bar", 30s), ("foo", 45s), ("foo", 1m30s)},
- * the output will be
- * {(KV("foo", 2), 1m), (KV("bar", 1), 1m), (KV("foo", 1), 2m)}
- *
- * <p>Several predefined {@link WindowFn}s are provided:
- * <ul>
- * <li> {@link FixedWindows} partitions the timestamps into fixed-width intervals.
- * <li> {@link SlidingWindows} places data into overlapping fixed-width intervals.
- * <li> {@link Sessions} groups data into sessions where each item in a window
- * is separated from the next by no more than a specified gap.
- * </ul>
- *
- * <p>Additionally, custom {@link WindowFn}s can be created, by creating new
- * subclasses of {@link WindowFn}.
- *
- * <h2> Triggers </h2>
- *
- * <p>{@link Window.Bound#triggering(TriggerBuilder)} allows specifying a trigger to control when
- * (in processing time) results for the given window can be produced. If unspecified, the default
- * behavior is to trigger first when the watermark passes the end of the window, and then trigger
- * again every time there is late arriving data.
- *
- * <p>Elements are added to the current window pane as they arrive. When the root trigger fires,
- * output is produced based on the elements in the current pane.
- *
- * <p>Depending on the trigger, this can be used both to output partial results
- * early during the processing of the whole window, and to deal with late
- * arriving data in batches.
- *
- * <p>Continuing the earlier example, suppose we wanted to emit the values that were available
- * when the watermark passed the end of the window, and then output any late-arriving elements
- * once per hour of processing time until we have finished processing the next 24 hours of data.
- * (The use of watermark time to stop processing tends to be more robust if the data source is slow
- * for a few days, etc.) We could do the following:
- *
- * <pre> {@code
- * PCollection<String> items = ...;
- * PCollection<String> windowed_items = items.apply(
- * Window.<String>into(FixedWindows.of(Duration.standardMinutes(1)))
- * .triggering(
- * AfterWatermark.pastEndOfWindow()
- * .withLateFirings(AfterProcessingTime
- * .pastFirstElementInPane().plusDelayOf(Duration.standardHours(1))))
- * .withAllowedLateness(Duration.standardDays(1)));
- * PCollection<KV<String, Long>> windowed_counts = windowed_items.apply(
- * Count.<String>perElement());
- * } </pre>
- *
- * <p>On the other hand, if we wanted to get early results every minute of processing
- * time (for which there were new elements in the given window) we could do the following:
- *
- * <pre> {@code
- * PCollection<String> windowed_items = items.apply(
- * Window.<String>into(
- * FixedWindows.of(Duration.standardMinutes(1)))
- * .triggering(
- * AfterWatermark.pastEndOfWindow()
- * .withEarlyFirings(AfterProcessingTime
- * .pastFirstElementInPane().plusDelayOf(Duration.standardMinutes(1))))
- * .withAllowedLateness(Duration.ZERO));
- * } </pre>
- *
- * <p>After a {@link com.google.cloud.dataflow.sdk.transforms.GroupByKey} the trigger is set to
- * a trigger that will preserve the intent of the upstream trigger. See
- * {@link Trigger#getContinuationTrigger} for more information.
- *
- * <p>See {@link Trigger} for details on the available triggers.
- */
-public class Window {
-
- /**
- * Specifies the conditions under which a final pane will be created when a window is permanently
- * closed.
- */
- public enum ClosingBehavior {
- /**
- * Always fire the last pane. Even if there is no new data since the previous firing, an element
- * with {@link PaneInfo#isLast()} {@code true} will be produced.
- */
- FIRE_ALWAYS,
- /**
- * Only fire the last pane if there is new data since the previous firing.
- *
- * <p>This is the default behavior.
- */
- FIRE_IF_NON_EMPTY;
- }
-
- /**
- * Creates a {@code Window} {@code PTransform} with the given name.
- *
- * <p>See the discussion of Naming in
- * {@link com.google.cloud.dataflow.sdk.transforms.ParDo} for more explanation.
- *
- * <p>The resulting {@code PTransform} is incomplete, and its input/output
- * type is not yet bound. Use {@link Window.Unbound#into} to specify the
- * {@link WindowFn} to use, which will also bind the input/output type of this
- * {@code PTransform}.
- */
- public static Unbound named(String name) {
- return new Unbound().named(name);
- }
-
- /**
- * Creates a {@code Window} {@code PTransform} that uses the given
- * {@link WindowFn} to window the data.
- *
- * <p>The resulting {@code PTransform}'s types have been bound, with both the
- * input and output being a {@code PCollection<T>}, inferred from the types of
- * the argument {@code WindowFn}. It is ready to be applied, or further
- * properties can be set on it first.
- */
- public static <T> Bound<T> into(WindowFn<? super T, ?> fn) {
- return new Unbound().into(fn);
- }
-
- /**
- * Sets a non-default trigger for this {@code Window} {@code PTransform}.
- * Elements that are assigned to a specific window will be output when
- * the trigger fires.
- *
- * <p>Must also specify allowed lateness using {@link #withAllowedLateness} and accumulation
- * mode using either {@link #discardingFiredPanes()} or {@link #accumulatingFiredPanes()}.
- */
- @Experimental(Kind.TRIGGER)
- public static <T> Bound<T> triggering(TriggerBuilder<?> trigger) {
- return new Unbound().triggering(trigger);
- }
-
- /**
- * Returns a new {@code Window} {@code PTransform} that uses the registered WindowFn and
- * Triggering behavior, and that discards elements in a pane after they are triggered.
- *
- * <p>Does not modify this transform. The resulting {@code PTransform} is sufficiently
- * specified to be applied, but more properties can still be specified.
- */
- @Experimental(Kind.TRIGGER)
- public static <T> Bound<T> discardingFiredPanes() {
- return new Unbound().discardingFiredPanes();
- }
-
- /**
- * Returns a new {@code Window} {@code PTransform} that uses the registered WindowFn and
- * Triggering behavior, and that accumulates elements in a pane after they are triggered.
- *
- * <p>Does not modify this transform. The resulting {@code PTransform} is sufficiently
- * specified to be applied, but more properties can still be specified.
- */
- @Experimental(Kind.TRIGGER)
- public static <T> Bound<T> accumulatingFiredPanes() {
- return new Unbound().accumulatingFiredPanes();
- }
-
- /**
- * Override the amount of lateness allowed for data elements in the pipeline. Like
- * the other properties on this {@link Window} operation, this will be applied at
- * the next {@link GroupByKey}. Any elements that are later than this as decided by
- * the system-maintained watermark will be dropped.
- *
- * <p>This value also determines how long state will be kept around for old windows.
- * Once no elements will be added to a window (because this duration has passed) any state
- * associated with the window will be cleaned up.
- */
- @Experimental(Kind.TRIGGER)
- public static <T> Bound<T> withAllowedLateness(Duration allowedLateness) {
- return new Unbound().withAllowedLateness(allowedLateness);
- }
-
- /**
- * An incomplete {@code Window} transform, with unbound input/output type.
- *
- * <p>Before being applied, {@link Window.Unbound#into} must be
- * invoked to specify the {@link WindowFn} to invoke, which will also
- * bind the input/output type of this {@code PTransform}.
- */
- public static class Unbound {
- String name;
-
- Unbound() {}
-
- Unbound(String name) {
- this.name = name;
- }
-
- /**
- * Returns a new {@code Window} transform that's like this
- * transform but with the specified name. Does not modify this
- * transform. The resulting transform is still incomplete.
- *
- * <p>See the discussion of Naming in
- * {@link com.google.cloud.dataflow.sdk.transforms.ParDo} for more
- * explanation.
- */
- public Unbound named(String name) {
- return new Unbound(name);
- }
-
- /**
- * Returns a new {@code Window} {@code PTransform} that's like this
- * transform but that will use the given {@link WindowFn}, and that has
- * its input and output types bound. Does not modify this transform. The
- * resulting {@code PTransform} is sufficiently specified to be applied,
- * but more properties can still be specified.
- */
- public <T> Bound<T> into(WindowFn<? super T, ?> fn) {
- return new Bound<T>(name).into(fn);
- }
-
- /**
- * Sets a non-default trigger for this {@code Window} {@code PTransform}.
- * Elements that are assigned to a specific window will be output when
- * the trigger fires.
- *
- * <p>{@link com.google.cloud.dataflow.sdk.transforms.windowing.Trigger}
- * has more details on the available triggers.
- *
- * <p>Must also specify allowed lateness using {@link #withAllowedLateness} and accumulation
- * mode using either {@link #discardingFiredPanes()} or {@link #accumulatingFiredPanes()}.
- */
- @Experimental(Kind.TRIGGER)
- public <T> Bound<T> triggering(TriggerBuilder<?> trigger) {
- return new Bound<T>(name).triggering(trigger);
- }
-
- /**
- * Returns a new {@code Window} {@code PTransform} that uses the registered WindowFn and
- * Triggering behavior, and that discards elements in a pane after they are triggered.
- *
- * <p>Does not modify this transform. The resulting {@code PTransform} is sufficiently
- * specified to be applied, but more properties can still be specified.
- */
- @Experimental(Kind.TRIGGER)
- public <T> Bound<T> discardingFiredPanes() {
- return new Bound<T>(name).discardingFiredPanes();
- }
-
- /**
- * Returns a new {@code Window} {@code PTransform} that uses the registered WindowFn and
- * Triggering behavior, and that accumulates elements in a pane after they are triggered.
- *
- * <p>Does not modify this transform. The resulting {@code PTransform} is sufficiently
- * specified to be applied, but more properties can still be specified.
- */
- @Experimental(Kind.TRIGGER)
- public <T> Bound<T> accumulatingFiredPanes() {
- return new Bound<T>(name).accumulatingFiredPanes();
- }
-
- /**
- * Override the amount of lateness allowed for data elements in the pipeline. Like
- * the other properties on this {@link Window} operation, this will be applied at
- * the next {@link GroupByKey}. Any elements that are later than this as decided by
- * the system-maintained watermark will be dropped.
- *
- * <p>This value also determines how long state will be kept around for old windows.
- * Once no elements will be added to a window (because this duration has passed) any state
- * associated with the window will be cleaned up.
- *
- * <p>Depending on the trigger this may not produce a pane with {@link PaneInfo#isLast}. See
- * {@link ClosingBehavior#FIRE_IF_NON_EMPTY} for more details.
- */
- @Experimental(Kind.TRIGGER)
- public <T> Bound<T> withAllowedLateness(Duration allowedLateness) {
- return new Bound<T>(name).withAllowedLateness(allowedLateness);
- }
-
- /**
- * Override the amount of lateness allowed for data elements in the pipeline. Like
- * the other properties on this {@link Window} operation, this will be applied at
- * the next {@link GroupByKey}. Any elements that are later than this as decided by
- * the system-maintained watermark will be dropped.
- *
- * <p>This value also determines how long state will be kept around for old windows.
- * Once no elements will be added to a window (because this duration has passed) any state
- * associated with the window will be cleaned up.
- */
- @Experimental(Kind.TRIGGER)
- public <T> Bound<T> withAllowedLateness(Duration allowedLateness, ClosingBehavior behavior) {
- return new Bound<T>(name).withAllowedLateness(allowedLateness, behavior);
- }
- }
-
- /**
- * A {@code PTransform} that windows the elements of a {@code PCollection<T>},
- * into finite windows according to a user-specified {@code WindowFn}.
- *
- * @param <T> The type of elements this {@code Window} is applied to
- */
- public static class Bound<T> extends PTransform<PCollection<T>, PCollection<T>> {
-
- @Nullable private final WindowFn<? super T, ?> windowFn;
- @Nullable private final Trigger<?> trigger;
- @Nullable private final AccumulationMode mode;
- @Nullable private final Duration allowedLateness;
- @Nullable private final ClosingBehavior closingBehavior;
- @Nullable private final OutputTimeFn<?> outputTimeFn;
-
- private Bound(String name,
- @Nullable WindowFn<? super T, ?> windowFn, @Nullable Trigger<?> trigger,
- @Nullable AccumulationMode mode, @Nullable Duration allowedLateness,
- ClosingBehavior behavior, @Nullable OutputTimeFn<?> outputTimeFn) {
- super(name);
- this.windowFn = windowFn;
- this.trigger = trigger;
- this.mode = mode;
- this.allowedLateness = allowedLateness;
- this.closingBehavior = behavior;
- this.outputTimeFn = outputTimeFn;
- }
-
- private Bound(String name) {
- this(name, null, null, null, null, null, null);
- }
-
- /**
- * Returns a new {@code Window} {@code PTransform} that's like this
- * transform but that will use the given {@link WindowFn}, and that has
- * its input and output types bound. Does not modify this transform. The
- * resulting {@code PTransform} is sufficiently specified to be applied,
- * but more properties can still be specified.
- */
- private Bound<T> into(WindowFn<? super T, ?> windowFn) {
- try {
- windowFn.windowCoder().verifyDeterministic();
- } catch (NonDeterministicException e) {
- throw new IllegalArgumentException("Window coders must be deterministic.", e);
- }
-
- return new Bound<>(
- name, windowFn, trigger, mode, allowedLateness, closingBehavior, outputTimeFn);
- }
-
- /**
- * Returns a new {@code Window} {@code PTransform} that's like this
- * {@code PTransform} but with the specified name. Does not
- * modify this {@code PTransform}.
- *
- * <p>See the discussion of Naming in
- * {@link com.google.cloud.dataflow.sdk.transforms.ParDo} for more
- * explanation.
- */
- public Bound<T> named(String name) {
- return new Bound<>(
- name, windowFn, trigger, mode, allowedLateness, closingBehavior, outputTimeFn);
- }
-
- /**
- * Sets a non-default trigger for this {@code Window} {@code PTransform}.
- * Elements that are assigned to a specific window will be output when
- * the trigger fires.
- *
- * <p>{@link com.google.cloud.dataflow.sdk.transforms.windowing.Trigger}
- * has more details on the available triggers.
- *
- * <p>Must also specify allowed lateness using {@link #withAllowedLateness} and accumulation
- * mode using either {@link #discardingFiredPanes()} or {@link #accumulatingFiredPanes()}.
- */
- @Experimental(Kind.TRIGGER)
- public Bound<T> triggering(TriggerBuilder<?> trigger) {
- return new Bound<T>(
- name,
- windowFn,
- trigger.buildTrigger(),
- mode,
- allowedLateness,
- closingBehavior,
- outputTimeFn);
- }
-
- /**
- * Returns a new {@code Window} {@code PTransform} that uses the registered WindowFn and
- * Triggering behavior, and that discards elements in a pane after they are triggered.
- *
- * <p>Does not modify this transform. The resulting {@code PTransform} is sufficiently
- * specified to be applied, but more properties can still be specified.
- */
- @Experimental(Kind.TRIGGER)
- public Bound<T> discardingFiredPanes() {
- return new Bound<T>(
- name,
- windowFn,
- trigger,
- AccumulationMode.DISCARDING_FIRED_PANES,
- allowedLateness,
- closingBehavior,
- outputTimeFn);
- }
-
- /**
- * Returns a new {@code Window} {@code PTransform} that uses the registered WindowFn and
- * Triggering behavior, and that accumulates elements in a pane after they are triggered.
- *
- * <p>Does not modify this transform. The resulting {@code PTransform} is sufficiently
- * specified to be applied, but more properties can still be specified.
- */
- @Experimental(Kind.TRIGGER)
- public Bound<T> accumulatingFiredPanes() {
- return new Bound<T>(
- name,
- windowFn,
- trigger,
- AccumulationMode.ACCUMULATING_FIRED_PANES,
- allowedLateness,
- closingBehavior,
- outputTimeFn);
- }
-
- /**
- * Override the amount of lateness allowed for data elements in the pipeline. Like
- * the other properties on this {@link Window} operation, this will be applied at
- * the next {@link GroupByKey}. Any elements that are later than this as decided by
- * the system-maintained watermark will be dropped.
- *
- * <p>This value also determines how long state will be kept around for old windows.
- * Once no elements will be added to a window (because this duration has passed) any state
- * associated with the window will be cleaned up.
- *
- * <p>Depending on the trigger this may not produce a pane with {@link PaneInfo#isLast}. See
- * {@link ClosingBehavior#FIRE_IF_NON_EMPTY} for more details.
- */
- @Experimental(Kind.TRIGGER)
- public Bound<T> withAllowedLateness(Duration allowedLateness) {
- return new Bound<T>(
- name, windowFn, trigger, mode, allowedLateness, closingBehavior, outputTimeFn);
- }
-
- /**
- * <b><i>(Experimental)</i></b> Override the default {@link OutputTimeFn}, to control
- * the output timestamp of values output from a {@link GroupByKey} operation.
- */
- @Experimental(Kind.OUTPUT_TIME)
- public Bound<T> withOutputTimeFn(OutputTimeFn<?> outputTimeFn) {
- return new Bound<T>(
- name, windowFn, trigger, mode, allowedLateness, closingBehavior, outputTimeFn);
- }
-
- /**
- * Override the amount of lateness allowed for data elements in the pipeline. Like
- * the other properties on this {@link Window} operation, this will be applied at
- * the next {@link GroupByKey}. Any elements that are later than this as decided by
- * the system-maintained watermark will be dropped.
- *
- * <p>This value also determines how long state will be kept around for old windows.
- * Once no elements will be added to a window (because this duration has passed) any state
- * associated with the window will be cleaned up.
- */
- @Experimental(Kind.TRIGGER)
- public Bound<T> withAllowedLateness(Duration allowedLateness, ClosingBehavior behavior) {
- return new Bound<T>(name, windowFn, trigger, mode, allowedLateness, behavior, outputTimeFn);
- }
-
- /**
- * Get the output strategy of this {@link Window.Bound Window PTransform}. For internal use
- * only.
- */
- // Rawtype cast of OutputTimeFn cannot be eliminated with intermediate variable, as it is
- // casting between wildcards
- public WindowingStrategy<?, ?> getOutputStrategyInternal(
- WindowingStrategy<?, ?> inputStrategy) {
- WindowingStrategy<?, ?> result = inputStrategy;
- if (windowFn != null) {
- result = result.withWindowFn(windowFn);
- }
- if (trigger != null) {
- result = result.withTrigger(trigger);
- }
- if (mode != null) {
- result = result.withMode(mode);
- }
- if (allowedLateness != null) {
- result = result.withAllowedLateness(allowedLateness);
- }
- if (closingBehavior != null) {
- result = result.withClosingBehavior(closingBehavior);
- }
- if (outputTimeFn != null) {
- result = result.withOutputTimeFn(outputTimeFn);
- }
- return result;
- }
-
- /**
- * Get the {@link WindowFn} of this {@link Window.Bound Window PTransform}.
- */
- public WindowFn<? super T, ?> getWindowFn() {
- return windowFn;
- }
-
- @Override
- public void validate(PCollection<T> input) {
- WindowingStrategy<?, ?> outputStrategy =
- getOutputStrategyInternal(input.getWindowingStrategy());
-
- // Make sure that the windowing strategy is complete & valid.
- if (outputStrategy.isTriggerSpecified()
- && !(outputStrategy.getTrigger().getSpec() instanceof DefaultTrigger)) {
- if (!(outputStrategy.getWindowFn() instanceof GlobalWindows)
- && !outputStrategy.isAllowedLatenessSpecified()) {
- throw new IllegalArgumentException("Except when using GlobalWindows,"
- + " calling .triggering() to specify a trigger requires that the allowed lateness be"
- + " specified using .withAllowedLateness() to set the upper bound on how late data"
- + " can arrive before being dropped. See Javadoc for more details.");
- }
-
- if (!outputStrategy.isModeSpecified()) {
- throw new IllegalArgumentException(
- "Calling .triggering() to specify a trigger requires that the accumulation mode be"
- + " specified using .discardingFiredPanes() or .accumulatingFiredPanes()."
- + " See Javadoc for more details.");
- }
- }
- }
-
- @Override
- public PCollection<T> apply(PCollection<T> input) {
- WindowingStrategy<?, ?> outputStrategy =
- getOutputStrategyInternal(input.getWindowingStrategy());
- PCollection<T> output;
- if (windowFn != null) {
- // If the windowFn changed, we create a primitive, and run the AssignWindows operation here.
- output = assignWindows(input, windowFn);
- } else {
- // If the windowFn didn't change, we just run a pass-through transform and then set the
- // new windowing strategy.
- output = input.apply(Window.<T>identity());
- }
- return output.setWindowingStrategyInternal(outputStrategy);
- }
-
- private <T, W extends BoundedWindow> PCollection<T> assignWindows(
- PCollection<T> input, WindowFn<? super T, W> windowFn) {
- return input.apply("AssignWindows", ParDo.of(new AssignWindowsDoFn<T, W>(windowFn)));
- }
-
- @Override
- protected Coder<?> getDefaultOutputCoder(PCollection<T> input) {
- return input.getCoder();
- }
-
- @Override
- protected String getKindString() {
- return "Window.Into()";
- }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private static <T> PTransform<PCollection<? extends T>, PCollection<T>> identity() {
- return ParDo.named("Identity").of(new DoFn<T, T>() {
- @Override public void processElement(ProcessContext c) {
- c.output(c.element());
- }
- });
- }
-
- /**
- * Creates a {@code Window} {@code PTransform} that does not change assigned
- * windows, but will cause windows to be merged again as part of the next
- * {@link com.google.cloud.dataflow.sdk.transforms.GroupByKey}.
- */
- public static <T> Remerge<T> remerge() {
- return new Remerge<T>();
- }
-
- /**
- * {@code PTransform} that does not change assigned windows, but will cause
- * windows to be merged again as part of the next
- * {@link com.google.cloud.dataflow.sdk.transforms.GroupByKey}.
- */
- public static class Remerge<T> extends PTransform<PCollection<T>, PCollection<T>> {
- @Override
- public PCollection<T> apply(PCollection<T> input) {
- WindowingStrategy<?, ?> outputWindowingStrategy = getOutputWindowing(
- input.getWindowingStrategy());
-
- return input.apply(Window.<T>identity())
- .setWindowingStrategyInternal(outputWindowingStrategy);
- }
-
- private <W extends BoundedWindow> WindowingStrategy<?, W> getOutputWindowing(
- WindowingStrategy<?, W> inputStrategy) {
- if (inputStrategy.getWindowFn() instanceof InvalidWindows) {
- @SuppressWarnings("unchecked")
- InvalidWindows<W> invalidWindows = (InvalidWindows<W>) inputStrategy.getWindowFn();
- return inputStrategy.withWindowFn(invalidWindows.getOriginalWindowFn());
- } else {
- return inputStrategy;
- }
- }
- }
-}
[46/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/VoidCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/VoidCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/VoidCoder.java
deleted file mode 100644
index 0de606b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/VoidCoder.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- * A {@link Coder} for {@link Void}. Uses zero bytes per {@link Void}.
- */
-public class VoidCoder extends AtomicCoder<Void> {
-
- @JsonCreator
- public static VoidCoder of() {
- return INSTANCE;
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private static final VoidCoder INSTANCE = new VoidCoder();
-
- private VoidCoder() {}
-
- @Override
- public void encode(Void value, OutputStream outStream, Context context) {
- // Nothing to write!
- }
-
- @Override
- public Void decode(InputStream inStream, Context context) {
- // Nothing to read!
- return null;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. {@link VoidCoder} is (vacuously) injective.
- */
- @Override
- public boolean consistentWithEquals() {
- return true;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. {@link VoidCoder#getEncodedElementByteSize} runs in constant time.
- */
- @Override
- public boolean isRegisterByteSizeObserverCheap(Void value, Context context) {
- return true;
- }
-
- @Override
- protected long getEncodedElementByteSize(Void value, Context context)
- throws Exception {
- return 0;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/package-info.java
deleted file mode 100644
index fdf931f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/package-info.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/**
- * Defines {@link com.google.cloud.dataflow.sdk.coders.Coder Coders}
- * to specify how data is encoded to and decoded from byte strings.
- *
- * <p>During execution of a Pipeline, elements in a
- * {@link com.google.cloud.dataflow.sdk.values.PCollection}
- * may need to be encoded into byte strings.
- * This happens both at the beginning and end of a pipeline when data is read from and written to
- * persistent storage and also during execution of a pipeline when elements are communicated between
- * machines.
- *
- * <p>Exactly when PCollection elements are encoded during execution depends on which
- * {@link com.google.cloud.dataflow.sdk.runners.PipelineRunner} is being used and how that runner
- * chooses to execute the pipeline. As such, Dataflow requires that all PCollections have an
- * appropriate Coder in case it becomes necessary. In many cases, the Coder can be inferred from
- * the available Java type
- * information and the Pipeline's {@link com.google.cloud.dataflow.sdk.coders.CoderRegistry}. It
- * can be specified per PCollection via
- * {@link com.google.cloud.dataflow.sdk.values.PCollection#setCoder(Coder)} or per type using the
- * {@link com.google.cloud.dataflow.sdk.coders.DefaultCoder} annotation.
- *
- * <p>This package provides a number of coders for common types like {@code Integer},
- * {@code String}, and {@code List}, as well as coders like
- * {@link com.google.cloud.dataflow.sdk.coders.AvroCoder} that can be used to encode many custom
- * types.
- *
- */
-package com.google.cloud.dataflow.sdk.coders;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/protobuf/ProtoCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/protobuf/ProtoCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/protobuf/ProtoCoder.java
deleted file mode 100644
index 111c24d..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/protobuf/ProtoCoder.java
+++ /dev/null
@@ -1,404 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.coders.protobuf;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.google.cloud.dataflow.sdk.coders.AtomicCoder;
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.CoderProvider;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.cloud.dataflow.sdk.util.Structs;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-import com.google.common.collect.ImmutableSet;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-import com.google.protobuf.ExtensionRegistry;
-import com.google.protobuf.Message;
-import com.google.protobuf.Parser;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.lang.reflect.InvocationTargetException;
-import java.lang.reflect.Method;
-import java.lang.reflect.Modifier;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Objects;
-import java.util.Set;
-import java.util.SortedSet;
-import java.util.TreeSet;
-
-import javax.annotation.Nullable;
-
-/**
- * A {@link Coder} using Google Protocol Buffers binary format. {@link ProtoCoder} supports both
- * Protocol Buffers syntax versions 2 and 3.
- *
- * <p>To learn more about Protocol Buffers, visit:
- * <a href="https://developers.google.com/protocol-buffers">https://developers.google.com/protocol-buffers</a>
- *
- * <p>{@link ProtoCoder} is registered in the global {@link CoderRegistry} as the default
- * {@link Coder} for any {@link Message} object. Custom message extensions are also supported, but
- * these extensions must be registered for a particular {@link ProtoCoder} instance and that
- * instance must be registered on the {@link PCollection} that needs the extensions:
- *
- * <pre>{@code
- * import MyProtoFile;
- * import MyProtoFile.MyMessage;
- *
- * Coder<MyMessage> coder = ProtoCoder.of(MyMessage.class).withExtensionsFrom(MyProtoFile.class);
- * PCollection<MyMessage> records = input.apply(...).setCoder(coder);
- * }</pre>
- *
- * <h3>Versioning</h3>
- *
- * <p>{@link ProtoCoder} supports both versions 2 and 3 of the Protocol Buffers syntax. However,
- * the Java runtime version of the <code>com.google.protobuf</code> library must match exactly the
- * version of <code>protoc</code> that was used to produce the JAR files containing the compiled
- * <code>.proto</code> messages.
- *
- * <p>For more information, see the
- * <a href="https://developers.google.com/protocol-buffers/docs/proto3#using-proto2-message-types">Protocol Buffers documentation</a>.
- *
- * <h3>{@link ProtoCoder} and Determinism</h3>
- *
- * <p>In general, Protocol Buffers messages can be encoded deterministically within a single
- * pipeline as long as:
- *
- * <ul>
- * <li>The encoded messages (and any transitively linked messages) do not use <code>map</code>
- * fields.</li>
- * <li>Every Java VM that encodes or decodes the messages uses the same runtime version of the
- * Protocol Buffers library and the same compiled <code>.proto</code> file JAR.</li>
- * </ul>
- *
- * <h3>{@link ProtoCoder} and Encoding Stability</h3>
- *
- * <p>When changing Protocol Buffers messages, follow the rules in the Protocol Buffers language
- * guides for
- * <a href="https://developers.google.com/protocol-buffers/docs/proto#updating">{@code proto2}</a>
- * and
- * <a href="https://developers.google.com/protocol-buffers/docs/proto3#updating">{@code proto3}</a>
- * syntaxes, depending on your message type. Following these guidelines will ensure that the
- * old encoded data can be read by new versions of the code.
- *
- * <p>Generally, any change to the message type, registered extensions, runtime library, or
- * compiled proto JARs may change the encoding. Thus even if both the original and updated messages
- * can be encoded deterministically within a single job, these deterministic encodings may not be
- * the same across jobs.
- *
- * @param <T> the Protocol Buffers {@link Message} handled by this {@link Coder}.
- */
-public class ProtoCoder<T extends Message> extends AtomicCoder<T> {
-
- /**
- * A {@link CoderProvider} that returns a {@link ProtoCoder} with an empty
- * {@link ExtensionRegistry}.
- */
- public static CoderProvider coderProvider() {
- return PROVIDER;
- }
-
- /**
- * Returns a {@link ProtoCoder} for the given Protocol Buffers {@link Message}.
- */
- public static <T extends Message> ProtoCoder<T> of(Class<T> protoMessageClass) {
- return new ProtoCoder<T>(protoMessageClass, ImmutableSet.<Class<?>>of());
- }
-
- /**
- * Returns a {@link ProtoCoder} for the Protocol Buffers {@link Message} indicated by the given
- * {@link TypeDescriptor}.
- */
- public static <T extends Message> ProtoCoder<T> of(TypeDescriptor<T> protoMessageType) {
- @SuppressWarnings("unchecked")
- Class<T> protoMessageClass = (Class<T>) protoMessageType.getRawType();
- return of(protoMessageClass);
- }
-
- /**
- * Returns a {@link ProtoCoder} like this one, but with the extensions from the given classes
- * registered.
- *
- * <p>Each of the extension host classes must be an class automatically generated by the
- * Protocol Buffers compiler, {@code protoc}, that contains messages.
- *
- * <p>Does not modify this object.
- */
- public ProtoCoder<T> withExtensionsFrom(Iterable<Class<?>> moreExtensionHosts) {
- for (Class<?> extensionHost : moreExtensionHosts) {
- // Attempt to access the required method, to make sure it's present.
- try {
- Method registerAllExtensions =
- extensionHost.getDeclaredMethod("registerAllExtensions", ExtensionRegistry.class);
- checkArgument(
- Modifier.isStatic(registerAllExtensions.getModifiers()),
- "Method registerAllExtensions() must be static");
- } catch (NoSuchMethodException | SecurityException e) {
- throw new IllegalArgumentException(
- String.format("Unable to register extensions for %s", extensionHost.getCanonicalName()),
- e);
- }
- }
-
- return new ProtoCoder<T>(
- protoMessageClass,
- new ImmutableSet.Builder<Class<?>>()
- .addAll(extensionHostClasses)
- .addAll(moreExtensionHosts)
- .build());
- }
-
- /**
- * See {@link #withExtensionsFrom(Iterable)}.
- *
- * <p>Does not modify this object.
- */
- public ProtoCoder<T> withExtensionsFrom(Class<?>... moreExtensionHosts) {
- return withExtensionsFrom(Arrays.asList(moreExtensionHosts));
- }
-
- @Override
- public void encode(T value, OutputStream outStream, Context context) throws IOException {
- if (value == null) {
- throw new CoderException("cannot encode a null " + protoMessageClass.getSimpleName());
- }
- if (context.isWholeStream) {
- value.writeTo(outStream);
- } else {
- value.writeDelimitedTo(outStream);
- }
- }
-
- @Override
- public T decode(InputStream inStream, Context context) throws IOException {
- if (context.isWholeStream) {
- return getParser().parseFrom(inStream, getExtensionRegistry());
- } else {
- return getParser().parseDelimitedFrom(inStream, getExtensionRegistry());
- }
- }
-
- @Override
- public boolean equals(Object other) {
- if (this == other) {
- return true;
- }
- if (!(other instanceof ProtoCoder)) {
- return false;
- }
- ProtoCoder<?> otherCoder = (ProtoCoder<?>) other;
- return protoMessageClass.equals(otherCoder.protoMessageClass)
- && Sets.newHashSet(extensionHostClasses)
- .equals(Sets.newHashSet(otherCoder.extensionHostClasses));
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(protoMessageClass, extensionHostClasses);
- }
-
- /**
- * The encoding identifier is designed to support evolution as per the design of Protocol
- * Buffers. In order to use this class effectively, carefully follow the advice in the Protocol
- * Buffers documentation at
- * <a href="https://developers.google.com/protocol-buffers/docs/proto#updating">Updating
- * A Message Type</a>.
- *
- * <p>In particular, the encoding identifier is guaranteed to be the same for {@link ProtoCoder}
- * instances of the same principal message class, with the same registered extension host classes,
- * and otherwise distinct. Note that the encoding ID does not encode any version of the message
- * or extensions, nor does it include the message schema.
- *
- * <p>When modifying a message class, here are the broadest guidelines; see the above link
- * for greater detail.
- *
- * <ul>
- * <li>Do not change the numeric tags for any fields.
- * <li>Never remove a <code>required</code> field.
- * <li>Only add <code>optional</code> or <code>repeated</code> fields, with sensible defaults.
- * <li>When changing the type of a field, consult the Protocol Buffers documentation to ensure
- * the new and old types are interchangeable.
- * </ul>
- *
- * <p>Code consuming this message class should be prepared to support <i>all</i> versions of
- * the class until it is certain that no remaining serialized instances exist.
- *
- * <p>If backwards incompatible changes must be made, the best recourse is to change the name
- * of your Protocol Buffers message class.
- */
- @Override
- public String getEncodingId() {
- return protoMessageClass.getName() + getSortedExtensionClasses().toString();
- }
-
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- ProtobufUtil.verifyDeterministic(this);
- }
-
- /**
- * Returns the Protocol Buffers {@link Message} type this {@link ProtoCoder} supports.
- */
- public Class<T> getMessageType() {
- return protoMessageClass;
- }
-
- /**
- * Returns the {@link ExtensionRegistry} listing all known Protocol Buffers extension messages
- * to {@code T} registered with this {@link ProtoCoder}.
- */
- public ExtensionRegistry getExtensionRegistry() {
- if (memoizedExtensionRegistry == null) {
- ExtensionRegistry registry = ExtensionRegistry.newInstance();
- for (Class<?> extensionHost : extensionHostClasses) {
- try {
- extensionHost
- .getDeclaredMethod("registerAllExtensions", ExtensionRegistry.class)
- .invoke(null, registry);
- } catch (IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
- throw new IllegalStateException(e);
- }
- }
- memoizedExtensionRegistry = registry.getUnmodifiable();
- }
- return memoizedExtensionRegistry;
- }
-
- ////////////////////////////////////////////////////////////////////////////////////
- // Private implementation details below.
-
- /** The {@link Message} type to be coded. */
- private final Class<T> protoMessageClass;
-
- /**
- * All extension host classes included in this {@link ProtoCoder}. The extensions from these
- * classes will be included in the {@link ExtensionRegistry} used during encoding and decoding.
- */
- private final Set<Class<?>> extensionHostClasses;
-
- // Constants used to serialize and deserialize
- private static final String PROTO_MESSAGE_CLASS = "proto_message_class";
- private static final String PROTO_EXTENSION_HOSTS = "proto_extension_hosts";
-
- // Transient fields that are lazy initialized and then memoized.
- private transient ExtensionRegistry memoizedExtensionRegistry;
- private transient Parser<T> memoizedParser;
-
- /** Private constructor. */
- private ProtoCoder(Class<T> protoMessageClass, Set<Class<?>> extensionHostClasses) {
- this.protoMessageClass = protoMessageClass;
- this.extensionHostClasses = extensionHostClasses;
- }
-
- /**
- * @deprecated For JSON deserialization only.
- */
- @JsonCreator
- @Deprecated
- public static <T extends Message> ProtoCoder<T> of(
- @JsonProperty(PROTO_MESSAGE_CLASS) String protoMessageClassName,
- @Nullable @JsonProperty(PROTO_EXTENSION_HOSTS) List<String> extensionHostClassNames) {
-
- try {
- @SuppressWarnings("unchecked")
- Class<T> protoMessageClass = (Class<T>) Class.forName(protoMessageClassName);
- List<Class<?>> extensionHostClasses = Lists.newArrayList();
- if (extensionHostClassNames != null) {
- for (String extensionHostClassName : extensionHostClassNames) {
- extensionHostClasses.add(Class.forName(extensionHostClassName));
- }
- }
- return of(protoMessageClass).withExtensionsFrom(extensionHostClasses);
- } catch (ClassNotFoundException e) {
- throw new IllegalArgumentException(e);
- }
- }
-
- @Override
- public CloudObject asCloudObject() {
- CloudObject result = super.asCloudObject();
- Structs.addString(result, PROTO_MESSAGE_CLASS, protoMessageClass.getName());
- List<CloudObject> extensionHostClassNames = Lists.newArrayList();
- for (String className : getSortedExtensionClasses()) {
- extensionHostClassNames.add(CloudObject.forString(className));
- }
- Structs.addList(result, PROTO_EXTENSION_HOSTS, extensionHostClassNames);
- return result;
- }
-
- /** Get the memoized {@link Parser}, possibly initializing it lazily. */
- private Parser<T> getParser() {
- if (memoizedParser == null) {
- try {
- @SuppressWarnings("unchecked")
- T protoMessageInstance = (T) protoMessageClass.getMethod("getDefaultInstance").invoke(null);
- @SuppressWarnings("unchecked")
- Parser<T> tParser = (Parser<T>) protoMessageInstance.getParserForType();
- memoizedParser = tParser;
- } catch (IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
- throw new IllegalArgumentException(e);
- }
- }
- return memoizedParser;
- }
-
- /**
- * The implementation of the {@link CoderProvider} for this {@link ProtoCoder} returned by
- * {@link #coderProvider()}.
- */
- private static final CoderProvider PROVIDER =
- new CoderProvider() {
- @Override
- public <T> Coder<T> getCoder(TypeDescriptor<T> type) throws CannotProvideCoderException {
- if (!type.isSubtypeOf(new TypeDescriptor<Message>() {})) {
- throw new CannotProvideCoderException(
- String.format(
- "Cannot provide %s because %s is not a subclass of %s",
- ProtoCoder.class.getSimpleName(),
- type,
- Message.class.getName()));
- }
-
- @SuppressWarnings("unchecked")
- TypeDescriptor<? extends Message> messageType = (TypeDescriptor<? extends Message>) type;
- try {
- @SuppressWarnings("unchecked")
- Coder<T> coder = (Coder<T>) ProtoCoder.of(messageType);
- return coder;
- } catch (IllegalArgumentException e) {
- throw new CannotProvideCoderException(e);
- }
- }
- };
-
- private SortedSet<String> getSortedExtensionClasses() {
- SortedSet<String> ret = new TreeSet<>();
- for (Class<?> clazz : extensionHostClasses) {
- ret.add(clazz.getName());
- }
- return ret;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/protobuf/ProtobufUtil.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/protobuf/ProtobufUtil.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/protobuf/ProtobufUtil.java
deleted file mode 100644
index 597b1de..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/protobuf/ProtobufUtil.java
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders.protobuf;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.google.cloud.dataflow.sdk.coders.Coder.NonDeterministicException;
-import com.google.protobuf.Descriptors.Descriptor;
-import com.google.protobuf.Descriptors.FieldDescriptor;
-import com.google.protobuf.Descriptors.FileDescriptor.Syntax;
-import com.google.protobuf.Descriptors.GenericDescriptor;
-import com.google.protobuf.ExtensionRegistry;
-import com.google.protobuf.ExtensionRegistry.ExtensionInfo;
-import com.google.protobuf.Message;
-
-import java.lang.reflect.InvocationTargetException;
-import java.util.HashSet;
-import java.util.Set;
-
-/**
- * Utility functions for reflecting and analyzing Protocol Buffers classes.
- *
- * <p>Used by {@link ProtoCoder}, but in a separate file for testing and isolation.
- */
-class ProtobufUtil {
- /**
- * Returns the {@link Descriptor} for the given Protocol Buffers {@link Message}.
- *
- * @throws IllegalArgumentException if there is an error in Java reflection.
- */
- static Descriptor getDescriptorForClass(Class<? extends Message> clazz) {
- try {
- return (Descriptor) clazz.getMethod("getDescriptor").invoke(null);
- } catch (IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
- throw new IllegalArgumentException(e);
- }
- }
-
- /**
- * Returns the {@link Descriptor} for the given Protocol Buffers {@link Message} as well as
- * every class it can include transitively.
- *
- * @throws IllegalArgumentException if there is an error in Java reflection.
- */
- static Set<Descriptor> getRecursiveDescriptorsForClass(
- Class<? extends Message> clazz, ExtensionRegistry registry) {
- Descriptor root = getDescriptorForClass(clazz);
- Set<Descriptor> descriptors = new HashSet<>();
- recursivelyAddDescriptors(root, descriptors, registry);
- return descriptors;
- }
-
- /**
- * Recursively walks the given {@link Message} class and verifies that every field or message
- * linked in uses the Protocol Buffers proto2 syntax.
- */
- static void checkProto2Syntax(Class<? extends Message> clazz, ExtensionRegistry registry) {
- for (GenericDescriptor d : getRecursiveDescriptorsForClass(clazz, registry)) {
- Syntax s = d.getFile().getSyntax();
- checkArgument(
- s == Syntax.PROTO2,
- "Message %s or one of its dependencies does not use proto2 syntax: %s in file %s",
- clazz.getName(),
- d.getFullName(),
- d.getFile().getName());
- }
- }
-
- /**
- * Recursively checks whether the specified class uses any Protocol Buffers fields that cannot
- * be deterministically encoded.
- *
- * @throws NonDeterministicException if the object cannot be encoded deterministically.
- */
- static void verifyDeterministic(ProtoCoder<?> coder) throws NonDeterministicException {
- Class<? extends Message> message = coder.getMessageType();
- ExtensionRegistry registry = coder.getExtensionRegistry();
- Set<Descriptor> descriptors = getRecursiveDescriptorsForClass(message, registry);
- for (Descriptor d : descriptors) {
- for (FieldDescriptor fd : d.getFields()) {
- // If there is a transitively reachable Protocol Buffers map field, then this object cannot
- // be encoded deterministically.
- if (fd.isMapField()) {
- String reason =
- String.format(
- "Protocol Buffers message %s transitively includes Map field %s (from file %s)."
- + " Maps cannot be deterministically encoded.",
- message.getName(),
- fd.getFullName(),
- fd.getFile().getFullName());
- throw new NonDeterministicException(coder, reason);
- }
- }
- }
- }
-
- ////////////////////////////////////////////////////////////////////////////////////////////////
- // Disable construction of utility class
- private ProtobufUtil() {}
-
- private static void recursivelyAddDescriptors(
- Descriptor message, Set<Descriptor> descriptors, ExtensionRegistry registry) {
- if (descriptors.contains(message)) {
- return;
- }
- descriptors.add(message);
-
- for (FieldDescriptor f : message.getFields()) {
- recursivelyAddDescriptors(f, descriptors, registry);
- }
- for (FieldDescriptor f : message.getExtensions()) {
- recursivelyAddDescriptors(f, descriptors, registry);
- }
- for (ExtensionInfo info :
- registry.getAllImmutableExtensionsByExtendedType(message.getFullName())) {
- recursivelyAddDescriptors(info.descriptor, descriptors, registry);
- }
- for (ExtensionInfo info :
- registry.getAllMutableExtensionsByExtendedType(message.getFullName())) {
- recursivelyAddDescriptors(info.descriptor, descriptors, registry);
- }
- }
-
- private static void recursivelyAddDescriptors(
- FieldDescriptor field, Set<Descriptor> descriptors, ExtensionRegistry registry) {
- switch (field.getType()) {
- case BOOL:
- case BYTES:
- case DOUBLE:
- case ENUM:
- case FIXED32:
- case FIXED64:
- case FLOAT:
- case INT32:
- case INT64:
- case SFIXED32:
- case SFIXED64:
- case SINT32:
- case SINT64:
- case STRING:
- case UINT32:
- case UINT64:
- // Primitive types do not transitively access anything else.
- break;
-
- case GROUP:
- case MESSAGE:
- // Recursively adds all the fields from this nested Message.
- recursivelyAddDescriptors(field.getMessageType(), descriptors, registry);
- break;
-
- default:
- throw new UnsupportedOperationException(
- "Unexpected Protocol Buffers field type: " + field.getType());
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/protobuf/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/protobuf/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/protobuf/package-info.java
deleted file mode 100644
index b5bcf18..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/protobuf/package-info.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/**
- * Defines a {@link com.google.cloud.dataflow.sdk.coders.Coder}
- * for Protocol Buffers messages, {@code ProtoCoder}.
- *
- * @see com.google.cloud.dataflow.sdk.coders.protobuf.ProtoCoder
- */
-package com.google.cloud.dataflow.sdk.coders.protobuf;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/AvroIO.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/AvroIO.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/AvroIO.java
deleted file mode 100644
index f016b5b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/AvroIO.java
+++ /dev/null
@@ -1,810 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import static com.google.common.base.Preconditions.checkState;
-
-import com.google.cloud.dataflow.sdk.coders.AvroCoder;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.VoidCoder;
-import com.google.cloud.dataflow.sdk.io.Read.Bounded;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
-import com.google.cloud.dataflow.sdk.util.MimeTypes;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PDone;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Preconditions;
-
-import org.apache.avro.Schema;
-import org.apache.avro.file.DataFileWriter;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.avro.reflect.ReflectData;
-
-import java.io.IOException;
-import java.nio.channels.Channels;
-import java.nio.channels.WritableByteChannel;
-import java.util.regex.Pattern;
-
-import javax.annotation.Nullable;
-
-/**
- * {@link PTransform}s for reading and writing Avro files.
- *
- * <p>To read a {@link PCollection} from one or more Avro files, use
- * {@link AvroIO.Read}, specifying {@link AvroIO.Read#from} to specify
- * the path of the file(s) to read from (e.g., a local filename or
- * filename pattern if running locally, or a Google Cloud Storage
- * filename or filename pattern of the form
- * {@code "gs://<bucket>/<filepath>"}), and optionally
- * {@link AvroIO.Read#named} to specify the name of the pipeline step.
- *
- * <p>It is required to specify {@link AvroIO.Read#withSchema}. To
- * read specific records, such as Avro-generated classes, provide an
- * Avro-generated class type. To read {@link GenericRecord GenericRecords}, provide either
- * a {@link Schema} object or an Avro schema in a JSON-encoded string form.
- * An exception will be thrown if a record doesn't match the specified
- * schema.
- *
- * <p>For example:
- * <pre> {@code
- * Pipeline p = ...;
- *
- * // A simple Read of a local file (only runs locally):
- * PCollection<AvroAutoGenClass> records =
- * p.apply(AvroIO.Read.from("/path/to/file.avro")
- * .withSchema(AvroAutoGenClass.class));
- *
- * // A Read from a GCS file (runs locally and via the Google Cloud
- * // Dataflow service):
- * Schema schema = new Schema.Parser().parse(new File("schema.avsc"));
- * PCollection<GenericRecord> records =
- * p.apply(AvroIO.Read.named("ReadFromAvro")
- * .from("gs://my_bucket/path/to/records-*.avro")
- * .withSchema(schema));
- * } </pre>
- *
- * <p>To write a {@link PCollection} to one or more Avro files, use
- * {@link AvroIO.Write}, specifying {@link AvroIO.Write#to} to specify
- * the path of the file to write to (e.g., a local filename or sharded
- * filename pattern if running locally, or a Google Cloud Storage
- * filename or sharded filename pattern of the form
- * {@code "gs://<bucket>/<filepath>"}), and optionally
- * {@link AvroIO.Write#named} to specify the name of the pipeline step.
- *
- * <p>It is required to specify {@link AvroIO.Write#withSchema}. To
- * write specific records, such as Avro-generated classes, provide an
- * Avro-generated class type. To write {@link GenericRecord GenericRecords}, provide either
- * a {@link Schema} object or a schema in a JSON-encoded string form.
- * An exception will be thrown if a record doesn't match the specified
- * schema.
- *
- * <p>For example:
- * <pre> {@code
- * // A simple Write to a local file (only runs locally):
- * PCollection<AvroAutoGenClass> records = ...;
- * records.apply(AvroIO.Write.to("/path/to/file.avro")
- * .withSchema(AvroAutoGenClass.class));
- *
- * // A Write to a sharded GCS file (runs locally and via the Google Cloud
- * // Dataflow service):
- * Schema schema = new Schema.Parser().parse(new File("schema.avsc"));
- * PCollection<GenericRecord> records = ...;
- * records.apply(AvroIO.Write.named("WriteToAvro")
- * .to("gs://my_bucket/path/to/numbers")
- * .withSchema(schema)
- * .withSuffix(".avro"));
- * } </pre>
- *
- * <h3>Permissions</h3>
- * <p>Permission requirements depend on the {@link PipelineRunner} that is used to execute the
- * Dataflow job. Please refer to the documentation of corresponding {@link PipelineRunner}s for
- * more details.
- */
-public class AvroIO {
- /**
- * A root {@link PTransform} that reads from an Avro file (or multiple Avro
- * files matching a pattern) and returns a {@link PCollection} containing
- * the decoding of each record.
- */
- public static class Read {
- /**
- * Returns a {@link PTransform} with the given step name.
- */
- public static Bound<GenericRecord> named(String name) {
- return new Bound<>(GenericRecord.class).named(name);
- }
-
- /**
- * Returns a {@link PTransform} that reads from the file(s)
- * with the given name or pattern. This can be a local filename
- * or filename pattern (if running locally), or a Google Cloud
- * Storage filename or filename pattern of the form
- * {@code "gs://<bucket>/<filepath>"} (if running locally or via
- * the Google Cloud Dataflow service). Standard
- * <a href="http://docs.oracle.com/javase/tutorial/essential/io/find.html">Java
- * Filesystem glob patterns</a> ("*", "?", "[..]") are supported.
- */
- public static Bound<GenericRecord> from(String filepattern) {
- return new Bound<>(GenericRecord.class).from(filepattern);
- }
-
- /**
- * Returns a {@link PTransform} that reads Avro file(s)
- * containing records whose type is the specified Avro-generated class.
- *
- * @param <T> the type of the decoded elements, and the elements
- * of the resulting {@link PCollection}
- */
- public static <T> Bound<T> withSchema(Class<T> type) {
- return new Bound<>(type).withSchema(type);
- }
-
- /**
- * Returns a {@link PTransform} that reads Avro file(s)
- * containing records of the specified schema.
- */
- public static Bound<GenericRecord> withSchema(Schema schema) {
- return new Bound<>(GenericRecord.class).withSchema(schema);
- }
-
- /**
- * Returns a {@link PTransform} that reads Avro file(s)
- * containing records of the specified schema in a JSON-encoded
- * string form.
- */
- public static Bound<GenericRecord> withSchema(String schema) {
- return withSchema((new Schema.Parser()).parse(schema));
- }
-
- /**
- * Returns a {@link PTransform} that reads Avro file(s)
- * that has GCS path validation on pipeline creation disabled.
- *
- * <p>This can be useful in the case where the GCS input location does
- * not exist at the pipeline creation time, but is expected to be available
- * at execution time.
- */
- public static Bound<GenericRecord> withoutValidation() {
- return new Bound<>(GenericRecord.class).withoutValidation();
- }
-
- /**
- * A {@link PTransform} that reads from an Avro file (or multiple Avro
- * files matching a pattern) and returns a bounded {@link PCollection} containing
- * the decoding of each record.
- *
- * @param <T> the type of each of the elements of the resulting
- * PCollection
- */
- public static class Bound<T> extends PTransform<PInput, PCollection<T>> {
- /** The filepattern to read from. */
- @Nullable
- final String filepattern;
- /** The class type of the records. */
- final Class<T> type;
- /** The schema of the input file. */
- @Nullable
- final Schema schema;
- /** An option to indicate if input validation is desired. Default is true. */
- final boolean validate;
-
- Bound(Class<T> type) {
- this(null, null, type, null, true);
- }
-
- Bound(String name, String filepattern, Class<T> type, Schema schema, boolean validate) {
- super(name);
- this.filepattern = filepattern;
- this.type = type;
- this.schema = schema;
- this.validate = validate;
- }
-
- /**
- * Returns a new {@link PTransform} that's like this one but
- * with the given step name.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> named(String name) {
- return new Bound<>(name, filepattern, type, schema, validate);
- }
-
- /**
- * Returns a new {@link PTransform} that's like this one but
- * that reads from the file(s) with the given name or pattern.
- * (See {@link AvroIO.Read#from} for a description of
- * filepatterns.)
- *
- * <p>Does not modify this object.
- */
- public Bound<T> from(String filepattern) {
- return new Bound<>(name, filepattern, type, schema, validate);
- }
-
- /**
- * Returns a new {@link PTransform} that's like this one but
- * that reads Avro file(s) containing records whose type is the
- * specified Avro-generated class.
- *
- * <p>Does not modify this object.
- *
- * @param <X> the type of the decoded elements and the elements of
- * the resulting PCollection
- */
- public <X> Bound<X> withSchema(Class<X> type) {
- return new Bound<>(name, filepattern, type, ReflectData.get().getSchema(type), validate);
- }
-
- /**
- * Returns a new {@link PTransform} that's like this one but
- * that reads Avro file(s) containing records of the specified schema.
- *
- * <p>Does not modify this object.
- */
- public Bound<GenericRecord> withSchema(Schema schema) {
- return new Bound<>(name, filepattern, GenericRecord.class, schema, validate);
- }
-
- /**
- * Returns a new {@link PTransform} that's like this one but
- * that reads Avro file(s) containing records of the specified schema
- * in a JSON-encoded string form.
- *
- * <p>Does not modify this object.
- */
- public Bound<GenericRecord> withSchema(String schema) {
- return withSchema((new Schema.Parser()).parse(schema));
- }
-
- /**
- * Returns a new {@link PTransform} that's like this one but
- * that has GCS input path validation on pipeline creation disabled.
- *
- * <p>Does not modify this object.
- *
- * <p>This can be useful in the case where the GCS input location does
- * not exist at the pipeline creation time, but is expected to be
- * available at execution time.
- */
- public Bound<T> withoutValidation() {
- return new Bound<>(name, filepattern, type, schema, false);
- }
-
- @Override
- public PCollection<T> apply(PInput input) {
- if (filepattern == null) {
- throw new IllegalStateException(
- "need to set the filepattern of an AvroIO.Read transform");
- }
- if (schema == null) {
- throw new IllegalStateException("need to set the schema of an AvroIO.Read transform");
- }
- if (validate) {
- try {
- checkState(
- !IOChannelUtils.getFactory(filepattern).match(filepattern).isEmpty(),
- "Unable to find any files matching %s",
- filepattern);
- } catch (IOException e) {
- throw new IllegalStateException(
- String.format("Failed to validate %s", filepattern), e);
- }
- }
-
- @SuppressWarnings("unchecked")
- Bounded<T> read =
- type == GenericRecord.class
- ? (Bounded<T>) com.google.cloud.dataflow.sdk.io.Read.from(
- AvroSource.from(filepattern).withSchema(schema))
- : com.google.cloud.dataflow.sdk.io.Read.from(
- AvroSource.from(filepattern).withSchema(type));
-
- PCollection<T> pcol = input.getPipeline().apply("Read", read);
- // Honor the default output coder that would have been used by this PTransform.
- pcol.setCoder(getDefaultOutputCoder());
- return pcol;
- }
-
- @Override
- protected Coder<T> getDefaultOutputCoder() {
- return AvroCoder.of(type, schema);
- }
-
- public String getFilepattern() {
- return filepattern;
- }
-
- public Schema getSchema() {
- return schema;
- }
-
- public boolean needsValidation() {
- return validate;
- }
- }
-
- /** Disallow construction of utility class. */
- private Read() {}
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A root {@link PTransform} that writes a {@link PCollection} to an Avro file (or
- * multiple Avro files matching a sharding pattern).
- */
- public static class Write {
- /**
- * Returns a {@link PTransform} with the given step name.
- */
- public static Bound<GenericRecord> named(String name) {
- return new Bound<>(GenericRecord.class).named(name);
- }
-
- /**
- * Returns a {@link PTransform} that writes to the file(s)
- * with the given prefix. This can be a local filename
- * (if running locally), or a Google Cloud Storage filename of
- * the form {@code "gs://<bucket>/<filepath>"}
- * (if running locally or via the Google Cloud Dataflow service).
- *
- * <p>The files written will begin with this prefix, followed by
- * a shard identifier (see {@link Bound#withNumShards}, and end
- * in a common extension, if given by {@link Bound#withSuffix}.
- */
- public static Bound<GenericRecord> to(String prefix) {
- return new Bound<>(GenericRecord.class).to(prefix);
- }
-
- /**
- * Returns a {@link PTransform} that writes to the file(s) with the
- * given filename suffix.
- */
- public static Bound<GenericRecord> withSuffix(String filenameSuffix) {
- return new Bound<>(GenericRecord.class).withSuffix(filenameSuffix);
- }
-
- /**
- * Returns a {@link PTransform} that uses the provided shard count.
- *
- * <p>Constraining the number of shards is likely to reduce
- * the performance of a pipeline. Setting this value is not recommended
- * unless you require a specific number of output files.
- *
- * @param numShards the number of shards to use, or 0 to let the system
- * decide.
- */
- public static Bound<GenericRecord> withNumShards(int numShards) {
- return new Bound<>(GenericRecord.class).withNumShards(numShards);
- }
-
- /**
- * Returns a {@link PTransform} that uses the given shard name
- * template.
- *
- * <p>See {@link ShardNameTemplate} for a description of shard templates.
- */
- public static Bound<GenericRecord> withShardNameTemplate(String shardTemplate) {
- return new Bound<>(GenericRecord.class).withShardNameTemplate(shardTemplate);
- }
-
- /**
- * Returns a {@link PTransform} that forces a single file as
- * output.
- *
- * <p>Constraining the number of shards is likely to reduce
- * the performance of a pipeline. Setting this value is not recommended
- * unless you require a specific number of output files.
- */
- public static Bound<GenericRecord> withoutSharding() {
- return new Bound<>(GenericRecord.class).withoutSharding();
- }
-
- /**
- * Returns a {@link PTransform} that writes Avro file(s)
- * containing records whose type is the specified Avro-generated class.
- *
- * @param <T> the type of the elements of the input PCollection
- */
- public static <T> Bound<T> withSchema(Class<T> type) {
- return new Bound<>(type).withSchema(type);
- }
-
- /**
- * Returns a {@link PTransform} that writes Avro file(s)
- * containing records of the specified schema.
- */
- public static Bound<GenericRecord> withSchema(Schema schema) {
- return new Bound<>(GenericRecord.class).withSchema(schema);
- }
-
- /**
- * Returns a {@link PTransform} that writes Avro file(s)
- * containing records of the specified schema in a JSON-encoded
- * string form.
- */
- public static Bound<GenericRecord> withSchema(String schema) {
- return withSchema((new Schema.Parser()).parse(schema));
- }
-
- /**
- * Returns a {@link PTransform} that writes Avro file(s) that has GCS path validation on
- * pipeline creation disabled.
- *
- * <p>This can be useful in the case where the GCS output location does
- * not exist at the pipeline creation time, but is expected to be available
- * at execution time.
- */
- public static Bound<GenericRecord> withoutValidation() {
- return new Bound<>(GenericRecord.class).withoutValidation();
- }
-
- /**
- * A {@link PTransform} that writes a bounded {@link PCollection} to an Avro file (or
- * multiple Avro files matching a sharding pattern).
- *
- * @param <T> the type of each of the elements of the input PCollection
- */
- public static class Bound<T> extends PTransform<PCollection<T>, PDone> {
- /** The filename to write to. */
- @Nullable
- final String filenamePrefix;
- /** Suffix to use for each filename. */
- final String filenameSuffix;
- /** Requested number of shards. 0 for automatic. */
- final int numShards;
- /** Shard template string. */
- final String shardTemplate;
- /** The class type of the records. */
- final Class<T> type;
- /** The schema of the output file. */
- @Nullable
- final Schema schema;
- /** An option to indicate if output validation is desired. Default is true. */
- final boolean validate;
-
- Bound(Class<T> type) {
- this(null, null, "", 0, ShardNameTemplate.INDEX_OF_MAX, type, null, true);
- }
-
- Bound(
- String name,
- String filenamePrefix,
- String filenameSuffix,
- int numShards,
- String shardTemplate,
- Class<T> type,
- Schema schema,
- boolean validate) {
- super(name);
- this.filenamePrefix = filenamePrefix;
- this.filenameSuffix = filenameSuffix;
- this.numShards = numShards;
- this.shardTemplate = shardTemplate;
- this.type = type;
- this.schema = schema;
- this.validate = validate;
- }
-
- /**
- * Returns a new {@link PTransform} that's like this one but
- * with the given step name.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> named(String name) {
- return new Bound<>(
- name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema, validate);
- }
-
- /**
- * Returns a new {@link PTransform} that's like this one but
- * that writes to the file(s) with the given filename prefix.
- *
- * <p>See {@link AvroIO.Write#to(String)} for more information
- * about filenames.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> to(String filenamePrefix) {
- validateOutputComponent(filenamePrefix);
- return new Bound<>(
- name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema, validate);
- }
-
- /**
- * Returns a new {@link PTransform} that's like this one but
- * that writes to the file(s) with the given filename suffix.
- *
- * <p>See {@link ShardNameTemplate} for a description of shard templates.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> withSuffix(String filenameSuffix) {
- validateOutputComponent(filenameSuffix);
- return new Bound<>(
- name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema, validate);
- }
-
- /**
- * Returns a new {@link PTransform} that's like this one but
- * that uses the provided shard count.
- *
- * <p>Constraining the number of shards is likely to reduce
- * the performance of a pipeline. Setting this value is not recommended
- * unless you require a specific number of output files.
- *
- * <p>Does not modify this object.
- *
- * @param numShards the number of shards to use, or 0 to let the system
- * decide.
- * @see ShardNameTemplate
- */
- public Bound<T> withNumShards(int numShards) {
- Preconditions.checkArgument(numShards >= 0);
- return new Bound<>(
- name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema, validate);
- }
-
- /**
- * Returns a new {@link PTransform} that's like this one but
- * that uses the given shard name template.
- *
- * <p>Does not modify this object.
- *
- * @see ShardNameTemplate
- */
- public Bound<T> withShardNameTemplate(String shardTemplate) {
- return new Bound<>(
- name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema, validate);
- }
-
- /**
- * Returns a new {@link PTransform} that's like this one but
- * that forces a single file as output.
- *
- * <p>This is a shortcut for
- * {@code .withNumShards(1).withShardNameTemplate("")}
- *
- * <p>Does not modify this object.
- */
- public Bound<T> withoutSharding() {
- return new Bound<>(name, filenamePrefix, filenameSuffix, 1, "", type, schema, validate);
- }
-
- /**
- * Returns a new {@link PTransform} that's like this one but
- * that writes to Avro file(s) containing records whose type is the
- * specified Avro-generated class.
- *
- * <p>Does not modify this object.
- *
- * @param <X> the type of the elements of the input PCollection
- */
- public <X> Bound<X> withSchema(Class<X> type) {
- return new Bound<>(
- name,
- filenamePrefix,
- filenameSuffix,
- numShards,
- shardTemplate,
- type,
- ReflectData.get().getSchema(type),
- validate);
- }
-
- /**
- * Returns a new {@link PTransform} that's like this one but
- * that writes to Avro file(s) containing records of the specified
- * schema.
- *
- * <p>Does not modify this object.
- */
- public Bound<GenericRecord> withSchema(Schema schema) {
- return new Bound<>(
- name,
- filenamePrefix,
- filenameSuffix,
- numShards,
- shardTemplate,
- GenericRecord.class,
- schema,
- validate);
- }
-
- /**
- * Returns a new {@link PTransform} that's like this one but
- * that writes to Avro file(s) containing records of the specified
- * schema in a JSON-encoded string form.
- *
- * <p>Does not modify this object.
- */
- public Bound<GenericRecord> withSchema(String schema) {
- return withSchema((new Schema.Parser()).parse(schema));
- }
-
- /**
- * Returns a new {@link PTransform} that's like this one but
- * that has GCS output path validation on pipeline creation disabled.
- *
- * <p>Does not modify this object.
- *
- * <p>This can be useful in the case where the GCS output location does
- * not exist at the pipeline creation time, but is expected to be
- * available at execution time.
- */
- public Bound<T> withoutValidation() {
- return new Bound<>(
- name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema, false);
- }
-
- @Override
- public PDone apply(PCollection<T> input) {
- if (filenamePrefix == null) {
- throw new IllegalStateException(
- "need to set the filename prefix of an AvroIO.Write transform");
- }
- if (schema == null) {
- throw new IllegalStateException("need to set the schema of an AvroIO.Write transform");
- }
-
- // Note that custom sinks currently do not expose sharding controls.
- // Thus pipeline runner writers need to individually add support internally to
- // apply user requested sharding limits.
- return input.apply(
- "Write",
- com.google.cloud.dataflow.sdk.io.Write.to(
- new AvroSink<>(
- filenamePrefix, filenameSuffix, shardTemplate, AvroCoder.of(type, schema))));
- }
-
- /**
- * Returns the current shard name template string.
- */
- public String getShardNameTemplate() {
- return shardTemplate;
- }
-
- @Override
- protected Coder<Void> getDefaultOutputCoder() {
- return VoidCoder.of();
- }
-
- public String getFilenamePrefix() {
- return filenamePrefix;
- }
-
- public String getShardTemplate() {
- return shardTemplate;
- }
-
- public int getNumShards() {
- return numShards;
- }
-
- public String getFilenameSuffix() {
- return filenameSuffix;
- }
-
- public Class<T> getType() {
- return type;
- }
-
- public Schema getSchema() {
- return schema;
- }
-
- public boolean needsValidation() {
- return validate;
- }
- }
-
- /** Disallow construction of utility class. */
- private Write() {}
- }
-
- // Pattern which matches old-style shard output patterns, which are now
- // disallowed.
- private static final Pattern SHARD_OUTPUT_PATTERN = Pattern.compile("@([0-9]+|\\*)");
-
- private static void validateOutputComponent(String partialFilePattern) {
- Preconditions.checkArgument(
- !SHARD_OUTPUT_PATTERN.matcher(partialFilePattern).find(),
- "Output name components are not allowed to contain @* or @N patterns: "
- + partialFilePattern);
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /** Disallow construction of utility class. */
- private AvroIO() {}
-
- /**
- * A {@link FileBasedSink} for Avro files.
- */
- @VisibleForTesting
- static class AvroSink<T> extends FileBasedSink<T> {
- private final AvroCoder<T> coder;
-
- @VisibleForTesting
- AvroSink(
- String baseOutputFilename, String extension, String fileNameTemplate, AvroCoder<T> coder) {
- super(baseOutputFilename, extension, fileNameTemplate);
- this.coder = coder;
- }
-
- @Override
- public FileBasedSink.FileBasedWriteOperation<T> createWriteOperation(PipelineOptions options) {
- return new AvroWriteOperation<>(this, coder);
- }
-
- /**
- * A {@link com.google.cloud.dataflow.sdk.io.FileBasedSink.FileBasedWriteOperation
- * FileBasedWriteOperation} for Avro files.
- */
- private static class AvroWriteOperation<T> extends FileBasedWriteOperation<T> {
- private final AvroCoder<T> coder;
-
- private AvroWriteOperation(AvroSink<T> sink, AvroCoder<T> coder) {
- super(sink);
- this.coder = coder;
- }
-
- @Override
- public FileBasedWriter<T> createWriter(PipelineOptions options) throws Exception {
- return new AvroWriter<>(this, coder);
- }
- }
-
- /**
- * A {@link com.google.cloud.dataflow.sdk.io.FileBasedSink.FileBasedWriter FileBasedWriter}
- * for Avro files.
- */
- private static class AvroWriter<T> extends FileBasedWriter<T> {
- private final AvroCoder<T> coder;
- private DataFileWriter<T> dataFileWriter;
-
- public AvroWriter(FileBasedWriteOperation<T> writeOperation, AvroCoder<T> coder) {
- super(writeOperation);
- this.mimeType = MimeTypes.BINARY;
- this.coder = coder;
- }
-
- @SuppressWarnings("deprecation") // uses internal test functionality.
- @Override
- protected void prepareWrite(WritableByteChannel channel) throws Exception {
- dataFileWriter = new DataFileWriter<>(coder.createDatumWriter());
- dataFileWriter.create(coder.getSchema(), Channels.newOutputStream(channel));
- }
-
- @Override
- public void write(T value) throws Exception {
- dataFileWriter.append(value);
- }
-
- @Override
- protected void writeFooter() throws Exception {
- dataFileWriter.flush();
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/AvroSource.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/AvroSource.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/AvroSource.java
deleted file mode 100644
index 297663e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/AvroSource.java
+++ /dev/null
@@ -1,647 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.coders.AvroCoder;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
-import com.google.cloud.dataflow.sdk.util.AvroUtils;
-import com.google.cloud.dataflow.sdk.util.AvroUtils.AvroMetadata;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.common.base.Preconditions;
-
-import org.apache.avro.Schema;
-import org.apache.avro.file.CodecFactory;
-import org.apache.avro.file.DataFileConstants;
-import org.apache.avro.generic.GenericDatumReader;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.avro.io.BinaryDecoder;
-import org.apache.avro.io.DatumReader;
-import org.apache.avro.io.DecoderFactory;
-import org.apache.avro.reflect.ReflectData;
-import org.apache.avro.reflect.ReflectDatumReader;
-import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
-import org.apache.commons.compress.compressors.snappy.SnappyCompressorInputStream;
-import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.PushbackInputStream;
-import java.nio.ByteBuffer;
-import java.nio.channels.Channels;
-import java.nio.channels.ReadableByteChannel;
-import java.util.Collection;
-import java.util.zip.Inflater;
-import java.util.zip.InflaterInputStream;
-
-// CHECKSTYLE.OFF: JavadocStyle
-/**
- * A {@link FileBasedSource} for reading Avro files.
- *
- * <p>To read a {@link PCollection} of objects from one or more Avro files, use
- * {@link AvroSource#from} to specify the path(s) of the files to read. The {@link AvroSource} that
- * is returned will read objects of type {@link GenericRecord} with the schema(s) that were written
- * at file creation. To further configure the {@link AvroSource} to read with a user-defined schema,
- * or to return records of a type other than {@link GenericRecord}, use
- * {@link AvroSource#withSchema(Schema)} (using an Avro {@link Schema}),
- * {@link AvroSource#withSchema(String)} (using a JSON schema), or
- * {@link AvroSource#withSchema(Class)} (to return objects of the Avro-generated class specified).
- *
- * <p>An {@link AvroSource} can be read from using the {@link Read} transform. For example:
- *
- * <pre>
- * {@code
- * AvroSource<MyType> source = AvroSource.from(file.toPath()).withSchema(MyType.class);
- * PCollection<MyType> records = Read.from(mySource);
- * }
- * </pre>
- *
- * <p>The {@link AvroSource#readFromFileWithClass(String, Class)} method is a convenience method
- * that returns a read transform. For example:
- *
- * <pre>
- * {@code
- * PCollection<MyType> records = AvroSource.readFromFileWithClass(file.toPath(), MyType.class));
- * }
- * </pre>
- *
- * <p>This class's implementation is based on the <a
- * href="https://avro.apache.org/docs/1.7.7/spec.html">Avro 1.7.7</a> specification and implements
- * parsing of some parts of Avro Object Container Files. The rationale for doing so is that the Avro
- * API does not provide efficient ways of computing the precise offsets of blocks within a file,
- * which is necessary to support dynamic work rebalancing. However, whenever it is possible to use
- * the Avro API in a way that supports maintaining precise offsets, this class uses the Avro API.
- *
- * <p>Avro Object Container files store records in blocks. Each block contains a collection of
- * records. Blocks may be encoded (e.g., with bzip2, deflate, snappy, etc.). Blocks are delineated
- * from one another by a 16-byte sync marker.
- *
- * <p>An {@link AvroSource} for a subrange of a single file contains records in the blocks such that
- * the start offset of the block is greater than or equal to the start offset of the source and less
- * than the end offset of the source.
- *
- * <p>To use XZ-encoded Avro files, please include an explicit dependency on {@code xz-1.5.jar},
- * which has been marked as optional in the Maven {@code sdk/pom.xml} for Google Cloud Dataflow:
- *
- * <pre>{@code
- * <dependency>
- * <groupId>org.tukaani</groupId>
- * <artifactId>xz</artifactId>
- * <version>1.5</version>
- * </dependency>
- * }</pre>
- *
- * <h3>Permissions</h3>
- * <p>Permission requirements depend on the {@link PipelineRunner} that is used to execute the
- * Dataflow job. Please refer to the documentation of corresponding {@link PipelineRunner}s for
- * more details.
- *
- * @param <T> The type of records to be read from the source.
- */
-// CHECKSTYLE.ON: JavadocStyle
-@Experimental(Experimental.Kind.SOURCE_SINK)
-public class AvroSource<T> extends BlockBasedSource<T> {
- // Default minimum bundle size (chosen as two default-size Avro blocks to attempt to
- // ensure that every source has at least one block of records).
- // The default sync interval is 64k.
- static final long DEFAULT_MIN_BUNDLE_SIZE = 2 * DataFileConstants.DEFAULT_SYNC_INTERVAL;
-
- // The JSON schema used to encode records.
- private final String readSchemaString;
-
- // The JSON schema that was used to write the source Avro file (may differ from the schema we will
- // use to read from it).
- private final String fileSchemaString;
-
- // The type of the records contained in the file.
- private final Class<T> type;
-
- // The following metadata fields are not user-configurable. They are extracted from the object
- // container file header upon subsource creation.
-
- // The codec used to encode the blocks in the Avro file. String value drawn from those in
- // https://avro.apache.org/docs/1.7.7/api/java/org/apache/avro/file/CodecFactory.html
- private final String codec;
-
- // The object container file's 16-byte sync marker.
- private final byte[] syncMarker;
-
- // Default output coder, lazily initialized.
- private transient AvroCoder<T> coder = null;
-
- // Schema of the file, lazily initialized.
- private transient Schema fileSchema;
-
- // Schema used to encode records, lazily initialized.
- private transient Schema readSchema;
-
- /**
- * Creates a {@link Read} transform that will read from an {@link AvroSource} that is configured
- * to read records of the given type from a file pattern.
- */
- public static <T> Read.Bounded<T> readFromFileWithClass(String filePattern, Class<T> clazz) {
- return Read.from(new AvroSource<T>(filePattern, DEFAULT_MIN_BUNDLE_SIZE,
- ReflectData.get().getSchema(clazz).toString(), clazz, null, null));
- }
-
- /**
- * Creates an {@link AvroSource} that reads from the given file name or pattern ("glob"). The
- * returned source can be further configured by calling {@link #withSchema} to return a type other
- * than {@link GenericRecord}.
- */
- public static AvroSource<GenericRecord> from(String fileNameOrPattern) {
- return new AvroSource<>(
- fileNameOrPattern, DEFAULT_MIN_BUNDLE_SIZE, null, GenericRecord.class, null, null);
- }
-
- /**
- * Returns an {@link AvroSource} that's like this one but reads files containing records that
- * conform to the given schema.
- *
- * <p>Does not modify this object.
- */
- public AvroSource<GenericRecord> withSchema(String schema) {
- return new AvroSource<>(
- getFileOrPatternSpec(), getMinBundleSize(), schema, GenericRecord.class, codec, syncMarker);
- }
-
- /**
- * Returns an {@link AvroSource} that's like this one but reads files containing records that
- * conform to the given schema.
- *
- * <p>Does not modify this object.
- */
- public AvroSource<GenericRecord> withSchema(Schema schema) {
- return new AvroSource<>(getFileOrPatternSpec(), getMinBundleSize(), schema.toString(),
- GenericRecord.class, codec, syncMarker);
- }
-
- /**
- * Returns an {@link AvroSource} that's like this one but reads files containing records of the
- * type of the given class.
- *
- * <p>Does not modify this object.
- */
- public <X> AvroSource<X> withSchema(Class<X> clazz) {
- return new AvroSource<X>(getFileOrPatternSpec(), getMinBundleSize(),
- ReflectData.get().getSchema(clazz).toString(), clazz, codec, syncMarker);
- }
-
- /**
- * Returns an {@link AvroSource} that's like this one but uses the supplied minimum bundle size.
- * Refer to {@link OffsetBasedSource} for a description of {@code minBundleSize} and its use.
- *
- * <p>Does not modify this object.
- */
- public AvroSource<T> withMinBundleSize(long minBundleSize) {
- return new AvroSource<T>(
- getFileOrPatternSpec(), minBundleSize, readSchemaString, type, codec, syncMarker);
- }
-
- private AvroSource(String fileNameOrPattern, long minBundleSize, String schema, Class<T> type,
- String codec, byte[] syncMarker) {
- super(fileNameOrPattern, minBundleSize);
- this.readSchemaString = schema;
- this.codec = codec;
- this.syncMarker = syncMarker;
- this.type = type;
- this.fileSchemaString = null;
- }
-
- private AvroSource(String fileName, long minBundleSize, long startOffset, long endOffset,
- String schema, Class<T> type, String codec, byte[] syncMarker, String fileSchema) {
- super(fileName, minBundleSize, startOffset, endOffset);
- this.readSchemaString = schema;
- this.codec = codec;
- this.syncMarker = syncMarker;
- this.type = type;
- this.fileSchemaString = fileSchema;
- }
-
- @Override
- public void validate() {
- // AvroSource objects do not need to be configured with more than a file pattern. Overridden to
- // make this explicit.
- super.validate();
- }
-
- @Override
- public BlockBasedSource<T> createForSubrangeOfFile(String fileName, long start, long end) {
- byte[] syncMarker = this.syncMarker;
- String codec = this.codec;
- String readSchemaString = this.readSchemaString;
- String fileSchemaString = this.fileSchemaString;
- // codec and syncMarker are initially null when the source is created, as they differ
- // across input files and must be read from the file. Here, when we are creating a source
- // for a subrange of a file, we can initialize these values. When the resulting AvroSource
- // is further split, they do not need to be read again.
- if (codec == null || syncMarker == null || fileSchemaString == null) {
- AvroMetadata metadata;
- try {
- Collection<String> files = FileBasedSource.expandFilePattern(fileName);
- Preconditions.checkArgument(files.size() <= 1, "More than 1 file matched %s");
- metadata = AvroUtils.readMetadataFromFile(fileName);
- } catch (IOException e) {
- throw new RuntimeException("Error reading metadata from file " + fileName, e);
- }
- codec = metadata.getCodec();
- syncMarker = metadata.getSyncMarker();
- fileSchemaString = metadata.getSchemaString();
- // If the source was created with a null schema, use the schema that we read from the file's
- // metadata.
- if (readSchemaString == null) {
- readSchemaString = metadata.getSchemaString();
- }
- }
- return new AvroSource<T>(fileName, getMinBundleSize(), start, end, readSchemaString, type,
- codec, syncMarker, fileSchemaString);
- }
-
- @Override
- protected BlockBasedReader<T> createSingleFileReader(PipelineOptions options) {
- return new AvroReader<T>(this);
- }
-
- @Override
- public boolean producesSortedKeys(PipelineOptions options) throws Exception {
- return false;
- }
-
- @Override
- public AvroCoder<T> getDefaultOutputCoder() {
- if (coder == null) {
- Schema.Parser parser = new Schema.Parser();
- coder = AvroCoder.of(type, parser.parse(readSchemaString));
- }
- return coder;
- }
-
- public String getSchema() {
- return readSchemaString;
- }
-
- private Schema getReadSchema() {
- if (readSchemaString == null) {
- return null;
- }
-
- // If the schema has not been parsed, parse it.
- if (readSchema == null) {
- Schema.Parser parser = new Schema.Parser();
- readSchema = parser.parse(readSchemaString);
- }
- return readSchema;
- }
-
- private Schema getFileSchema() {
- if (fileSchemaString == null) {
- return null;
- }
-
- // If the schema has not been parsed, parse it.
- if (fileSchema == null) {
- Schema.Parser parser = new Schema.Parser();
- fileSchema = parser.parse(fileSchemaString);
- }
- return fileSchema;
- }
-
- private byte[] getSyncMarker() {
- return syncMarker;
- }
-
- private String getCodec() {
- return codec;
- }
-
- private DatumReader<T> createDatumReader() {
- Schema readSchema = getReadSchema();
- Schema fileSchema = getFileSchema();
- Preconditions.checkNotNull(
- readSchema, "No read schema has been initialized for source %s", this);
- Preconditions.checkNotNull(
- fileSchema, "No file schema has been initialized for source %s", this);
- if (type == GenericRecord.class) {
- return new GenericDatumReader<>(fileSchema, readSchema);
- } else {
- return new ReflectDatumReader<>(fileSchema, readSchema);
- }
- }
-
- /**
- * A {@link BlockBasedSource.Block} of Avro records.
- *
- * @param <T> The type of records stored in the block.
- */
- @Experimental(Experimental.Kind.SOURCE_SINK)
- static class AvroBlock<T> extends Block<T> {
- // The number of records in the block.
- private final long numRecords;
-
- // The current record in the block.
- private T currentRecord;
-
- // The index of the current record in the block.
- private long currentRecordIndex = 0;
-
- // A DatumReader to read records from the block.
- private final DatumReader<T> reader;
-
- // A BinaryDecoder used by the reader to decode records.
- private final BinaryDecoder decoder;
-
- /**
- * Decodes a byte array as an InputStream. The byte array may be compressed using some
- * codec. Reads from the returned stream will result in decompressed bytes.
- *
- * <p>This supports the same codecs as Avro's {@link CodecFactory}, namely those defined in
- * {@link DataFileConstants}.
- *
- * <ul>
- * <li>"snappy" : Google's Snappy compression
- * <li>"deflate" : deflate compression
- * <li>"bzip2" : Bzip2 compression
- * <li>"xz" : xz compression
- * <li>"null" (the string, not the value): Uncompressed data
- * </ul>
- */
- private static InputStream decodeAsInputStream(byte[] data, String codec) throws IOException {
- ByteArrayInputStream byteStream = new ByteArrayInputStream(data);
- switch (codec) {
- case DataFileConstants.SNAPPY_CODEC:
- return new SnappyCompressorInputStream(byteStream);
- case DataFileConstants.DEFLATE_CODEC:
- // nowrap == true: Do not expect ZLIB header or checksum, as Avro does not write them.
- Inflater inflater = new Inflater(true);
- return new InflaterInputStream(byteStream, inflater);
- case DataFileConstants.XZ_CODEC:
- return new XZCompressorInputStream(byteStream);
- case DataFileConstants.BZIP2_CODEC:
- return new BZip2CompressorInputStream(byteStream);
- case DataFileConstants.NULL_CODEC:
- return byteStream;
- default:
- throw new IllegalArgumentException("Unsupported codec: " + codec);
- }
- }
-
- AvroBlock(byte[] data, long numRecords, AvroSource<T> source) throws IOException {
- this.numRecords = numRecords;
- this.reader = source.createDatumReader();
- this.decoder =
- DecoderFactory.get().binaryDecoder(decodeAsInputStream(data, source.getCodec()), null);
- }
-
- @Override
- public T getCurrentRecord() {
- return currentRecord;
- }
-
- @Override
- public boolean readNextRecord() throws IOException {
- if (currentRecordIndex >= numRecords) {
- return false;
- }
- currentRecord = reader.read(null, decoder);
- currentRecordIndex++;
- return true;
- }
-
- @Override
- public double getFractionOfBlockConsumed() {
- return ((double) currentRecordIndex) / numRecords;
- }
- }
-
- /**
- * A {@link BlockBasedSource.BlockBasedReader} for reading blocks from Avro files.
- *
- * <p>An Avro Object Container File consists of a header followed by a 16-byte sync marker
- * and then a sequence of blocks, where each block begins with two encoded longs representing
- * the total number of records in the block and the block's size in bytes, followed by the
- * block's (optionally-encoded) records. Each block is terminated by a 16-byte sync marker.
- *
- * <p>Here, we consider the sync marker that precedes a block to be its offset, as this allows
- * a reader that begins reading at that offset to detect the sync marker and the beginning of
- * the block.
- *
- * @param <T> The type of records contained in the block.
- */
- @Experimental(Experimental.Kind.SOURCE_SINK)
- public static class AvroReader<T> extends BlockBasedReader<T> {
- // The current block.
- private AvroBlock<T> currentBlock;
-
- // Offset of the block.
- private long currentBlockOffset = 0;
-
- // Size of the current block.
- private long currentBlockSizeBytes = 0;
-
- // Current offset within the stream.
- private long currentOffset = 0;
-
- // Stream used to read from the underlying file.
- // A pushback stream is used to restore bytes buffered during seeking/decoding.
- private PushbackInputStream stream;
-
- // Small buffer for reading encoded values from the stream.
- // The maximum size of an encoded long is 10 bytes, and this buffer will be used to read two.
- private final byte[] readBuffer = new byte[20];
-
- // Decoder to decode binary-encoded values from the buffer.
- private BinaryDecoder decoder;
-
- /**
- * Reads Avro records of type {@code T} from the specified source.
- */
- public AvroReader(AvroSource<T> source) {
- super(source);
- }
-
- @Override
- public synchronized AvroSource<T> getCurrentSource() {
- return (AvroSource<T>) super.getCurrentSource();
- }
-
- @Override
- public boolean readNextBlock() throws IOException {
- // The next block in the file is after the first sync marker that can be read starting from
- // the current offset. First, we seek past the next sync marker, if it exists. After a sync
- // marker is the start of a block. A block begins with the number of records contained in
- // the block, encoded as a long, followed by the size of the block in bytes, encoded as a
- // long. After this method, currentOffset should be the first byte after this block, and
- // currentBlockOffset should be the start of the sync marker before this block.
-
- // Seek to the next sync marker, if one exists.
- currentOffset += advancePastNextSyncMarker(stream, getCurrentSource().getSyncMarker());
-
- // The offset of the current block includes its preceding sync marker.
- currentBlockOffset = currentOffset - getCurrentSource().getSyncMarker().length;
-
- // Read a small buffer to parse the block header.
- // We cannot use a BinaryDecoder to do this directly from the stream because a BinaryDecoder
- // internally buffers data and we only want to read as many bytes from the stream as the size
- // of the header. Though BinaryDecoder#inputStream() returns an input stream that is aware of
- // its internal buffering, we would have to re-wrap this input stream to seek for the next
- // block in the file.
- int read = stream.read(readBuffer);
- // We reached the last sync marker in the file.
- if (read <= 0) {
- return false;
- }
- decoder = DecoderFactory.get().binaryDecoder(readBuffer, decoder);
- long numRecords = decoder.readLong();
- long blockSize = decoder.readLong();
-
- // The decoder buffers data internally, but since we know the size of the stream the
- // decoder has constructed from the readBuffer, the number of bytes available in the
- // input stream is equal to the number of unconsumed bytes.
- int headerSize = readBuffer.length - decoder.inputStream().available();
- stream.unread(readBuffer, headerSize, read - headerSize);
-
- // Create the current block by reading blockSize bytes. Block sizes permitted by the Avro
- // specification are [32, 2^30], so this narrowing is ok.
- byte[] data = new byte[(int) blockSize];
- stream.read(data);
- currentBlock = new AvroBlock<>(data, numRecords, getCurrentSource());
- currentBlockSizeBytes = blockSize;
-
- // Update current offset with the number of bytes we read to get the next block.
- currentOffset += headerSize + blockSize;
- return true;
- }
-
- @Override
- public AvroBlock<T> getCurrentBlock() {
- return currentBlock;
- }
-
- @Override
- public long getCurrentBlockOffset() {
- return currentBlockOffset;
- }
-
- @Override
- public long getCurrentBlockSize() {
- return currentBlockSizeBytes;
- }
-
- /**
- * Creates a {@link PushbackInputStream} that has a large enough pushback buffer to be able
- * to push back the syncBuffer and the readBuffer.
- */
- private PushbackInputStream createStream(ReadableByteChannel channel) {
- return new PushbackInputStream(
- Channels.newInputStream(channel),
- getCurrentSource().getSyncMarker().length + readBuffer.length);
- }
-
- /**
- * Starts reading from the provided channel. Assumes that the channel is already seeked to
- * the source's start offset.
- */
- @Override
- protected void startReading(ReadableByteChannel channel) throws IOException {
- stream = createStream(channel);
- currentOffset = getCurrentSource().getStartOffset();
- }
-
- /**
- * Advances to the first byte after the next occurrence of the sync marker in the
- * stream when reading from the current offset. Returns the number of bytes consumed
- * from the stream. Note that this method requires a PushbackInputStream with a buffer
- * at least as big as the marker it is seeking for.
- */
- static long advancePastNextSyncMarker(PushbackInputStream stream, byte[] syncMarker)
- throws IOException {
- Seeker seeker = new Seeker(syncMarker);
- byte[] syncBuffer = new byte[syncMarker.length];
- long totalBytesConsumed = 0;
- // Seek until either a sync marker is found or we reach the end of the file.
- int mark = -1; // Position of the last byte in the sync marker.
- int read; // Number of bytes read.
- do {
- read = stream.read(syncBuffer);
- if (read >= 0) {
- mark = seeker.find(syncBuffer, read);
- // Add the number of bytes read to the running total.
- totalBytesConsumed += read;
- }
- } while (mark < 0 && read > 0);
-
- // If the sync marker was found, unread the bytes read past it and adjust the byte count.
- if (mark >= 0) {
- // The stream position after this call should be just past the sync marker, so we
- // unread the remaining buffer contents and reduce totalBytesConsumed accordingly.
- stream.unread(syncBuffer, mark + 1, read - (mark + 1));
- totalBytesConsumed = totalBytesConsumed - (read - (mark + 1));
- }
- return totalBytesConsumed;
- }
-
- /**
- * A {@link Seeker} looks for a given marker within a byte buffer. Uses naive string matching
- * with a sliding window, as sync markers are small and random.
- */
- static class Seeker {
- // The marker to search for.
- private byte[] marker;
-
- // Buffer used for the sliding window.
- private byte[] searchBuffer;
-
- // Number of bytes available to be matched in the buffer.
- private int available = 0;
-
- /**
- * Create a {@link Seeker} that looks for the given marker.
- */
- public Seeker(byte[] marker) {
- this.marker = marker;
- this.searchBuffer = new byte[marker.length];
- }
-
- /**
- * Find the marker in the byte buffer. Returns the index of the end of the marker in the
- * buffer. If the marker is not found, returns -1.
- *
- * <p>State is maintained between calls. If the marker was partially matched, a subsequent
- * call to find will resume matching the marker.
- *
- * @param buffer the bytes to search; only the first {@code length} bytes are examined
- * @return the index of the end of the marker within the buffer, or -1 if the marker was not
- * found.
- */
- public int find(byte[] buffer, int length) {
- for (int i = 0; i < length; i++) {
- System.arraycopy(searchBuffer, 1, searchBuffer, 0, searchBuffer.length - 1);
- searchBuffer[searchBuffer.length - 1] = buffer[i];
- available = Math.min(available + 1, searchBuffer.length);
- if (ByteBuffer.wrap(searchBuffer, searchBuffer.length - available, available)
- .equals(ByteBuffer.wrap(marker))) {
- available = 0;
- return i;
- }
- }
- return -1;
- }
- }
- }
-}
[19/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Sum.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Sum.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Sum.java
deleted file mode 100644
index 5b30475..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Sum.java
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.util.common.Counter;
-import com.google.cloud.dataflow.sdk.util.common.Counter.AggregationKind;
-import com.google.cloud.dataflow.sdk.util.common.CounterProvider;
-
-/**
- * {@code PTransform}s for computing the sum of the elements in a
- * {@code PCollection}, or the sum of the values associated with
- * each key in a {@code PCollection} of {@code KV}s.
- *
- * <p>Example 1: get the sum of a {@code PCollection} of {@code Double}s.
- * <pre> {@code
- * PCollection<Double> input = ...;
- * PCollection<Double> sum = input.apply(Sum.doublesGlobally());
- * } </pre>
- *
- * <p>Example 2: calculate the sum of the {@code Integer}s
- * associated with each unique key (which is of type {@code String}).
- * <pre> {@code
- * PCollection<KV<String, Integer>> input = ...;
- * PCollection<KV<String, Integer>> sumPerKey = input
- * .apply(Sum.<String>integersPerKey());
- * } </pre>
- */
-public class Sum {
-
- private Sum() {
- // do not instantiate
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<Integer>} and returns a
- * {@code PCollection<Integer>} whose content is the sum of the
- * input {@code PCollection}'s elements, or
- * {@code 0} if there are no elements.
- */
- public static Combine.Globally<Integer, Integer> integersGlobally() {
- return Combine.globally(new SumIntegerFn()).named("Sum.Globally");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<KV<K, Integer>>} and returns a
- * {@code PCollection<KV<K, Integer>>} that contains an output
- * element mapping each distinct key in the input
- * {@code PCollection} to the sum of the values associated with
- * that key in the input {@code PCollection}.
- */
- public static <K> Combine.PerKey<K, Integer, Integer> integersPerKey() {
- return Combine.<K, Integer, Integer>perKey(new SumIntegerFn()).named("Sum.PerKey");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<Long>} and returns a
- * {@code PCollection<Long>} whose content is the sum of the
- * input {@code PCollection}'s elements, or
- * {@code 0} if there are no elements.
- */
- public static Combine.Globally<Long, Long> longsGlobally() {
- return Combine.globally(new SumLongFn()).named("Sum.Globally");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<KV<K, Long>>} and returns a
- * {@code PCollection<KV<K, Long>>} that contains an output
- * element mapping each distinct key in the input
- * {@code PCollection} to the sum of the values associated with
- * that key in the input {@code PCollection}.
- */
- public static <K> Combine.PerKey<K, Long, Long> longsPerKey() {
- return Combine.<K, Long, Long>perKey(new SumLongFn()).named("Sum.PerKey");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<Double>} and returns a
- * {@code PCollection<Double>} whose content is the sum of the
- * input {@code PCollection}'s elements, or
- * {@code 0} if there are no elements.
- */
- public static Combine.Globally<Double, Double> doublesGlobally() {
- return Combine.globally(new SumDoubleFn()).named("Sum.Globally");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<KV<K, Double>>} and returns a
- * {@code PCollection<KV<K, Double>>} that contains an output
- * element mapping each distinct key in the input
- * {@code PCollection} to the sum of the values associated with
- * that key in the input {@code PCollection}.
- */
- public static <K> Combine.PerKey<K, Double, Double> doublesPerKey() {
- return Combine.<K, Double, Double>perKey(new SumDoubleFn()).named("Sum.PerKey");
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A {@link Combine.BinaryCombineIntegerFn} that computes the sum of
- * {@code Integer}s, useful as an argument to
- * {@link Combine#globally} or {@link Combine#perKey}.
- */
- public static class SumIntegerFn
- extends Combine.BinaryCombineIntegerFn implements CounterProvider<Integer> {
- @Override
- public int apply(int a, int b) {
- return a + b;
- }
-
- @Override
- public int identity() {
- return 0;
- }
-
- @Override
- public Counter<Integer> getCounter(String name) {
- return Counter.ints(name, AggregationKind.SUM);
- }
- }
-
- /**
- * A {@link Combine.BinaryCombineLongFn} that computes the sum of
- * {@code Long}s, useful as an argument to
- * {@link Combine#globally} or {@link Combine#perKey}.
- */
- public static class SumLongFn
- extends Combine.BinaryCombineLongFn implements CounterProvider<Long> {
- @Override
- public long apply(long a, long b) {
- return a + b;
- }
-
- @Override
- public long identity() {
- return 0;
- }
-
- @Override
- public Counter<Long> getCounter(String name) {
- return Counter.longs(name, AggregationKind.SUM);
- }
- }
-
- /**
- * A {@link Combine.BinaryCombineDoubleFn} that computes the sum of
- * {@code Double}s, useful as an argument to
- * {@link Combine#globally} or {@link Combine#perKey}.
- */
- public static class SumDoubleFn
- extends Combine.BinaryCombineDoubleFn implements CounterProvider<Double> {
- @Override
- public double apply(double a, double b) {
- return a + b;
- }
-
- @Override
- public double identity() {
- return 0;
- }
-
- @Override
- public Counter<Double> getCounter(String name) {
- return Counter.doubles(name, AggregationKind.SUM);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Top.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Top.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Top.java
deleted file mode 100644
index 98fe53c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Top.java
+++ /dev/null
@@ -1,559 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.coders.CustomCoder;
-import com.google.cloud.dataflow.sdk.coders.ListCoder;
-import com.google.cloud.dataflow.sdk.transforms.Combine.AccumulatingCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.Combine.AccumulatingCombineFn.Accumulator;
-import com.google.cloud.dataflow.sdk.transforms.Combine.PerKey;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
-import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObserver;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Lists;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.List;
-import java.util.PriorityQueue;
-
-/**
- * {@code PTransform}s for finding the largest (or smallest) set
- * of elements in a {@code PCollection}, or the largest (or smallest)
- * set of values associated with each key in a {@code PCollection} of
- * {@code KV}s.
- */
-public class Top {
-
- private Top() {
- // do not instantiate
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<T>} and returns a {@code PCollection<List<T>>} with a
- * single element containing the largest {@code count} elements of the input
- * {@code PCollection<T>}, in decreasing order, sorted using the
- * given {@code Comparator<T>}. The {@code Comparator<T>} must also
- * be {@code Serializable}.
- *
- * <p>If {@code count} {@code >=} the number of elements in the
- * input {@code PCollection}, then all the elements of the input
- * {@code PCollection} will be in the resulting
- * {@code List}, albeit in sorted order.
- *
- * <p>All the elements of the result's {@code List}
- * must fit into the memory of a single machine.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<Student> students = ...;
- * PCollection<List<Student>> top10Students =
- * students.apply(Top.of(10, new CompareStudentsByAvgGrade()));
- * } </pre>
- *
- * <p>By default, the {@code Coder} of the output {@code PCollection}
- * is a {@code ListCoder} of the {@code Coder} of the elements of
- * the input {@code PCollection}.
- *
- * <p>If the input {@code PCollection} is windowed into {@link GlobalWindows},
- * an empty {@code List<T>} in the {@link GlobalWindow} will be output if the input
- * {@code PCollection} is empty. To use this with inputs with other windowing,
- * either {@link Combine.Globally#withoutDefaults withoutDefaults} or
- * {@link Combine.Globally#asSingletonView asSingletonView} must be called.
- *
- * <p>See also {@link #smallest} and {@link #largest}, which sort
- * {@code Comparable} elements using their natural ordering.
- *
- * <p>See also {@link #perKey}, {@link #smallestPerKey}, and
- * {@link #largestPerKey}, which take a {@code PCollection} of
- * {@code KV}s and return the top values associated with each key.
- */
- public static <T, ComparatorT extends Comparator<T> & Serializable>
- Combine.Globally<T, List<T>> of(int count, ComparatorT compareFn) {
- return Combine.globally(new TopCombineFn<>(count, compareFn)).named("Top.Globally");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<T>} and returns a {@code PCollection<List<T>>} with a
- * single element containing the smallest {@code count} elements of the input
- * {@code PCollection<T>}, in increasing order, sorted according to
- * their natural order.
- *
- * <p>If {@code count} {@code >=} the number of elements in the
- * input {@code PCollection}, then all the elements of the input
- * {@code PCollection} will be in the resulting {@code PCollection}'s
- * {@code List}, albeit in sorted order.
- *
- * <p>All the elements of the result {@code List}
- * must fit into the memory of a single machine.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<Integer> values = ...;
- * PCollection<List<Integer>> smallest10Values = values.apply(Top.smallest(10));
- * } </pre>
- *
- * <p>By default, the {@code Coder} of the output {@code PCollection}
- * is a {@code ListCoder} of the {@code Coder} of the elements of
- * the input {@code PCollection}.
- *
- * <p>If the input {@code PCollection} is windowed into {@link GlobalWindows},
- * an empty {@code List<T>} in the {@link GlobalWindow} will be output if the input
- * {@code PCollection} is empty. To use this with inputs with other windowing,
- * either {@link Combine.Globally#withoutDefaults withoutDefaults} or
- * {@link Combine.Globally#asSingletonView asSingletonView} must be called.
- *
- * <p>See also {@link #largest}.
- *
- * <p>See also {@link #of}, which sorts using a user-specified
- * {@code Comparator} function.
- *
- * <p>See also {@link #perKey}, {@link #smallestPerKey}, and
- * {@link #largestPerKey}, which take a {@code PCollection} of
- * {@code KV}s and return the top values associated with each key.
- */
- public static <T extends Comparable<T>> Combine.Globally<T, List<T>> smallest(int count) {
- return Combine.globally(new TopCombineFn<>(count, new Smallest<T>()))
- .named("Smallest.Globally");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<T>} and returns a {@code PCollection<List<T>>} with a
- * single element containing the largest {@code count} elements of the input
- * {@code PCollection<T>}, in decreasing order, sorted according to
- * their natural order.
- *
- * <p>If {@code count} {@code >=} the number of elements in the
- * input {@code PCollection}, then all the elements of the input
- * {@code PCollection} will be in the resulting {@code PCollection}'s
- * {@code List}, albeit in sorted order.
- *
- * <p>All the elements of the result's {@code List}
- * must fit into the memory of a single machine.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<Integer> values = ...;
- * PCollection<List<Integer>> largest10Values = values.apply(Top.largest(10));
- * } </pre>
- *
- * <p>By default, the {@code Coder} of the output {@code PCollection}
- * is a {@code ListCoder} of the {@code Coder} of the elements of
- * the input {@code PCollection}.
- *
- * <p>If the input {@code PCollection} is windowed into {@link GlobalWindows},
- * an empty {@code List<T>} in the {@link GlobalWindow} will be output if the input
- * {@code PCollection} is empty. To use this with inputs with other windowing,
- * either {@link Combine.Globally#withoutDefaults withoutDefaults} or
- * {@link Combine.Globally#asSingletonView asSingletonView} must be called.
- *
- * <p>See also {@link #smallest}.
- *
- * <p>See also {@link #of}, which sorts using a user-specified
- * {@code Comparator} function.
- *
- * <p>See also {@link #perKey}, {@link #smallestPerKey}, and
- * {@link #largestPerKey}, which take a {@code PCollection} of
- * {@code KV}s and return the top values associated with each key.
- */
- public static <T extends Comparable<T>> Combine.Globally<T, List<T>> largest(int count) {
- return Combine.globally(new TopCombineFn<>(count, new Largest<T>())).named("Largest.Globally");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<KV<K, V>>} and returns a
- * {@code PCollection<KV<K, List<V>>>} that contains an output
- * element mapping each distinct key in the input
- * {@code PCollection} to the largest {@code count} values
- * associated with that key in the input
- * {@code PCollection<KV<K, V>>}, in decreasing order, sorted using
- * the given {@code Comparator<V>}. The
- * {@code Comparator<V>} must also be {@code Serializable}.
- *
- * <p>If there are fewer than {@code count} values associated with
- * a particular key, then all those values will be in the result
- * mapping for that key, albeit in sorted order.
- *
- * <p>All the values associated with a single key must fit into the
- * memory of a single machine, but there can be many more
- * {@code KV}s in the resulting {@code PCollection} than can fit
- * into the memory of a single machine.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<KV<School, Student>> studentsBySchool = ...;
- * PCollection<KV<School, List<Student>>> top10StudentsBySchool =
- * studentsBySchool.apply(
- * Top.perKey(10, new CompareStudentsByAvgGrade()));
- * } </pre>
- *
- * <p>By default, the {@code Coder} of the keys of the output
- * {@code PCollection} is the same as that of the keys of the input
- * {@code PCollection}, and the {@code Coder} of the values of the
- * output {@code PCollection} is a {@code ListCoder} of the
- * {@code Coder} of the values of the input {@code PCollection}.
- *
- * <p>See also {@link #smallestPerKey} and {@link #largestPerKey}, which
- * sort {@code Comparable<V>} values using their natural
- * ordering.
- *
- * <p>See also {@link #of}, {@link #smallest}, and {@link #largest}, which
- * take a {@code PCollection} and return the top elements.
- */
- public static <K, V, ComparatorT extends Comparator<V> & Serializable>
- PTransform<PCollection<KV<K, V>>, PCollection<KV<K, List<V>>>>
- perKey(int count, ComparatorT compareFn) {
- return Combine.perKey(
- new TopCombineFn<>(count, compareFn).<K>asKeyedFn()).named("Top.PerKey");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<KV<K, V>>} and returns a
- * {@code PCollection<KV<K, List<V>>>} that contains an output
- * element mapping each distinct key in the input
- * {@code PCollection} to the smallest {@code count} values
- * associated with that key in the input
- * {@code PCollection<KV<K, V>>}, in increasing order, sorted
- * according to their natural order.
- *
- * <p>If there are fewer than {@code count} values associated with
- * a particular key, then all those values will be in the result
- * mapping for that key, albeit in sorted order.
- *
- * <p>All the values associated with a single key must fit into the
- * memory of a single machine, but there can be many more
- * {@code KV}s in the resulting {@code PCollection} than can fit
- * into the memory of a single machine.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<KV<String, Integer>> keyedValues = ...;
- * PCollection<KV<String, List<Integer>>> smallest10ValuesPerKey =
- * keyedValues.apply(Top.smallestPerKey(10));
- * } </pre>
- *
- * <p>By default, the {@code Coder} of the keys of the output
- * {@code PCollection} is the same as that of the keys of the input
- * {@code PCollection}, and the {@code Coder} of the values of the
- * output {@code PCollection} is a {@code ListCoder} of the
- * {@code Coder} of the values of the input {@code PCollection}.
- *
- * <p>See also {@link #largestPerKey}.
- *
- * <p>See also {@link #perKey}, which sorts values using a user-specified
- * {@code Comparator} function.
- *
- * <p>See also {@link #of}, {@link #smallest}, and {@link #largest}, which
- * take a {@code PCollection} and return the top elements.
- */
- public static <K, V extends Comparable<V>>
- PTransform<PCollection<KV<K, V>>, PCollection<KV<K, List<V>>>>
- smallestPerKey(int count) {
- return Combine.perKey(new TopCombineFn<>(count, new Smallest<V>()).<K>asKeyedFn())
- .named("Smallest.PerKey");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<KV<K, V>>} and returns a
- * {@code PCollection<KV<K, List<V>>>} that contains an output
- * element mapping each distinct key in the input
- * {@code PCollection} to the largest {@code count} values
- * associated with that key in the input
- * {@code PCollection<KV<K, V>>}, in decreasing order, sorted
- * according to their natural order.
- *
- * <p>If there are fewer than {@code count} values associated with
- * a particular key, then all those values will be in the result
- * mapping for that key, albeit in sorted order.
- *
- * <p>All the values associated with a single key must fit into the
- * memory of a single machine, but there can be many more
- * {@code KV}s in the resulting {@code PCollection} than can fit
- * into the memory of a single machine.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<KV<String, Integer>> keyedValues = ...;
- * PCollection<KV<String, List<Integer>>> largest10ValuesPerKey =
- * keyedValues.apply(Top.largestPerKey(10));
- * } </pre>
- *
- * <p>By default, the {@code Coder} of the keys of the output
- * {@code PCollection} is the same as that of the keys of the input
- * {@code PCollection}, and the {@code Coder} of the values of the
- * output {@code PCollection} is a {@code ListCoder} of the
- * {@code Coder} of the values of the input {@code PCollection}.
- *
- * <p>See also {@link #smallestPerKey}.
- *
- * <p>See also {@link #perKey}, which sorts values using a user-specified
- * {@code Comparator} function.
- *
- * <p>See also {@link #of}, {@link #smallest}, and {@link #largest}, which
- * take a {@code PCollection} and return the top elements.
- */
- public static <K, V extends Comparable<V>>
- PerKey<K, V, List<V>>
- largestPerKey(int count) {
- return Combine.perKey(
-new TopCombineFn<>(count, new Largest<V>()).<K>asKeyedFn())
- .named("Largest.PerKey");
- }
-
- /**
- * A {@code Serializable} {@code Comparator} that uses the compared elements' natural
- * ordering.
- */
- public static class Largest<T extends Comparable<? super T>>
- implements Comparator<T>, Serializable {
- @Override
- public int compare(T a, T b) {
- return a.compareTo(b);
- }
- }
-
- /**
- * A {@code Serializable} {@code Comparator} that uses the reverse of the compared elements'
- * natural ordering.
- */
- public static class Smallest<T extends Comparable<? super T>>
- implements Comparator<T>, Serializable {
- @Override
- public int compare(T a, T b) {
- return b.compareTo(a);
- }
- }
-
-
- ////////////////////////////////////////////////////////////////////////////
-
- /**
-  * The {@code CombineFn} behind the {@code Top} transforms: it folds any number of {@code T}s
-  * into a single {@code List<T>} of the top {@code count} elements, using {@code compareFn}
-  * to decide which {@code T}s are the largest.
-  *
-  * @param <T> type of element being compared
-  * @param <ComparatorT> type of the serializable comparator used to rank elements
-  */
- public static class TopCombineFn<T, ComparatorT extends Comparator<T> & Serializable>
-     extends AccumulatingCombineFn<T, BoundedHeap<T, ComparatorT>, List<T>> {
-
-   private final int count;
-   private final ComparatorT compareFn;
-
-   /**
-    * @param count number of top elements to keep; rejected by the precondition if negative
-    * @param compareFn comparator that ranks the elements
-    */
-   public TopCombineFn(int count, ComparatorT compareFn) {
-     Preconditions.checkArgument(count >= 0, "count must be >= 0");
-     this.compareFn = compareFn;
-     this.count = count;
-   }
-
-   /** Starts every accumulation from a heap with no elements. */
-   @Override
-   public BoundedHeap<T, ComparatorT> createAccumulator() {
-     List<T> noElements = new ArrayList<T>();
-     return new BoundedHeap<>(count, compareFn, noElements);
-   }
-
-   /** The accumulator coder is built from the input coder; the registry is not consulted. */
-   @Override
-   public Coder<BoundedHeap<T, ComparatorT>> getAccumulatorCoder(
-       CoderRegistry registry, Coder<T> inputCoder) {
-     Coder<BoundedHeap<T, ComparatorT>> heapCoder =
-         new BoundedHeapCoder<>(count, compareFn, inputCoder);
-     return heapCoder;
-   }
-
-   @Override
-   public String getIncompatibleGlobalWindowErrorMessage() {
-     String message =
-         "Default values are not supported in Top.[of, smallest, largest]() if the output "
-         + "PCollection is not windowed by GlobalWindows. Instead, use "
-         + "Top.[of, smallest, largest]().withoutDefaults() to output an empty PCollection if the"
-         + " input PCollection is empty, or Top.[of, smallest, largest]().asSingletonView() to "
-         + "get a PCollection containing the empty list if the input PCollection is empty.";
-     return message;
-   }
- }
-
- /**
- * A heap that stores only a finite number of top elements according to its provided
- * {@code Comparator}. Implemented as an {@link Accumulator} to facilitate implementation of
- * {@link Top}.
- *
- * <p>The contents are held in exactly one of two forms at a time: a {@code PriorityQueue}
- * while elements are being added, or a largest-first {@code List} once output has been
- * requested. The form is switched lazily by {@code maybeAddInput} and {@code asList}.
- *
- * <p>This class is <i>not</i> safe for multithreaded use, except read-only.
- *
- * @param <T> type of element held in the heap
- * @param <ComparatorT> type of the serializable comparator used to rank elements
- */
- static class BoundedHeap<T, ComparatorT extends Comparator<T> & Serializable>
- implements Accumulator<T, BoundedHeap<T, ComparatorT>, List<T>> {
-
- /**
- * A queue with smallest at the head, for quick adds.
- *
- * <p>Only one of asList and asQueue may be non-null.
- */
- private PriorityQueue<T> asQueue;
-
- /**
- * A list with the largest element first, the form returned by extractOutput().
- *
- * <p>Only one of asList and asQueue may be non-null.
- */
- private List<T> asList;
-
- /** The user-provided Comparator. */
- private final ComparatorT compareFn;
-
- /** The maximum size of the heap. */
- private final int maximumSize;
-
- /**
- * Creates a new heap with the provided size, comparator, and initial elements.
- *
- * <p>The heap starts in list form; {@code asList} is stored as given, without copying.
- */
- private BoundedHeap(int maximumSize, ComparatorT compareFn, List<T> asList) {
- this.maximumSize = maximumSize;
- this.asList = asList;
- this.compareFn = compareFn;
- }
-
- /** Offers {@code value} to the heap; whether it was retained is not reported. */
- @Override
- public void addInput(T value) {
- maybeAddInput(value);
- }
-
- /**
- * Adds {@code value} to this heap if the heap is not yet full, or if {@code value} compares
- * greater than the smallest retained element, which is then evicted to make room.
- * Returns {@code true} if {@code value} was added.
- */
- private boolean maybeAddInput(T value) {
- if (maximumSize == 0) {
- // Don't add anything.
- return false;
- }
-
- // If asQueue == null, then this is the first add after the latest call to the
- // constructor or asList().
- if (asQueue == null) {
- asQueue = new PriorityQueue<>(maximumSize, compareFn);
- for (T item : asList) {
- asQueue.add(item);
- }
- asList = null;
- }
-
- if (asQueue.size() < maximumSize) {
- asQueue.add(value);
- return true;
- } else if (compareFn.compare(value, asQueue.peek()) > 0) {
- // Heap is full: replace the current smallest element (the queue head) with value.
- asQueue.poll();
- asQueue.add(value);
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Merges the elements of {@code accumulator} into this heap.
- *
- * <p>Calls {@code accumulator.asList()}, which switches the other accumulator to list form.
- */
- @Override
- public void mergeAccumulator(BoundedHeap<T, ComparatorT> accumulator) {
- for (T value : accumulator.asList()) {
- if (!maybeAddInput(value)) {
- // If this element of accumulator does not make the top N, neither
- // will the rest, which are all smaller.
- break;
- }
- }
- }
-
- /** Returns the retained elements, largest first. */
- @Override
- public List<T> extractOutput() {
- return asList();
- }
-
- /**
- * Returns the contents of this Heap as a List sorted largest-to-smallest.
- *
- * <p>If the heap is currently in queue form, the queue is drained and discarded, and the
- * resulting list is cached until the next add.
- */
- private List<T> asList() {
- if (asList == null) {
- List<T> smallestFirstList = Lists.newArrayListWithCapacity(asQueue.size());
- // Polling yields the smallest element first, so the drained list is reversed below.
- while (!asQueue.isEmpty()) {
- smallestFirstList.add(asQueue.poll());
- }
- asList = Lists.reverse(smallestFirstList);
- asQueue = null;
- }
- return asList;
- }
- }
-
- /**
-  * A {@link Coder} for {@link BoundedHeap}, built on {@link CustomCoder}. A heap is encoded as
-  * the list of its elements using a {@code ListCoder} of the element coder; the size bound and
-  * the comparator are held by this coder and supplied again when a heap is decoded.
-  */
- private static class BoundedHeapCoder<T, ComparatorT extends Comparator<T> & Serializable>
-     extends CustomCoder<BoundedHeap<T, ComparatorT>> {
-   private final Coder<List<T>> listCoder;
-   private final ComparatorT compareFn;
-   private final int maximumSize;
-
-   public BoundedHeapCoder(int maximumSize, ComparatorT compareFn, Coder<T> elementCoder) {
-     this.maximumSize = maximumSize;
-     this.compareFn = compareFn;
-     this.listCoder = ListCoder.of(elementCoder);
-   }
-
-   @Override
-   public void encode(
-       BoundedHeap<T, ComparatorT> value, OutputStream outStream, Context context)
-       throws CoderException, IOException {
-     List<T> contents = value.asList();
-     listCoder.encode(contents, outStream, context);
-   }
-
-   @Override
-   public BoundedHeap<T, ComparatorT> decode(InputStream inStream, Coder.Context context)
-       throws CoderException, IOException {
-     List<T> contents = listCoder.decode(inStream, context);
-     return new BoundedHeap<>(maximumSize, compareFn, contents);
-   }
-
-   @Override
-   public void verifyDeterministic() throws NonDeterministicException {
-     // Determinism of this coder is delegated to the list coder.
-     verifyDeterministic("HeapCoder requires a deterministic list coder", listCoder);
-   }
-
-   @Override
-   public boolean isRegisterByteSizeObserverCheap(
-       BoundedHeap<T, ComparatorT> value, Context context) {
-     List<T> contents = value.asList();
-     return listCoder.isRegisterByteSizeObserverCheap(contents, context);
-   }
-
-   @Override
-   public void registerByteSizeObserver(
-       BoundedHeap<T, ComparatorT> value, ElementByteSizeObserver observer, Context context)
-       throws Exception {
-     List<T> contents = value.asList();
-     listCoder.registerByteSizeObserver(contents, observer, context);
-   }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Values.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Values.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Values.java
deleted file mode 100644
index d84bc77..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Values.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-/**
- * {@code Values<V>} is a {@code PTransform} that takes a {@code PCollection} of
- * {@code KV<K, V>}s and returns a {@code PCollection<V>} holding the value of each pair.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<KV<String, Long>> wordCounts = ...;
- * PCollection<Long> counts = wordCounts.apply(Values.<String>create());
- * } </pre>
- *
- * <p>Each output element has the same timestamp and is in the same windows
- * as its corresponding input element, and the output {@code PCollection}
- * has the same
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * associated with it as the input.
- *
- * <p>See also {@link Keys}.
- *
- * @param <V> the type of the values in the input {@code PCollection},
- * and the type of the elements in the output {@code PCollection}
- */
-public class Values<V> extends PTransform<PCollection<? extends KV<?, V>>,
-                                          PCollection<V>> {
-  /**
-   * Returns a {@code Values<V>} {@code PTransform}.
-   *
-   * @param <V> the type of the values in the input {@code PCollection},
-   * and the type of the elements in the output {@code PCollection}
-   */
-  public static <V> Values<V> create() {
-    return new Values<>();
-  }
-
-  // Instances are obtained through create().
-  private Values() { }
-
-  @Override
-  public PCollection<V> apply(PCollection<? extends KV<?, V>> in) {
-    // Emits only the value half of each incoming pair.
-    DoFn<KV<?, V>, V> extractValue = new DoFn<KV<?, V>, V>() {
-      @Override
-      public void processElement(ProcessContext c) {
-        KV<?, V> pair = c.element();
-        c.output(pair.getValue());
-      }
-    };
-    return in.apply(ParDo.named("Values").of(extractValue));
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/View.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/View.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/View.java
deleted file mode 100644
index e2c4487..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/View.java
+++ /dev/null
@@ -1,470 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
-import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
-import com.google.cloud.dataflow.sdk.util.PCollectionViews;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-
-import java.util.List;
-import java.util.Map;
-
-/**
- * Transforms for creating {@link PCollectionView PCollectionViews} from
- * {@link PCollection PCollections} (to read them as side inputs).
- *
- * <p>While a {@link PCollection PCollection<ElemT>} has many values of type {@code ElemT} per
- * window, a {@link PCollectionView PCollectionView<ViewT>} has a single value of type
- * {@code ViewT} for each window. It can be thought of as a mapping from windows to values of
- * type {@code ViewT}. The transforms here represent ways of converting the {@code ElemT} values
- * in a window into a {@code ViewT} for that window.
- *
- * <p>When a {@link ParDo} transform is processing a main input
- * element in a window {@code w} and a {@link PCollectionView} is read via
- * {@link DoFn.ProcessContext#sideInput}, the value of the view for {@code w} is
- * returned.
- *
- * <p>The SDK supports viewing a {@link PCollection}, per window, as a single value,
- * a {@link List}, an {@link Iterable}, a {@link Map}, or a multimap (iterable-valued {@link Map}).
- *
- * <p>For a {@link PCollection} that contains a single value of type {@code T}
- * per window, such as the output of {@link Combine#globally},
- * use {@link View#asSingleton()} to prepare it for use as a side input:
- *
- * <pre>
- * {@code
- * PCollectionView<T> output = someOtherPCollection
- * .apply(Combine.globally(...))
- * .apply(View.<T>asSingleton());
- * }
- * </pre>
- *
- * <p>For a small {@link PCollection} with windows that can fit entirely in memory,
- * use {@link View#asList()} to prepare it for use as a {@code List}.
- * When read as a side input, the entire list for a window will be cached in memory.
- *
- * <pre>
- * {@code
- * PCollectionView<List<T>> output =
- * smallPCollection.apply(View.<T>asList());
- * }
- * </pre>
- *
- * <p>If a {@link PCollection} of {@code KV<K, V>} is known to
- * have a single value per window for each key, then use {@link View#asMap()}
- * to view it as a {@code Map<K, V>}:
- *
- * <pre>
- * {@code
- * PCollectionView<Map<K, V>> output =
- * somePCollection.apply(View.<K, V>asMap());
- * }
- * </pre>
- *
- * <p>Otherwise, to access a {@link PCollection} of {@code KV<K, V>} as a
- * {@code Map<K, Iterable<V>>} side input, use {@link View#asMultimap()}:
- *
- * <pre>
- * {@code
- * PCollectionView<Map<K, Iterable<V>>> output =
- * somePCollection.apply(View.<K, V>asMultimap());
- * }
- * </pre>
- *
- * <p>To iterate over an entire window of a {@link PCollection} via
- * side input, use {@link View#asIterable()}:
- *
- * <pre>
- * {@code
- * PCollectionView<Iterable<T>> output =
- * somePCollection.apply(View.<T>asIterable());
- * }
- * </pre>
- *
- *
- * <p>Both {@link View#asMultimap()} and {@link View#asMap()} are useful
- * for implementing lookup based "joins" with the main input, when the
- * side input is small enough to fit into memory.
- *
- * <p>For example, if you represent a page on a website via some {@code Page} object and
- * have some type {@code UrlVisits} logging that a URL was visited, you could convert these
- * to more fully structured {@code PageVisit} objects using a side input, something like the
- * following:
- *
- * <pre>
- * {@code
- * PCollection<Page> pages = ... // pages fit into memory
- * PCollection<UrlVisit> urlVisits = ... // very large collection
- * final PCollectionView<Map<URL, Page>> urlToPage = pages
- * .apply(WithKeys.of( ... )) // extract the URL from the page
- * .apply(View.<URL, Page>asMap());
- *
- * PCollection<PageVisit> pageVisits = urlVisits
- * .apply(ParDo.withSideInputs(urlToPage)
- * .of(new DoFn<UrlVisit, PageVisit>() {
- * {@literal @}Override
- * void processElement(ProcessContext context) {
- * UrlVisit urlVisit = context.element();
- * Page page = context.sideInput(urlToPage).get(urlVisit.getUrl());
- * context.output(new PageVisit(page, urlVisit.getVisitData()));
- * }
- * }));
- * }
- * </pre>
- *
- * <p>See {@link ParDo#withSideInputs} for details on how to access
- * this variable inside a {@link ParDo} over another {@link PCollection}.
- */
-public class View {
-
- // Do not instantiate: this class only hosts static factory methods and nested transforms.
- private View() { }
-
- /**
- * Returns a {@link AsSingleton} transform that takes a
- * {@link PCollection} with a single value per window
- * as input and produces a {@link PCollectionView} that returns
- * the value in the main input window when read as a side input.
- *
- * <pre>
- * {@code
- * PCollection<InputT> input = ...
- * CombineFn<InputT, OutputT> yourCombineFn = ...
- * PCollectionView<OutputT> output = input
- * .apply(Combine.globally(yourCombineFn))
- * .apply(View.<OutputT>asSingleton());
- * }</pre>
- *
- * <p>If the input {@link PCollection} is empty,
- * throws {@link java.util.NoSuchElementException} in the consuming
- * {@link DoFn}.
- *
- * <p>If the input {@link PCollection} contains more than one
- * element, throws {@link IllegalArgumentException} in the
- * consuming {@link DoFn}.
- *
- * @param <T> the type of the elements of the input {@link PCollection} and of the view's value
- */
- public static <T> AsSingleton<T> asSingleton() {
- return new AsSingleton<>();
- }
-
- /**
- * Returns a {@link View.AsList} transform that takes a {@link PCollection} and returns a
- * {@link PCollectionView} mapping each window to a {@link List} containing
- * all of the elements in the window.
- *
- * <p>The resulting list is required to fit in memory.
- *
- * @param <T> the type of the elements of the input {@link PCollection} and of the list
- */
- public static <T> AsList<T> asList() {
- return new AsList<>();
- }
-
- /**
- * Returns a {@link View.AsIterable} transform that takes a {@link PCollection} as input
- * and produces a {@link PCollectionView} mapping each window to an
- * {@link Iterable} of the values in that window.
- *
- * <p>The values of the {@link Iterable} for a window are not required to fit in memory,
- * but they may also not be effectively cached. If it is known that every window fits in memory,
- * and stronger caching is desired, use {@link #asList}.
- *
- * @param <T> the type of the elements of the input {@link PCollection} and of the iterable
- */
- public static <T> AsIterable<T> asIterable() {
- return new AsIterable<>();
- }
-
- /**
-  * Returns a {@link View.AsMap} transform that takes a {@code PCollection<KV<K, V>>} as
-  * input and produces a {@link PCollectionView} mapping each window to
-  * a {@code Map<K, V>}. It is required that each key of the input be
-  * associated with a single value, per window. If this is not the case, precede this
-  * view with {@code Combine.perKey}, as in the example below, or alternatively
-  * use {@link View#asMultimap()}.
-  *
-  * <pre>
-  * {@code
-  * PCollection<KV<K, V>> input = ...
-  * CombineFn<V, OutputT> yourCombineFn = ...
-  * PCollectionView<Map<K, OutputT>> output = input
-  *     .apply(Combine.perKey(yourCombineFn.<K>asKeyedFn()))
-  *     .apply(View.<K, OutputT>asMap());
-  * }</pre>
-  *
-  * <p>Currently, the resulting map is required to fit into memory.
-  *
-  * @param <K> the type of the keys of the input pairs and of the resulting map
-  * @param <V> the type of the values of the input pairs and of the resulting map
-  */
- public static <K, V> AsMap<K, V> asMap() {
-   return new AsMap<>();
- }
-
- /**
-  * Returns a {@link View.AsMultimap} transform that takes a
-  * {@code PCollection<KV<K, V>>}
-  * as input and produces a {@link PCollectionView} mapping
-  * each window to its contents as a {@code Map<K, Iterable<V>>}
-  * for use as a side input.
-  * In contrast to {@link View#asMap()}, it is not required that the keys in the
-  * input collection be unique.
-  *
-  * <pre>
-  * {@code
-  * PCollection<KV<K, V>> input = ... // maybe more than one occurrence of some keys
-  * PCollectionView<Map<K, Iterable<V>>> output = input.apply(View.<K, V>asMultimap());
-  * }</pre>
-  *
-  * <p>Currently, the resulting map is required to fit into memory.
-  *
-  * @param <K> the type of the keys of the input pairs and of the resulting map
-  * @param <V> the type of the values of the input pairs
-  */
- public static <K, V> AsMultimap<K, V> asMultimap() {
-   return new AsMultimap<>();
- }
-
- /**
-  * The transform returned by {@link View#asList()}.
-  *
-  * <p>Not intended for direct use by pipeline authors; public only so a {@link PipelineRunner}
-  * may override its behavior.
-  */
- public static class AsList<T> extends PTransform<PCollection<T>, PCollectionView<List<T>>> {
-   private AsList() { }
-
-   @Override
-   public void validate(PCollection<T> input) {
-     try {
-       GroupByKey.applicableTo(input);
-     } catch (IllegalStateException cause) {
-       // Re-throw with side-input context, keeping the original failure as the cause.
-       throw new IllegalStateException("Unable to create a side-input view from input", cause);
-     }
-   }
-
-   @Override
-   public PCollectionView<List<T>> apply(PCollection<T> input) {
-     PCollectionView<List<T>> listView = PCollectionViews.listView(
-         input.getPipeline(), input.getWindowingStrategy(), input.getCoder());
-     return input.apply(CreatePCollectionView.<T, List<T>>of(listView));
-   }
- }
-
- /**
-  * The transform returned by {@link View#asIterable()}.
-  *
-  * <p>Not intended for direct use by pipeline authors; public only so a {@link PipelineRunner}
-  * may override its behavior.
-  */
- public static class AsIterable<T>
-     extends PTransform<PCollection<T>, PCollectionView<Iterable<T>>> {
-   private AsIterable() { }
-
-   @Override
-   public void validate(PCollection<T> input) {
-     try {
-       GroupByKey.applicableTo(input);
-     } catch (IllegalStateException cause) {
-       // Re-throw with side-input context, keeping the original failure as the cause.
-       throw new IllegalStateException("Unable to create a side-input view from input", cause);
-     }
-   }
-
-   @Override
-   public PCollectionView<Iterable<T>> apply(PCollection<T> input) {
-     PCollectionView<Iterable<T>> iterableView = PCollectionViews.iterableView(
-         input.getPipeline(), input.getWindowingStrategy(), input.getCoder());
-     return input.apply(CreatePCollectionView.<T, Iterable<T>>of(iterableView));
-   }
- }
-
- /**
-  * The transform returned by {@link View#asSingleton()}.
-  *
-  * <p>Not intended for direct use by pipeline authors; public only so a {@link PipelineRunner}
-  * may override its behavior.
-  */
- public static class AsSingleton<T> extends PTransform<PCollection<T>, PCollectionView<T>> {
-   private final T defaultValue;
-   private final boolean hasDefault;
-
-   // No default: hasDefault is false and defaultValue is left null.
-   private AsSingleton() {
-     this.hasDefault = false;
-     this.defaultValue = null;
-   }
-
-   // Explicit default: hasDefault is true regardless of the value given.
-   private AsSingleton(T defaultValue) {
-     this.hasDefault = true;
-     this.defaultValue = defaultValue;
-   }
-
-   /**
-    * Returns whether this transform has a default value.
-    */
-   public boolean hasDefaultValue() {
-     return hasDefault;
-   }
-
-   /**
-    * Returns the default value of this transform, or null if there isn't one.
-    */
-   public T defaultValue() {
-     return defaultValue;
-   }
-
-   /**
-    * Returns a new {@code AsSingleton} that uses {@code defaultValue} for windows with no
-    * value in them; this transform is left unchanged.
-    */
-   public AsSingleton<T> withDefaultValue(T defaultValue) {
-     return new AsSingleton<>(defaultValue);
-   }
-
-   @Override
-   public void validate(PCollection<T> input) {
-     try {
-       GroupByKey.applicableTo(input);
-     } catch (IllegalStateException cause) {
-       // Re-throw with side-input context, keeping the original failure as the cause.
-       throw new IllegalStateException("Unable to create a side-input view from input", cause);
-     }
-   }
-
-   @Override
-   public PCollectionView<T> apply(PCollection<T> input) {
-     PCollectionView<T> singletonView = PCollectionViews.singletonView(
-         input.getPipeline(),
-         input.getWindowingStrategy(),
-         hasDefault,
-         defaultValue,
-         input.getCoder());
-     return input.apply(CreatePCollectionView.<T, T>of(singletonView));
-   }
- }
-
- /**
-  * The transform returned by {@link View#asMultimap()}.
-  *
-  * <p>Not intended for direct use by pipeline authors; public only so a {@link PipelineRunner}
-  * may override its behavior.
-  */
- public static class AsMultimap<K, V>
-     extends PTransform<PCollection<KV<K, V>>, PCollectionView<Map<K, Iterable<V>>>> {
-   private AsMultimap() { }
-
-   @Override
-   public void validate(PCollection<KV<K, V>> input) {
-     try {
-       GroupByKey.applicableTo(input);
-     } catch (IllegalStateException cause) {
-       // Re-throw with side-input context, keeping the original failure as the cause.
-       throw new IllegalStateException("Unable to create a side-input view from input", cause);
-     }
-   }
-
-   @Override
-   public PCollectionView<Map<K, Iterable<V>>> apply(PCollection<KV<K, V>> input) {
-     PCollectionView<Map<K, Iterable<V>>> multimapView = PCollectionViews.multimapView(
-         input.getPipeline(),
-         input.getWindowingStrategy(),
-         input.getCoder());
-     return input.apply(
-         CreatePCollectionView.<KV<K, V>, Map<K, Iterable<V>>>of(multimapView));
-   }
- }
-
- /**
-  * The transform returned by {@link View#asMap()}.
-  *
-  * <p>Not intended for direct use by pipeline authors; public only so a {@link PipelineRunner}
-  * may override its behavior.
-  */
- public static class AsMap<K, V>
-     extends PTransform<PCollection<KV<K, V>>, PCollectionView<Map<K, V>>> {
-   private AsMap() { }
-
-   /**
-    * @deprecated this method simply returns this AsMap unmodified
-    */
-   @Deprecated()
-   public AsMap<K, V> withSingletonValues() {
-     return this;
-   }
-
-   @Override
-   public void validate(PCollection<KV<K, V>> input) {
-     try {
-       GroupByKey.applicableTo(input);
-     } catch (IllegalStateException cause) {
-       // Re-throw with side-input context, keeping the original failure as the cause.
-       throw new IllegalStateException("Unable to create a side-input view from input", cause);
-     }
-   }
-
-   @Override
-   public PCollectionView<Map<K, V>> apply(PCollection<KV<K, V>> input) {
-     PCollectionView<Map<K, V>> mapView = PCollectionViews.mapView(
-         input.getPipeline(),
-         input.getWindowingStrategy(),
-         input.getCoder());
-     return input.apply(CreatePCollectionView.<KV<K, V>, Map<K, V>>of(mapView));
-   }
- }
-
- ////////////////////////////////////////////////////////////////////////////
- // Internal details below
-
- /**
- * Creates a primitive {@link PCollectionView}.
- *
- * <p>For internal use only by runner implementors.
- *
- * <p>The view is supplied at construction time; {@code apply} returns it as-is. Loading this
- * class also registers a default evaluator for it with {@link DirectPipelineRunner}.
- *
- * @param <ElemT> The type of the elements of the input PCollection
- * @param <ViewT> The type associated with the {@link PCollectionView} used as a side input
- */
- public static class CreatePCollectionView<ElemT, ViewT>
- extends PTransform<PCollection<ElemT>, PCollectionView<ViewT>> {
- /** The view handed out by this transform. */
- private PCollectionView<ViewT> view;
-
- private CreatePCollectionView(PCollectionView<ViewT> view) {
- this.view = view;
- }
-
- /** Returns a transform that produces the given {@code view}. */
- public static <ElemT, ViewT> CreatePCollectionView<ElemT, ViewT> of(
- PCollectionView<ViewT> view) {
- return new CreatePCollectionView<>(view);
- }
-
- /** Returns the view this transform was created with. */
- public PCollectionView<ViewT> getView() {
- return view;
- }
-
- /** Returns the pre-built view; {@code input} itself is not read here. */
- @Override
- public PCollectionView<ViewT> apply(PCollection<ElemT> input) {
- return view;
- }
-
- // Runs once at class-load time: registers the default DirectPipelineRunner evaluator.
- static {
- DirectPipelineRunner.registerDefaultTransformEvaluator(
- CreatePCollectionView.class,
- new DirectPipelineRunner.TransformEvaluator<CreatePCollectionView>() {
- // The evaluator is registered against the raw class, so evaluate() receives a raw
- // transform and forwards to the generic helper to get ElemT/ViewT back in scope.
- @SuppressWarnings("rawtypes")
- @Override
- public void evaluate(
- CreatePCollectionView transform,
- DirectPipelineRunner.EvaluationContext context) {
- evaluateTyped(transform, context);
- }
-
- // Reads the windowed values of the transform's input and stores them in the
- // evaluation context as the contents of the transform's output view.
- private <ElemT, ViewT> void evaluateTyped(
- CreatePCollectionView<ElemT, ViewT> transform,
- DirectPipelineRunner.EvaluationContext context) {
- List<WindowedValue<ElemT>> elems =
- context.getPCollectionWindowedValues(context.getInput(transform));
- context.setPCollectionView(context.getOutput(transform), elems);
- }
- });
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/WithKeys.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/WithKeys.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/WithKeys.java
deleted file mode 100644
index c06795c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/WithKeys.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.coders.KvCoder;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-/**
- * {@code WithKeys<K, V>} pairs every element of a {@code PCollection<V>} with a key of type
- * {@code K}, producing a {@code PCollection<KV<K, V>>}. The key is either a constant or is
- * computed from the element by a function from {@code V} to {@code K}.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<String> words = ...;
- * PCollection<KV<Integer, String>> lengthsToWords =
- *     words.apply(WithKeys.of(new SerializableFunction<String, Integer>() {
- *         public Integer apply(String s) { return s.length(); } }));
- * } </pre>
- *
- * <p>Each output element has the same timestamp and is in the same windows
- * as its corresponding input element, and the output {@code PCollection}
- * has the same
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * associated with it as the input.
- *
- * @param <K> the type of the keys in the output {@code PCollection}
- * @param <V> the type of the elements in the input
- * {@code PCollection} and the values in the output
- * {@code PCollection}
- */
-public class WithKeys<K, V> extends PTransform<PCollection<V>,
-                                               PCollection<KV<K, V>>> {
-
-  private SerializableFunction<V, K> fn;
-  private transient Class<K> keyClass;
-
-  private WithKeys(SerializableFunction<V, K> fn, Class<K> keyClass) {
-    this.fn = fn;
-    this.keyClass = keyClass;
-  }
-
-  /////////////////////////////////////////////////////////////////////////////
-
-  /**
-   * Returns a {@code PTransform} that takes a {@code PCollection<V>}
-   * and returns a {@code PCollection<KV<K, V>>}, where each of the
-   * values in the input {@code PCollection} has been paired with a
-   * key computed from the value by invoking the given
-   * {@code SerializableFunction}.
-   *
-   * <p>If using a lambda in Java 8, {@link #withKeyType(TypeDescriptor)} must
-   * be called on the result {@link PTransform}.
-   */
-  public static <K, V> WithKeys<K, V> of(SerializableFunction<V, K> fn) {
-    // No key class is recorded here; apply() then asks the registry to infer it from fn.
-    return new WithKeys<>(fn, null);
-  }
-
-  /**
-   * Returns a {@code PTransform} that takes a {@code PCollection<V>}
-   * and returns a {@code PCollection<KV<K, V>>}, where each of the
-   * values in the input {@code PCollection} has been paired with the
-   * given key.
-   */
-  @SuppressWarnings("unchecked")
-  public static <K, V> WithKeys<K, V> of(final K key) {
-    // A null key carries no runtime class, so the key class is left unset in that case.
-    Class<K> constantKeyClass = (key == null) ? null : (Class<K>) key.getClass();
-    SerializableFunction<V, K> constantKeyFn = new SerializableFunction<V, K>() {
-      @Override
-      public K apply(V value) {
-        return key;
-      }
-    };
-    return new WithKeys<>(constantKeyFn, constantKeyClass);
-  }
-
-  /**
-   * Return a {@link WithKeys} that is like this one with the specified key type descriptor.
-   *
-   * <p>For use with lambdas in Java 8, either this method must be called with an appropriate
-   * type descriptor or {@link PCollection#setCoder(Coder)} must be called on the output
-   * {@link PCollection}.
-   */
-  public WithKeys<K, V> withKeyType(TypeDescriptor<K> keyType) {
-    // Safe cast
-    @SuppressWarnings("unchecked")
-    Class<K> rawType = (Class<K>) keyType.getRawType();
-    return new WithKeys<>(fn, rawType);
-  }
-
-  @Override
-  public PCollection<KV<K, V>> apply(PCollection<V> in) {
-    DoFn<V, KV<K, V>> addKey = new DoFn<V, KV<K, V>>() {
-      @Override
-      public void processElement(ProcessContext c) {
-        c.output(KV.of(fn.apply(c.element()), c.element()));
-      }
-    };
-    PCollection<KV<K, V>> result = in.apply(ParDo.named("AddKeys").of(addKey));
-
-    try {
-      CoderRegistry coderRegistry = in.getPipeline().getCoderRegistry();
-      Coder<K> keyCoder;
-      if (keyClass != null) {
-        keyCoder = coderRegistry.getDefaultCoder(TypeDescriptor.of(keyClass));
-      } else {
-        keyCoder = coderRegistry.getDefaultOutputCoder(fn, in.getCoder());
-      }
-      // TODO: Remove when we can set the coder inference context.
-      result.setCoder(KvCoder.of(keyCoder, in.getCoder()));
-    } catch (CannotProvideCoderException exc) {
-      // let lazy coder inference have a try
-    }
-
-    return result;
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/WithTimestamps.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/WithTimestamps.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/WithTimestamps.java
deleted file mode 100644
index 85a93bf..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/WithTimestamps.java
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.google.cloud.dataflow.sdk.io.Source;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-
-/**
- * A {@link PTransform} for assigning timestamps to all the elements of a {@link PCollection}.
- *
- * <p>Timestamps are used to assign {@link BoundedWindow Windows} to elements within the
- * {@link Window#into(com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn)}
- * {@link PTransform}. Assigning timestamps is useful when the input data set comes from a
- * {@link Source} without implicit timestamps (such as
- * {@link com.google.cloud.dataflow.sdk.io.TextIO.Read TextIO}).
- *
- */
-public class WithTimestamps<T> extends PTransform<PCollection<T>, PCollection<T>> {
- /**
- * For a {@link SerializableFunction} {@code fn} from {@code T} to {@link Instant}, outputs a
- * {@link PTransform} that takes an input {@link PCollection PCollection<T>} and outputs a
- * {@link PCollection PCollection<T>} containing every element {@code v} in the input where
- * each element is output with a timestamp obtained as the result of {@code fn.apply(v)}.
- *
- * <p>If the input {@link PCollection} elements have timestamps, the output timestamp for each
- * element must not be before the input element's timestamp minus the value of
- * {@link #getAllowedTimestampSkew()}. If an output timestamp is before this time, the transform
- * will throw an {@link IllegalArgumentException} when executed. Use
- * {@link #withAllowedTimestampSkew(Duration)} to update the allowed skew.
- *
- * <p>Each output element will be in the same windows as the input element. If a new window based
- * on the new output timestamp is desired, apply a new instance of {@link Window#into(WindowFn)}.
- *
- * <p>This transform will fail at execution time with a {@link NullPointerException} if for any
- * input element the result of {@code fn.apply(v)} is {@code null}.
- *
- * <p>Example of use in Java 8:
- * <pre>{@code
- * PCollection<Record> timestampedRecords = records.apply(
- * WithTimestamps.of((Record rec) -> rec.getInstant()));
- * }</pre>
- */
- public static <T> WithTimestamps<T> of(SerializableFunction<T, Instant> fn) {
- return new WithTimestamps<>(fn, Duration.ZERO);
- }
-
- ///////////////////////////////////////////////////////////////////
-
- private final SerializableFunction<T, Instant> fn;
- private final Duration allowedTimestampSkew;
-
- private WithTimestamps(SerializableFunction<T, Instant> fn, Duration allowedTimestampSkew) {
- this.fn = checkNotNull(fn, "WithTimestamps fn cannot be null");
- this.allowedTimestampSkew = allowedTimestampSkew;
- }
-
- /**
- * Return a new WithTimestamps like this one with updated allowed timestamp skew, which is the
- * maximum duration that timestamps can be shifted backward. Does not modify this object.
- *
- * <p>The default value is {@code Duration.ZERO}, allowing timestamps to only be shifted into the
- * future. For infinite skew, use {@code new Duration(Long.MAX_VALUE)}.
- */
- public WithTimestamps<T> withAllowedTimestampSkew(Duration allowedTimestampSkew) {
- return new WithTimestamps<>(this.fn, allowedTimestampSkew);
- }
-
- /**
- * Returns the allowed timestamp skew duration, which is the maximum
- * duration that timestamps can be shifted backwards from the timestamp of the input element.
- *
- * @see DoFn#getAllowedTimestampSkew()
- */
- public Duration getAllowedTimestampSkew() {
- return allowedTimestampSkew;
- }
-
- @Override
- public PCollection<T> apply(PCollection<T> input) {
- return input
- .apply(ParDo.named("AddTimestamps").of(new AddTimestampsDoFn<T>(fn, allowedTimestampSkew)))
- .setTypeDescriptorInternal(input.getTypeDescriptor());
- }
-
- private static class AddTimestampsDoFn<T> extends DoFn<T, T> {
- private final SerializableFunction<T, Instant> fn;
- private final Duration allowedTimestampSkew;
-
- public AddTimestampsDoFn(SerializableFunction<T, Instant> fn, Duration allowedTimestampSkew) {
- this.fn = fn;
- this.allowedTimestampSkew = allowedTimestampSkew;
- }
-
- @Override
- public void processElement(ProcessContext c) {
- Instant timestamp = fn.apply(c.element());
- checkNotNull(
- timestamp, "Timestamps for WithTimestamps cannot be null. Timestamp provided by %s.", fn);
- c.outputWithTimestamp(c.element(), timestamp);
- }
-
- @Override
- public Duration getAllowedTimestampSkew() {
- return allowedTimestampSkew;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Write.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Write.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Write.java
deleted file mode 100644
index 5cf655a..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Write.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-/**
- * A backwards-compatible {@code Write} class that simply inherits from the
- * {@link com.google.cloud.dataflow.sdk.io.Write} class that should be used instead.
- *
- * @deprecated use {@link com.google.cloud.dataflow.sdk.io.Write} from the
- * {@code com.google.cloud.dataflow.sdk.io} package instead.
- */
-@Deprecated
-public class Write extends com.google.cloud.dataflow.sdk.io.Write {
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/display/DisplayData.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/display/DisplayData.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/display/DisplayData.java
deleted file mode 100644
index dadc730..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/display/DisplayData.java
+++ /dev/null
@@ -1,530 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.display;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;
-
-import com.fasterxml.jackson.annotation.JsonGetter;
-import com.fasterxml.jackson.annotation.JsonInclude;
-
-import org.apache.avro.reflect.Nullable;
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-import org.joda.time.format.DateTimeFormatter;
-import org.joda.time.format.ISODateTimeFormat;
-
-import java.util.Collection;
-import java.util.Map;
-import java.util.Objects;
-import java.util.Set;
-
-/**
- * Static display metadata associated with a pipeline component. Display data is useful for
- * pipeline runner UIs and diagnostic dashboards to display details about
- * {@link PTransform PTransforms} that make up a pipeline.
- *
- * <p>Components specify their display data by implementing the {@link HasDisplayData}
- * interface.
- */
-public class DisplayData {
- private static final DisplayData EMPTY = new DisplayData(Maps.<Identifier, Item>newHashMap());
- private static final DateTimeFormatter TIMESTAMP_FORMATTER = ISODateTimeFormat.dateTime();
-
- private final ImmutableMap<Identifier, Item> entries;
-
- private DisplayData(Map<Identifier, Item> entries) {
- this.entries = ImmutableMap.copyOf(entries);
- }
-
- /**
- * Default empty {@link DisplayData} instance.
- */
- public static DisplayData none() {
- return EMPTY;
- }
-
- /**
- * Collect the {@link DisplayData} from a component. This will traverse all subcomponents
- * specified via {@link Builder#include} in the given component. Data in this component will be in
- * a namespace derived from the component.
- */
- public static DisplayData from(HasDisplayData component) {
- checkNotNull(component);
- return InternalBuilder.forRoot(component).build();
- }
-
- public Collection<Item> items() {
- return entries.values();
- }
-
- public Map<Identifier, Item> asMap() {
- return entries;
- }
-
- @Override
- public String toString() {
- StringBuilder builder = new StringBuilder();
- boolean isFirstLine = true;
- for (Map.Entry<Identifier, Item> entry : entries.entrySet()) {
- if (isFirstLine) {
- isFirstLine = false;
- } else {
- builder.append("\n");
- }
-
- builder.append(entry);
- }
-
- return builder.toString();
- }
-
- /**
- * Utility to build up display metadata from a component and its included
- * subcomponents.
- */
- public interface Builder {
- /**
- * Include display metadata from the specified subcomponent. For example, a {@link ParDo}
- * transform includes display metadata from the encapsulated {@link DoFn}.
- *
- * @return A builder instance to continue to build in a fluent-style.
- */
- Builder include(HasDisplayData subComponent);
-
- /**
- * Register the given string display metadata. The metadata item will be registered with type
- * {@link DisplayData.Type#STRING}, and is identified by the specified key and namespace from
- * the current transform or component.
- */
- ItemBuilder add(String key, String value);
-
- /**
- * Register the given numeric display metadata. The metadata item will be registered with type
- * {@link DisplayData.Type#INTEGER}, and is identified by the specified key and namespace from
- * the current transform or component.
- */
- ItemBuilder add(String key, long value);
-
- /**
- * Register the given floating point display metadata. The metadata item will be registered with
- * type {@link DisplayData.Type#FLOAT}, and is identified by the specified key and namespace
- * from the current transform or component.
- */
- ItemBuilder add(String key, double value);
-
- /**
- * Register the given timestamp display metadata. The metadata item will be registered with type
- * {@link DisplayData.Type#TIMESTAMP}, and is identified by the specified key and namespace from
- * the current transform or component.
- */
- ItemBuilder add(String key, Instant value);
-
- /**
- * Register the given duration display metadata. The metadata item will be registered with type
- * {@link DisplayData.Type#DURATION}, and is identified by the specified key and namespace from
- * the current transform or component.
- */
- ItemBuilder add(String key, Duration value);
-
- /**
- * Register the given class display metadata. The metadata item will be registered with type
- * {@link DisplayData.Type#JAVA_CLASS}, and is identified by the specified key and namespace
- * from the current transform or component.
- */
- ItemBuilder add(String key, Class<?> value);
- }
-
- /**
- * Utility to append optional fields to display metadata, or register additional display metadata
- * items.
- */
- public interface ItemBuilder extends Builder {
- /**
- * Add a human-readable label to describe the most-recently added metadata field.
- * A label is optional; if unspecified, UIs should display the metadata key to identify the
- * display item.
- *
- * <p>Specifying a null value will clear the label if it was previously defined.
- */
- ItemBuilder withLabel(@Nullable String label);
-
- /**
- * Add a link URL to the most-recently added display metadata. A link URL is optional and
- * can be provided to point the reader to additional details about the metadata.
- *
- * <p>Specifying a null value will clear the URL if it was previously defined.
- */
- ItemBuilder withLinkUrl(@Nullable String url);
- }
-
- /**
- * A display metadata item. DisplayData items are registered via {@link Builder#add} within
- * {@link HasDisplayData#populateDisplayData} implementations. Each metadata item is uniquely
- * identified by the specified key and namespace generated from the registering component's
- * class name.
- */
- public static class Item {
- private final String key;
- private final String ns;
- private final Type type;
- private final String value;
- private final String shortValue;
- private final String label;
- private final String url;
-
- private static <T> Item create(String namespace, String key, Type type, T value) {
- FormattedItemValue formatted = type.format(value);
- return new Item(
- namespace, key, type, formatted.getLongValue(), formatted.getShortValue(), null, null);
- }
-
- private Item(
- String namespace,
- String key,
- Type type,
- String value,
- String shortValue,
- String url,
- String label) {
- this.ns = namespace;
- this.key = key;
- this.type = type;
- this.value = value;
- this.shortValue = shortValue;
- this.url = url;
- this.label = label;
- }
-
- @JsonGetter("namespace")
- public String getNamespace() {
- return ns;
- }
-
- @JsonGetter("key")
- public String getKey() {
- return key;
- }
-
- /**
- * Retrieve the {@link DisplayData.Type} of display metadata. All metadata conforms to a
- * predefined set of allowed types.
- */
- @JsonGetter("type")
- public Type getType() {
- return type;
- }
-
- /**
- * Retrieve the value of the metadata item.
- */
- @JsonGetter("value")
- public String getValue() {
- return value;
- }
-
- /**
- * Return the optional short value for an item. Types may provide a short value to be displayed
- * instead of, or in addition to, the full {@link Item#value}.
- *
- * <p>Some display data types will not provide a short value, in which case the return value
- * will be null.
- */
- @JsonGetter("shortValue")
- @JsonInclude(JsonInclude.Include.NON_NULL)
- @Nullable
- public String getShortValue() {
- return shortValue;
- }
-
- /**
- * Retrieve the optional label for an item. The label is a human-readable description of what
- * the metadata represents. UIs may choose to display the label instead of the item key.
- *
- * <p>If no label was specified, this will return {@code null}.
- */
- @JsonGetter("label")
- @JsonInclude(JsonInclude.Include.NON_NULL)
- @Nullable
- public String getLabel() {
- return label;
- }
-
- /**
- * Retrieve the optional link URL for an item. The URL points to an address where the reader
- * can find additional context for the display metadata.
- *
- * <p>If no URL was specified, this will return {@code null}.
- */
- @JsonGetter("linkUrl")
- @JsonInclude(JsonInclude.Include.NON_NULL)
- @Nullable
- public String getLinkUrl() {
- return url;
- }
-
- @Override
- public String toString() {
- return getValue();
- }
-
- private Item withLabel(String label) {
- return new Item(this.ns, this.key, this.type, this.value, this.shortValue, this.url, label);
- }
-
- private Item withUrl(String url) {
- return new Item(this.ns, this.key, this.type, this.value, this.shortValue, url, this.label);
- }
- }
-
- /**
- * Unique identifier for a display metadata item within a component.
- * Identifiers are composed of the key they are registered with and a namespace generated from
- * the class of the component which registered the item.
- *
- * <p>Display metadata registered with the same key by components of different classes will
- * have different namespaces, so both items are represented in the composed {@link DisplayData}.
- * Registering a second item under an identifier that already exists (the same key within the
- * same namespace) is rejected with an {@link IllegalArgumentException}.
- */
- public static class Identifier {
- private final String ns;
- private final String key;
-
- static Identifier of(Class<?> namespace, String key) {
- return new Identifier(namespace.getName(), key);
- }
-
- private Identifier(String ns, String key) {
- this.ns = ns;
- this.key = key;
- }
-
- public String getNamespace() {
- return ns;
- }
-
- public String getKey() {
- return key;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj instanceof Identifier) {
- Identifier that = (Identifier) obj;
- return Objects.equals(this.ns, that.ns)
- && Objects.equals(this.key, that.key);
- }
-
- return false;
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(ns, key);
- }
-
- @Override
- public String toString() {
- return String.format("%s:%s", ns, key);
- }
- }
-
- /**
- * Display metadata type.
- */
- enum Type {
- STRING {
- @Override
- FormattedItemValue format(Object value) {
- return new FormattedItemValue((String) value);
- }
- },
- INTEGER {
- @Override
- FormattedItemValue format(Object value) {
- return new FormattedItemValue(Long.toString((long) value));
- }
- },
- FLOAT {
- @Override
- FormattedItemValue format(Object value) {
- return new FormattedItemValue(Double.toString((Double) value));
- }
- },
- TIMESTAMP() {
- @Override
- FormattedItemValue format(Object value) {
- return new FormattedItemValue((TIMESTAMP_FORMATTER.print((Instant) value)));
- }
- },
- DURATION {
- @Override
- FormattedItemValue format(Object value) {
- return new FormattedItemValue(Long.toString(((Duration) value).getMillis()));
- }
- },
- JAVA_CLASS {
- @Override
- FormattedItemValue format(Object value) {
- Class<?> clazz = (Class<?>) value;
- return new FormattedItemValue(clazz.getName(), clazz.getSimpleName());
- }
- };
-
- /**
- * Format the display metadata value into a long string representation, and optionally
- * a shorter representation for display.
- *
- * <p>Internal-only. Value objects can be safely cast to the expected Java type.
- */
- abstract FormattedItemValue format(Object value);
- }
-
- private static class FormattedItemValue {
- private final String shortValue;
- private final String longValue;
-
- private FormattedItemValue(String longValue) {
- this(longValue, null);
- }
-
- private FormattedItemValue(String longValue, String shortValue) {
- this.longValue = longValue;
- this.shortValue = shortValue;
- }
-
- private String getLongValue () {
- return this.longValue;
- }
-
- private String getShortValue() {
- return this.shortValue;
- }
- }
-
- private static class InternalBuilder implements ItemBuilder {
- private final Map<Identifier, Item> entries;
- private final Set<Object> visited;
-
- private Class<?> latestNs;
- private Item latestItem;
- private Identifier latestIdentifier;
-
- private InternalBuilder() {
- this.entries = Maps.newHashMap();
- this.visited = Sets.newIdentityHashSet();
- }
-
- private static InternalBuilder forRoot(HasDisplayData instance) {
- InternalBuilder builder = new InternalBuilder();
- builder.include(instance);
- return builder;
- }
-
- @Override
- public Builder include(HasDisplayData subComponent) {
- checkNotNull(subComponent);
- boolean newComponent = visited.add(subComponent);
- if (newComponent) {
- Class prevNs = this.latestNs;
- this.latestNs = subComponent.getClass();
- subComponent.populateDisplayData(this);
- this.latestNs = prevNs;
- }
-
- return this;
- }
-
- @Override
- public ItemBuilder add(String key, String value) {
- checkNotNull(value);
- return addItem(key, Type.STRING, value);
- }
-
- @Override
- public ItemBuilder add(String key, long value) {
- return addItem(key, Type.INTEGER, value);
- }
-
- @Override
- public ItemBuilder add(String key, double value) {
- return addItem(key, Type.FLOAT, value);
- }
-
- @Override
- public ItemBuilder add(String key, Instant value) {
- checkNotNull(value);
- return addItem(key, Type.TIMESTAMP, value);
- }
-
- @Override
- public ItemBuilder add(String key, Duration value) {
- checkNotNull(value);
- return addItem(key, Type.DURATION, value);
- }
-
- @Override
- public ItemBuilder add(String key, Class<?> value) {
- checkNotNull(value);
- return addItem(key, Type.JAVA_CLASS, value);
- }
-
- private <T> ItemBuilder addItem(String key, Type type, T value) {
- checkNotNull(key);
- checkArgument(!key.isEmpty());
-
- Identifier id = Identifier.of(latestNs, key);
- if (entries.containsKey(id)) {
- throw new IllegalArgumentException("DisplayData key already exists. All display data "
- + "for a component must be registered with a unique key.\nKey: " + id);
- }
- Item item = Item.create(id.getNamespace(), key, type, value);
- entries.put(id, item);
-
- latestItem = item;
- latestIdentifier = id;
-
- return this;
- }
-
- @Override
- public ItemBuilder withLabel(String label) {
- latestItem = latestItem.withLabel(label);
- entries.put(latestIdentifier, latestItem);
- return this;
- }
-
- @Override
- public ItemBuilder withLinkUrl(String url) {
- latestItem = latestItem.withUrl(url);
- entries.put(latestIdentifier, latestItem);
- return this;
- }
-
- private DisplayData build() {
- return new DisplayData(this.entries);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/display/HasDisplayData.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/display/HasDisplayData.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/display/HasDisplayData.java
deleted file mode 100644
index b2eca3d..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/display/HasDisplayData.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.display;
-
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-
-/**
- * Interface for {@link PTransform PTransforms} and components used within
- * {@link PTransform PTransforms} to specify display metadata to be used within UIs and diagnostic
- * tools.
- *
- * <p>Display metadata is optional and may be collected during pipeline construction. It should
- * only be used for informational purposes. Tools and components should not assume that display data
- * will always be collected, or that collected display data will always be displayed.
- */
-public interface HasDisplayData {
- /**
- * Register display metadata for the given transform or component. Metadata can be registered
- * directly on the provided builder, as well as via included sub-components.
- *
- * <pre>
- * {@code
- * @Override
- * public void populateDisplayData(DisplayData.Builder builder) {
- * builder
- * .include(subComponent)
- * .add("minFilter", 42)
- * .add("topic", "projects/myproject/topics/mytopic")
- * .withLabel("Pub/Sub Topic")
- * .add("serviceInstance", "myservice.com/fizzbang")
- * .withLinkUrl("http://www.myservice.com/fizzbang");
- * }
- * }
- * </pre>
- *
- * @param builder The builder to populate with display metadata.
- */
- void populateDisplayData(DisplayData.Builder builder);
-}
[03/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PCollectionTuple.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PCollectionTuple.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PCollectionTuple.java
deleted file mode 100644
index 58550e4..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PCollectionTuple.java
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.values;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded;
-import com.google.common.collect.ImmutableMap;
-
-import java.util.Collection;
-import java.util.Collections;
-import java.util.LinkedHashMap;
-import java.util.Map;
-
-/**
- * A {@link PCollectionTuple} is an immutable tuple of
- * heterogeneously-typed {@link PCollection PCollections}, "keyed" by
- * {@link TupleTag TupleTags}. A {@link PCollectionTuple} can be used as the input or
- * output of a
- * {@link PTransform} taking
- * or producing multiple PCollection inputs or outputs that can be of
- * different types, for instance a
- * {@link ParDo} with side
- * outputs.
- *
- * <p>A {@link PCollectionTuple} can be created and accessed as follows:
- * <pre> {@code
- * PCollection<String> pc1 = ...;
- * PCollection<Integer> pc2 = ...;
- * PCollection<Iterable<String>> pc3 = ...;
- *
- * // Create TupleTags for each of the PCollections to put in the
- * // PCollectionTuple (the type of the TupleTag enables tracking the
- * // static type of each of the PCollections in the PCollectionTuple):
- * TupleTag<String> tag1 = new TupleTag<>();
- * TupleTag<Integer> tag2 = new TupleTag<>();
- * TupleTag<Iterable<String>> tag3 = new TupleTag<>();
- *
- * // Create a PCollectionTuple with three PCollections:
- * PCollectionTuple pcs =
- * PCollectionTuple.of(tag1, pc1)
- * .and(tag2, pc2)
- * .and(tag3, pc3);
- *
- * // Create an empty PCollectionTuple:
- * Pipeline p = ...;
- * PCollectionTuple pcs2 = PCollectionTuple.empty(p);
- *
- * // Get PCollections out of a PCollectionTuple, using the same tags
- * // that were used to put them in:
- * PCollection<Integer> pcX = pcs.get(tag2);
- * PCollection<String> pcY = pcs.get(tag1);
- * PCollection<Iterable<String>> pcZ = pcs.get(tag3);
- *
- * // Get a map of all PCollections in a PCollectionTuple:
- * Map<TupleTag<?>, PCollection<?>> allPcs = pcs.getAll();
- * } </pre>
- */
-public class PCollectionTuple implements PInput, POutput {
- /**
- * Returns an empty {@link PCollectionTuple} that is part of the given {@link Pipeline}.
- *
- * <p>A {@link PCollectionTuple} containing additional elements can be created by calling
- * {@link #and} on the result.
- */
- public static PCollectionTuple empty(Pipeline pipeline) {
- return new PCollectionTuple(pipeline);
- }
-
- /**
- * Returns a singleton {@link PCollectionTuple} containing the given
- * {@link PCollection} keyed by the given {@link TupleTag}.
- *
- * <p>A {@link PCollectionTuple} containing additional elements can be created by calling
- * {@link #and} on the result.
- */
- public static <T> PCollectionTuple of(TupleTag<T> tag, PCollection<T> pc) {
- return empty(pc.getPipeline()).and(tag, pc);
- }
-
- /**
- * Returns a new {@link PCollectionTuple} that has each {@link PCollection} and
- * {@link TupleTag} of this {@link PCollectionTuple} plus the given {@link PCollection}
- * associated with the given {@link TupleTag}.
- *
- * <p>The given {@link TupleTag} should not already be mapped to a
- * {@link PCollection} in this {@link PCollectionTuple}.
- *
- * <p>Each {@link PCollection} in the resulting {@link PCollectionTuple} must be
- * part of the same {@link Pipeline}.
- */
- public <T> PCollectionTuple and(TupleTag<T> tag, PCollection<T> pc) {
- if (pc.getPipeline() != pipeline) {
- throw new IllegalArgumentException(
- "PCollections come from different Pipelines");
- }
-
- return new PCollectionTuple(pipeline,
- new ImmutableMap.Builder<TupleTag<?>, PCollection<?>>()
- .putAll(pcollectionMap)
- .put(tag, pc)
- .build());
- }
-
- /**
- * Returns whether this {@link PCollectionTuple} contains a {@link PCollection} with
- * the given tag.
- */
- public <T> boolean has(TupleTag<T> tag) {
- return pcollectionMap.containsKey(tag);
- }
-
- /**
- * Returns the {@link PCollection} associated with the given {@link TupleTag}
- * in this {@link PCollectionTuple}. Throws {@link IllegalArgumentException} if there is no
- * such {@link PCollection}, i.e., {@code !has(tag)}.
- */
- public <T> PCollection<T> get(TupleTag<T> tag) {
- @SuppressWarnings("unchecked")
- PCollection<T> pcollection = (PCollection<T>) pcollectionMap.get(tag);
- if (pcollection == null) {
- throw new IllegalArgumentException(
- "TupleTag not found in this PCollectionTuple tuple");
- }
- return pcollection;
- }
-
- /**
- * Returns an immutable Map from {@link TupleTag} to corresponding
- * {@link PCollection}, for all the members of this {@link PCollectionTuple}.
- */
- public Map<TupleTag<?>, PCollection<?>> getAll() {
- return pcollectionMap;
- }
-
- /**
- * Like {@link #apply(String, PTransform)} but defaulting to the name
- * of the {@link PTransform}.
- *
- * @return the output of the applied {@link PTransform}
- */
- public <OutputT extends POutput> OutputT apply(
- PTransform<PCollectionTuple, OutputT> t) {
- return Pipeline.applyTransform(this, t);
- }
-
- /**
- * Applies the given {@link PTransform} to this input {@link PCollectionTuple},
- * using {@code name} to identify this specific application of the transform.
- * This name is used in various places, including the monitoring UI, logging,
- * and to stably identify this application node in the job graph.
- *
- * @return the output of the applied {@link PTransform}
- */
- public <OutputT extends POutput> OutputT apply(
- String name, PTransform<PCollectionTuple, OutputT> t) {
- return Pipeline.applyTransform(name, this, t);
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
- // Internal details below here.
-
- Pipeline pipeline;
- final Map<TupleTag<?>, PCollection<?>> pcollectionMap;
-
- PCollectionTuple(Pipeline pipeline) {
- this(pipeline, new LinkedHashMap<TupleTag<?>, PCollection<?>>());
- }
-
- PCollectionTuple(Pipeline pipeline,
- Map<TupleTag<?>, PCollection<?>> pcollectionMap) {
- this.pipeline = pipeline;
- this.pcollectionMap = Collections.unmodifiableMap(pcollectionMap);
- }
-
- /**
- * Returns a {@link PCollectionTuple} with each of the given tags mapping to a new
- * output {@link PCollection}.
- *
- * <p>For use by primitive transformations only.
- */
- public static PCollectionTuple ofPrimitiveOutputsInternal(
- Pipeline pipeline,
- TupleTagList outputTags,
- WindowingStrategy<?, ?> windowingStrategy,
- IsBounded isBounded) {
- Map<TupleTag<?>, PCollection<?>> pcollectionMap = new LinkedHashMap<>();
- for (TupleTag<?> outputTag : outputTags.tupleTags) {
- if (pcollectionMap.containsKey(outputTag)) {
- throw new IllegalArgumentException(
- "TupleTag already present in this tuple");
- }
-
- // In fact, `token` and `outputCollection` should have
- // types TypeDescriptor<T> and PCollection<T> for some
- // unknown T. It is safe to create `outputCollection`
- // with type PCollection<Object> because it has the same
- // erasure as the correct type. When a transform adds
- // elements to `outputCollection` they will be of type T.
- @SuppressWarnings("unchecked")
- TypeDescriptor<Object> token = (TypeDescriptor<Object>) outputTag.getTypeDescriptor();
- PCollection<Object> outputCollection = PCollection
- .createPrimitiveOutputInternal(pipeline, windowingStrategy, isBounded)
- .setTypeDescriptorInternal(token);
-
- pcollectionMap.put(outputTag, outputCollection);
- }
- return new PCollectionTuple(pipeline, pcollectionMap);
- }
-
- @Override
- public Pipeline getPipeline() {
- return pipeline;
- }
-
- @Override
- public Collection<? extends PValue> expand() {
- return pcollectionMap.values();
- }
-
- @Override
- public void recordAsOutput(AppliedPTransform<?, ?, ?> transform) {
- int i = 0;
- for (Map.Entry<TupleTag<?>, PCollection<?>> entry
- : pcollectionMap.entrySet()) {
- TupleTag<?> tag = entry.getKey();
- PCollection<?> pc = entry.getValue();
- pc.recordAsOutput(transform, tag.getOutName(i));
- i++;
- }
- }
-
- @Override
- public void finishSpecifying() {
- for (PCollection<?> pc : pcollectionMap.values()) {
- pc.finishSpecifying();
- }
- }
-
- @Override
- public void finishSpecifyingOutput() {
- for (PCollection<?> pc : pcollectionMap.values()) {
- pc.finishSpecifyingOutput();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PCollectionView.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PCollectionView.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PCollectionView.java
deleted file mode 100644
index 515e21b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PCollectionView.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.values;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.View;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-
-import java.io.Serializable;
-
-/**
- * A {@link PCollectionView PCollectionView&lt;T&gt;} is an immutable view of a {@link PCollection}
- * as a value of type {@code T} that can be accessed
- * as a side input to a {@link ParDo} transform.
- *
- * <p>A {@link PCollectionView} should always be the output of a
- * {@link com.google.cloud.dataflow.sdk.transforms.PTransform}. It is the joint responsibility of
- * this transform and each {@link com.google.cloud.dataflow.sdk.runners.PipelineRunner} to implement
- * the view in a runner-specific manner.
- *
- * <p>The most common case is using the {@link View} transforms to prepare a {@link PCollection}
- * for use as a side input to {@link ParDo}. See {@link View#asSingleton()},
- * {@link View#asIterable()}, and {@link View#asMap()} for more detail on specific views
- * available in the SDK.
- *
- * @param <T> the type of the value(s) accessible via this {@link PCollectionView}
- */
-public interface PCollectionView<T> extends PValue, Serializable {
- /**
- * A unique identifier, for internal use.
- */
- public TupleTag<Iterable<WindowedValue<?>>> getTagInternal();
-
- /**
- * For internal use only.
- */
- public T fromIterableInternal(Iterable<WindowedValue<?>> contents);
-
- /**
- * For internal use only.
- */
- public WindowingStrategy<?, ?> getWindowingStrategyInternal();
-
- /**
- * For internal use only.
- */
- public Coder<Iterable<WindowedValue<?>>> getCoderInternal();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PDone.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PDone.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PDone.java
deleted file mode 100644
index 39a0061..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PDone.java
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.values;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-
-import java.util.Collection;
-import java.util.Collections;
-
-/**
- * {@link PDone} is the output of a {@link PTransform} that has a trivial result,
- * such as a {@link Write}.
- */
-public class PDone extends POutputValueBase {
-
- /**
- * Creates a {@link PDone} in the given {@link Pipeline}.
- */
- public static PDone in(Pipeline pipeline) {
- return new PDone(pipeline);
- }
-
- @Override
- public Collection<? extends PValue> expand() {
- // A PDone contains no PValues.
- return Collections.emptyList();
- }
-
- private PDone(Pipeline pipeline) {
- super(pipeline);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PInput.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PInput.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PInput.java
deleted file mode 100644
index 89b097a..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PInput.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.values;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-
-import java.util.Collection;
-
-/**
- * The interface for things that might be input to a
- * {@link com.google.cloud.dataflow.sdk.transforms.PTransform}.
- */
-public interface PInput {
- /**
- * Returns the owning {@link Pipeline} of this {@link PInput}.
- */
- public Pipeline getPipeline();
-
- /**
- * Expands this {@link PInput} into a list of its component output
- * {@link PValue PValues}.
- *
- * <ul>
- * <li>A {@link PValue} expands to itself.</li>
- * <li>A tuple or list of {@link PValue PValues} (such as
- * {@link PCollectionTuple} or {@link PCollectionList})
- * expands to its component {@link PValue PValues}.</li>
- * </ul>
- *
- * <p>Not intended to be invoked directly by user code.
- */
- public Collection<? extends PValue> expand();
-
- /**
- * <p>After building, finalizes this {@code PInput} to make it ready for
- * being used as an input to a {@link com.google.cloud.dataflow.sdk.transforms.PTransform}.
- *
- * <p>Automatically invoked whenever {@code apply()} is invoked on
- * this {@code PInput}, so users do not normally call this explicitly.
- */
- public void finishSpecifying();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/POutput.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/POutput.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/POutput.java
deleted file mode 100644
index f99bc0b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/POutput.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.values;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-
-import java.util.Collection;
-
-/**
- * The interface for things that might be output from a {@link PTransform}.
- */
-public interface POutput {
-
- /**
- * Returns the owning {@link Pipeline} of this {@link POutput}.
- */
- public Pipeline getPipeline();
-
- /**
- * Expands this {@link POutput} into a list of its component output
- * {@link PValue PValues}.
- *
- * <ul>
- * <li>A {@link PValue} expands to itself.</li>
- * <li>A tuple or list of {@link PValue PValues} (such as
- * {@link PCollectionTuple} or {@link PCollectionList})
- * expands to its component {@link PValue PValues}.</li>
- * </ul>
- *
- * <p>Not intended to be invoked directly by user code.
- */
- public Collection<? extends PValue> expand();
-
- /**
- * Records that this {@code POutput} is an output of the given
- * {@code PTransform}.
- *
- * <p>For a compound {@code POutput}, it is advised to call
- * this method on each component {@code POutput}.
- *
- * <p>This is not intended to be invoked by user code, but
- * is automatically invoked as part of applying the
- * producing {@link PTransform}.
- */
- public void recordAsOutput(AppliedPTransform<?, ?, ?> transform);
-
- /**
- * As part of applying the producing {@link PTransform}, finalizes this
- * output to make it ready for being used as an input and for running.
- *
- * <p>This includes ensuring that all {@link PCollection PCollections}
- * have {@link Coder Coders} specified or defaulted.
- *
- * <p>Automatically invoked whenever this {@link POutput} is used
- * as a {@link PInput} to another {@link PTransform}, or if never
- * used as a {@link PInput}, when {@link Pipeline#run}
- * is called, so users do not normally call this explicitly.
- */
- public void finishSpecifyingOutput();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/POutputValueBase.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/POutputValueBase.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/POutputValueBase.java
deleted file mode 100644
index 69e04c3..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/POutputValueBase.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.values;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-
-/**
- * A {@link POutputValueBase} is the abstract base class of
- * {@code PTransform} outputs.
- *
- * <p>A {@link POutput} that adds tracking of its producing
- * {@link AppliedPTransform}.
- *
- * <p>For internal use.
- */
-public abstract class POutputValueBase implements POutput {
-
- private final Pipeline pipeline;
-
- protected POutputValueBase(Pipeline pipeline) {
- this.pipeline = pipeline;
- }
-
- /**
- * No-arg constructor for Java serialization only.
- * The resulting {@link POutputValueBase} is unlikely to be
- * valid.
- */
- protected POutputValueBase() {
- pipeline = null;
- }
-
- @Override
- public Pipeline getPipeline() {
- return pipeline;
- }
-
- /**
- * Returns the {@link AppliedPTransform} that this {@link POutputValueBase}
- * is an output of.
- *
- * <p>For internal use only.
- */
- public AppliedPTransform<?, ?, ?> getProducingTransformInternal() {
- return producingTransform;
- }
-
- /**
- * Records that this {@link POutputValueBase} is an output of the
- * given {@link AppliedPTransform}.
- *
- * <p>To be invoked only by {@link POutput#recordAsOutput}
- * implementations. Not to be invoked directly by user code.
- */
- @Override
- public void recordAsOutput(AppliedPTransform<?, ?, ?> transform) {
- if (producingTransform != null) {
- // Already used this POutput as a PTransform output. This can
- // happen if the POutput is an output of a transform within a
- // composite transform, and is also the result of the composite.
- // We want to record the "immediate" atomic transform producing
- // this output, and ignore all later composite transforms that
- // also produce this output.
- //
- // Pipeline.applyInternal() uses !hasProducingTransform() to
- // avoid calling this operation redundantly, but
- // hasProducingTransform() doesn't apply to POutputValueBases
- // that aren't PValues or composites of PValues, e.g., PDone.
- return;
- }
- producingTransform = transform;
- }
-
- /**
- * Default behavior for {@link #finishSpecifyingOutput()} is
- * to do nothing. Override if your {@link PValue} requires
- * finalization.
- */
- @Override
- public void finishSpecifyingOutput() { }
-
- /**
- * The {@link PTransform} that produces this {@link POutputValueBase}.
- */
- private AppliedPTransform<?, ?, ?> producingTransform;
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PValue.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PValue.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PValue.java
deleted file mode 100644
index eb95a23..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PValue.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.values;
-
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-
-/**
- * The interface for values that can be input to and output from {@link PTransform PTransforms}.
- */
-public interface PValue extends POutput, PInput {
-
- /**
- * Returns the name of this {@link PValue}.
- */
- public String getName();
-
- /**
- * Returns the {@link AppliedPTransform} that this {@link PValue} is an output of.
- *
- * <p>For internal use only.
- */
- public AppliedPTransform<?, ?, ?> getProducingTransformInternal();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PValueBase.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PValueBase.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PValueBase.java
deleted file mode 100644
index 7e57204..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PValueBase.java
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.values;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.util.StringUtils;
-
-import java.util.Collection;
-import java.util.Collections;
-
-/**
- * A {@link PValueBase} is an abstract base class that provides
- * sensible default implementations for methods of {@link PValue}.
- * In particular, this includes functionality for getting/setting:
- *
- * <ul>
- * <li> The {@link Pipeline} that the {@link PValue} is part of.</li>
- * <li> Whether the {@link PValue} has been finalized (as an input
- * or an output), after which its properties can no longer be changed.</li>
- * </ul>
- *
- * <p>For internal use.
- */
-public abstract class PValueBase extends POutputValueBase implements PValue {
- /**
- * Returns the name of this {@link PValueBase}.
- *
- * <p>By default, the name of a {@link PValueBase} is based on the
- * name of the {@link PTransform} that produces it. It can be
- * specified explicitly by calling {@link #setName}.
- *
- * @throws IllegalStateException if the name hasn't been set yet
- */
- @Override
- public String getName() {
- if (name == null) {
- throw new IllegalStateException("name not set");
- }
- return name;
- }
-
- /**
- * Sets the name of this {@link PValueBase}. Returns {@code this}.
- *
- * @throws IllegalStateException if this {@link PValueBase} has
- * already been finalized and its name may no longer be changed.
- */
- public PValueBase setName(String name) {
- if (finishedSpecifying) {
- throw new IllegalStateException(
- "cannot change the name of " + this + " once it's been used");
- }
- this.name = name;
- return this;
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- protected PValueBase(Pipeline pipeline) {
- super(pipeline);
- }
-
- /**
- * No-arg constructor for Java serialization only.
- * The resulting {@link PValueBase} is unlikely to be
- * valid.
- */
- protected PValueBase() {
- super();
- }
-
- /**
- * The name of this {@link PValueBase}, or null if not yet set.
- */
- private String name;
-
- /**
- * Whether this {@link PValueBase} has been finalized, and its core
- * properties, e.g., name, can no longer be changed.
- */
- private boolean finishedSpecifying = false;
-
- @Override
- public void recordAsOutput(AppliedPTransform<?, ?, ?> transform) {
- recordAsOutput(transform, "out");
- }
-
- /**
- * Records that this {@link PValueBase} is an output with the
- * given name of the given {@link AppliedPTransform}. If no name has been
- * set yet, it defaults to {@code transform.getFullName() + "." + outName}.
- *
- * <p>To be invoked only by {@link POutput#recordAsOutput}
- * implementations. Not to be invoked directly by user code.
- */
- protected void recordAsOutput(AppliedPTransform<?, ?, ?> transform,
- String outName) {
- super.recordAsOutput(transform);
- if (name == null) {
- name = transform.getFullName() + "." + outName;
- }
- }
-
- /**
- * Returns whether this {@link PValueBase} has been finalized, and
- * its core properties, e.g., name, can no longer be changed.
- *
- * <p>For internal use only.
- */
- public boolean isFinishedSpecifyingInternal() {
- return finishedSpecifying;
- }
-
- @Override
- public Collection<? extends PValue> expand() {
- return Collections.singletonList(this);
- }
-
- @Override
- public void finishSpecifying() {
- finishSpecifyingOutput();
- finishedSpecifying = true;
- }
-
- @Override
- public String toString() {
- return (name == null ? "<unnamed>" : getName())
- + " [" + getKindString() + "]";
- }
-
- /**
- * Returns a {@link String} capturing the kind of this
- * {@link PValueBase}.
- *
- * <p>By default, uses the base name of the current class as its kind string.
- */
- protected String getKindString() {
- return StringUtils.approximateSimpleName(getClass());
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TimestampedValue.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TimestampedValue.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TimestampedValue.java
deleted file mode 100644
index 1085d44..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TimestampedValue.java
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.values;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.InstantCoder;
-import com.google.cloud.dataflow.sdk.coders.StandardCoder;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import org.joda.time.Instant;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Objects;
-
-/**
- * An immutable pair of a value and a timestamp.
- *
- * <p>The timestamp of a value determines many properties, such as its assignment to
- * windows and whether the value is late (with respect to the watermark of a {@link PCollection}).
- *
- * @param <V> the type of the value
- */
-public class TimestampedValue<V> {
-
- /**
- * Returns a new {@code TimestampedValue} with the given value and timestamp.
- */
- public static <V> TimestampedValue<V> of(V value, Instant timestamp) {
- return new TimestampedValue<>(value, timestamp);
- }
-
- public V getValue() {
- return value;
- }
-
- public Instant getTimestamp() {
- return timestamp;
- }
-
- @Override
- public boolean equals(Object other) {
- if (!(other instanceof TimestampedValue)) {
- return false;
- }
- TimestampedValue<?> that = (TimestampedValue<?>) other;
- return Objects.equals(value, that.value) && Objects.equals(timestamp, that.timestamp);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(value, timestamp);
- }
-
- @Override
- public String toString() {
- return "TimestampedValue(" + value + ", " + timestamp + ")";
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A {@link Coder} for {@link TimestampedValue}.
- */
- public static class TimestampedValueCoder<T>
- extends StandardCoder<TimestampedValue<T>> {
-
- private final Coder<T> valueCoder;
-
- public static <T> TimestampedValueCoder<T> of(Coder<T> valueCoder) {
- return new TimestampedValueCoder<>(valueCoder);
- }
-
- @JsonCreator
- public static TimestampedValueCoder<?> of(
- @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
- List<Object> components) {
- checkArgument(components.size() == 1,
- "Expecting 1 component, got " + components.size());
- return of((Coder<?>) components.get(0));
- }
-
- @SuppressWarnings("unchecked")
- TimestampedValueCoder(Coder<T> valueCoder) {
- this.valueCoder = checkNotNull(valueCoder);
- }
-
- @Override
- public void encode(TimestampedValue<T> windowedElem,
- OutputStream outStream,
- Context context)
- throws IOException {
- valueCoder.encode(windowedElem.getValue(), outStream, context.nested());
- InstantCoder.of().encode(
- windowedElem.getTimestamp(), outStream, context);
- }
-
- @Override
- public TimestampedValue<T> decode(InputStream inStream, Context context)
- throws IOException {
- T value = valueCoder.decode(inStream, context.nested());
- Instant timestamp = InstantCoder.of().decode(inStream, context);
- return TimestampedValue.of(value, timestamp);
- }
-
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- verifyDeterministic(
- "TimestampedValueCoder requires a deterministic valueCoder",
- valueCoder);
- }
-
- @Override
- public List<? extends Coder<?>> getCoderArguments() {
- return Arrays.<Coder<?>>asList(valueCoder);
- }
-
- public static <T> List<Object> getInstanceComponents(TimestampedValue<T> exampleValue) {
- return Arrays.<Object>asList(exampleValue.getValue());
- }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private final V value;
- private final Instant timestamp;
-
- protected TimestampedValue(V value, Instant timestamp) {
- this.value = value;
- this.timestamp = timestamp;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TupleTag.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TupleTag.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TupleTag.java
deleted file mode 100644
index 7494921..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TupleTag.java
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.values;
-
-import static com.google.cloud.dataflow.sdk.util.Structs.addBoolean;
-import static com.google.cloud.dataflow.sdk.util.Structs.addString;
-
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.common.collect.HashMultiset;
-import com.google.common.collect.Multiset;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.io.Serializable;
-import java.util.Random;
-
-/**
- * A {@link TupleTag} is a typed tag to use as the key of a
- * heterogeneously typed tuple, like {@link PCollectionTuple}.
- * Its generic type parameter allows tracking
- * the static type of things stored in tuples.
- *
- * <p>To aid in assigning default {@link Coder Coders} for results of
- * side outputs of {@link ParDo}, an output
- * {@link TupleTag} should be instantiated with an extra {@code {}} so
- * it is an instance of an anonymous subclass without generic type
- * parameters. Input {@link TupleTag TupleTags} require no such extra
- * instantiation (although it doesn't hurt). For example:
- *
- * <pre> {@code
- * TupleTag<SomeType> inputTag = new TupleTag<>();
- * TupleTag<SomeOtherType> outputTag = new TupleTag<SomeOtherType>(){};
- * } </pre>
- *
- * @param <V> the type of the elements or values of the tagged thing,
- * e.g., a {@code PCollection<V>}.
- */
-public class TupleTag<V> implements Serializable {
- /**
- * Constructs a new {@code TupleTag}, with a fresh unique id.
- *
- * <p>This is the normal way {@code TupleTag}s are constructed.
- */
- public TupleTag() {
- this(genId(), true);
- }
-
- /**
- * Constructs a new {@code TupleTag} with the given id.
- *
- * <p>It is up to the user to ensure that two {@code TupleTag}s
- * with the same id actually mean the same tag and carry the same
- * generic type parameter. Violating this invariant can lead to
- * hard-to-diagnose runtime type errors. Consequently, this
- * operation should be used very sparingly, such as when the
- * producer and consumer of {@code TupleTag}s are written in
- * separate modules and can only coordinate via ids rather than
- * shared {@code TupleTag} instances. Most of the time,
- * {@link #TupleTag()} should be preferred.
- */
- public TupleTag(String id) {
- this(id, false);
- }
-
- /**
- * Returns the id of this {@code TupleTag}.
- *
- * <p>Two {@code TupleTag}s with the same id are considered equal.
- *
- * <p>{@code TupleTag}s are not ordered, i.e., the class does not implement
- * Comparable interface. TupleTags implement equals and hashCode, making them
- * suitable for use as keys in HashMap and HashSet.
- */
- public String getId() {
- return id;
- }
-
- /**
- * If this {@code TupleTag} is tagging output {@code outIndex} of
- * a {@code PTransform}, returns the name that should be used by
- * default for the output.
- */
- public String getOutName(int outIndex) {
- if (generated) {
- return "out" + outIndex;
- } else {
- return id;
- }
- }
-
- /**
- * Returns a {@code TypeDescriptor} capturing what is known statically
- * about the type of this {@code TupleTag} instance's most-derived
- * class.
- *
- * <p>This is useful for a {@code TupleTag} constructed as an
- * instance of an anonymous subclass with a trailing {@code {}},
- * e.g., {@code new TupleTag<SomeType>(){}}.
- */
- public TypeDescriptor<V> getTypeDescriptor() {
- return new TypeDescriptor<V>(getClass()) {};
- }
-
- /////////////////////////////////////////////////////////////////////////////
- // Internal details below here.
-
- static final Random RANDOM = new Random(0);
- private static final Multiset<String> staticInits = HashMultiset.create();
-
- final String id;
- final boolean generated;
-
- /** Generates and returns a fresh unique id for a TupleTag's id. */
- static synchronized String genId() {
- // It is a common pattern to store tags that are shared between the main
- // program and workers in static variables, but such references are not
- // serialized as part of the *Fns state. Fortunately, most such tags
- // are constructed in static class initializers, e.g.
- //
- // static final TupleTag<T> MY_TAG = new TupleTag<>();
- //
- // and class initialization order is well defined by the JVM spec, so in
- // this case we can assign deterministic ids.
- StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace();
- for (StackTraceElement frame : stackTrace) {
- if (frame.getMethodName().equals("<clinit>")) {
- int counter = staticInits.add(frame.getClassName(), 1);
- return frame.getClassName() + "#" + counter;
- }
- }
- // Otherwise, assume it'll be serialized and choose a random value to reduce
- // the chance of collision.
- String nonce = Long.toHexString(RANDOM.nextLong());
- // [Thread.getStackTrace, TupleTag.genId, TupleTag.<init>, caller, ...]
- String caller = stackTrace.length >= 4
- ? stackTrace[3].getClassName() + "." + stackTrace[3].getMethodName()
- + ":" + stackTrace[3].getLineNumber()
- : "unknown";
- return caller + "#" + nonce;
- }
-
- @JsonCreator
- @SuppressWarnings("unused")
- private static TupleTag<?> fromJson(
- @JsonProperty(PropertyNames.VALUE) String id,
- @JsonProperty(PropertyNames.IS_GENERATED) boolean generated) {
- return new TupleTag<>(id, generated);
- }
-
- private TupleTag(String id, boolean generated) {
- this.id = id;
- this.generated = generated;
- }
-
- public CloudObject asCloudObject() {
- CloudObject result = CloudObject.forClass(getClass());
- addString(result, PropertyNames.VALUE, id);
- addBoolean(result, PropertyNames.IS_GENERATED, generated);
- return result;
- }
-
- @Override
- public boolean equals(Object that) {
- if (that instanceof TupleTag) {
- return this.id.equals(((TupleTag<?>) that).id);
- } else {
- return false;
- }
- }
-
- @Override
- public int hashCode() {
- return id.hashCode();
- }
-
- @Override
- public String toString() {
- return "Tag<" + id + ">";
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TupleTagList.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TupleTagList.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TupleTagList.java
deleted file mode 100644
index f019fc2..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TupleTagList.java
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.values;
-
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.common.collect.ImmutableList;
-
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-/**
- * A {@link TupleTagList} is an immutable list of heterogeneously
- * typed {@link TupleTag TupleTags}. A {@link TupleTagList} is used, for instance, to
- * specify the tags of the side outputs of a
- * {@link ParDo}.
- *
- * <p>A {@link TupleTagList} can be created and accessed as follows:
- * <pre> {@code
- * TupleTag<String> tag1 = ...;
- * TupleTag<Integer> tag2 = ...;
- * TupleTag<Iterable<String>> tag3 = ...;
- *
- * // Create a TupleTagList with three TupleTags:
- * TupleTagList tags = TupleTagList.of(tag1).and(tag2).and(tag3);
- *
- * // Create an empty TupleTagList; longer lists can then be
- * // built up by calling and() on the result:
- * TupleTagList tags2 = TupleTagList.empty();
- *
- * // Get TupleTags out of a TupleTagList, by index (origin 0):
- * TupleTag<?> tagX = tags.get(1);
- * TupleTag<?> tagY = tags.get(0);
- * TupleTag<?> tagZ = tags.get(2);
- *
- * // Get a list of all TupleTags in a TupleTagList:
- * List<TupleTag<?>> allTags = tags.getAll();
- * } </pre>
- */
-public class TupleTagList implements Serializable {
- /**
- * Returns an empty {@link TupleTagList}.
- *
- * <p>Longer {@link TupleTagList TupleTagLists} can be created by calling
- * {@link #and} on the result.
- */
- public static TupleTagList empty() {
- return new TupleTagList();
- }
-
- /**
- * Returns a singleton {@link TupleTagList} containing the given {@link TupleTag}.
- *
- * <p>Longer {@link TupleTagList TupleTagLists} can be created by calling
- * {@link #and} on the result.
- */
- public static TupleTagList of(TupleTag<?> tag) {
- return empty().and(tag);
- }
-
- /**
- * Returns a {@link TupleTagList} containing the given {@link TupleTag TupleTags}, in order.
- *
- * <p>Longer {@link TupleTagList TupleTagLists} can be created by calling
- * {@link #and} on the result.
- */
- public static TupleTagList of(List<TupleTag<?>> tags) {
- return empty().and(tags);
- }
-
- /**
- * Returns a new {@link TupleTagList} that has all the {@link TupleTag TupleTags} of
- * this {@link TupleTagList} plus the given {@link TupleTag} appended to the end.
- */
- public TupleTagList and(TupleTag<?> tag) {
- return new TupleTagList(
- new ImmutableList.Builder<TupleTag<?>>()
- .addAll(tupleTags)
- .add(tag)
- .build());
- }
-
- /**
- * Returns a new {@link TupleTagList} that has all the {@link TupleTag TupleTags} of
- * this {@link TupleTagList} plus the given {@link TupleTag TupleTags} appended to the end,
- * in order.
- */
- public TupleTagList and(List<TupleTag<?>> tags) {
- return new TupleTagList(
- new ImmutableList.Builder<TupleTag<?>>()
- .addAll(tupleTags)
- .addAll(tags)
- .build());
- }
-
- /**
- * Returns the number of TupleTags in this TupleTagList.
- */
- public int size() {
- return tupleTags.size();
- }
-
- /**
- * Returns the {@link TupleTag} at the given index (origin zero).
- *
- * @throws IndexOutOfBoundsException if the index is out of the range
- * {@code [0..size()-1]}.
- */
- public TupleTag<?> get(int index) {
- return tupleTags.get(index);
- }
-
- /**
- * Returns an immutable List of all the {@link TupleTag TupleTags} in this {@link TupleTagList}.
- */
- public List<TupleTag<?>> getAll() {
- return tupleTags;
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
- // Internal details below here.
-
- final List<TupleTag<?>> tupleTags;
-
- TupleTagList() {
- this(new ArrayList<TupleTag<?>>());
- }
-
- TupleTagList(List<TupleTag<?>> tupleTags) {
- this.tupleTags = Collections.unmodifiableList(tupleTags);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TypeDescriptor.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TypeDescriptor.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TypeDescriptor.java
deleted file mode 100644
index 559d67c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TypeDescriptor.java
+++ /dev/null
@@ -1,351 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.values;
-
-import com.google.common.collect.Lists;
-import com.google.common.reflect.Invokable;
-import com.google.common.reflect.Parameter;
-import com.google.common.reflect.TypeToken;
-
-import java.io.Serializable;
-import java.lang.reflect.Field;
-import java.lang.reflect.Method;
-import java.lang.reflect.ParameterizedType;
-import java.lang.reflect.Type;
-import java.lang.reflect.TypeVariable;
-import java.util.List;
-
-import javax.annotation.Nullable;
-
-/**
- * A description of a Java type, including actual generic parameters where possible.
- *
- * <p>To prevent losing actual type arguments due to erasure, create an anonymous subclass
- * with concrete types:
- * <pre>
- * {@code
- * TypeDescriptor<List<String>> descriptor = new TypeDescriptor<List<String>>() {};
- * }
- * </pre>
- *
- * <p>If the above were not an anonymous subclass, the type {@code List<String>}
- * would be erased and unavailable at run time.
- *
- * @param <T> the type represented by this {@link TypeDescriptor}
- */
-public abstract class TypeDescriptor<T> implements Serializable {
-
- // This class is just a wrapper for TypeToken
- private final TypeToken<T> token;
-
- /**
- * Creates a {@link TypeDescriptor} wrapping the provided token.
- * This constructor is private so Guava types do not leak.
- */
- private TypeDescriptor(TypeToken<T> token) {
- this.token = token;
- }
-
- /**
- * Creates a {@link TypeDescriptor} representing
- * the type parameter {@code T}. To use this constructor
- * properly, the type parameter must be a concrete type, for example
- * {@code new TypeDescriptor<List<String>>(){}}.
- */
- protected TypeDescriptor() {
- token = new TypeToken<T>(getClass()) {};
- }
-
- /**
- * Creates a {@link TypeDescriptor} representing the type parameter {@code T}, which should
- * resolve to a concrete type in the context of the class of {@code instance}.
- *
- * <p>Unlike {@link TypeDescriptor#TypeDescriptor(Class)}, this will also use the contexts of the
- * enclosing instances while attempting to resolve the type. This means that the types of any
- * classes instantiated in the concrete instance should be resolvable.
- */
- protected TypeDescriptor(Object instance) {
- TypeToken<?> unresolvedToken = new TypeToken<T>(getClass()) {};
-
- // While we haven't fully resolved the parameters, refine it using the captured
- // enclosing instance of the object.
- unresolvedToken = TypeToken.of(instance.getClass()).resolveType(unresolvedToken.getType());
-
- if (hasUnresolvedParameters(unresolvedToken.getType())) {
- for (Field field : instance.getClass().getDeclaredFields()) {
- Object fieldInstance = getEnclosingInstance(field, instance);
- if (fieldInstance != null) {
- unresolvedToken =
- TypeToken.of(fieldInstance.getClass()).resolveType(unresolvedToken.getType());
- if (!hasUnresolvedParameters(unresolvedToken.getType())) {
- break;
- }
- }
- }
- }
-
- // Once we've either fully resolved the parameters or exhausted enclosing instances, we have
- // the best approximation to the token we can get.
- @SuppressWarnings("unchecked")
- TypeToken<T> typedToken = (TypeToken<T>) unresolvedToken;
- token = typedToken;
- }
-
- private boolean hasUnresolvedParameters(Type type) {
- if (type instanceof TypeVariable) {
- return true;
- } else if (type instanceof ParameterizedType) {
- ParameterizedType param = (ParameterizedType) type;
- for (Type arg : param.getActualTypeArguments()) {
- if (hasUnresolvedParameters(arg)) {
- return true;
- }
- }
- }
- return false;
- }
-
- /**
- * Returns the enclosing instance if the field is synthetic and it is able to access it, or
- * {@literal null} if not.
- */
- @Nullable
- private Object getEnclosingInstance(Field field, Object instance) {
- if (!field.isSynthetic()) {
- return null;
- }
-
- boolean accessible = field.isAccessible();
- try {
- field.setAccessible(true);
- return field.get(instance);
- } catch (IllegalArgumentException | IllegalAccessException e) {
- // If we fail to get the enclosing instance field, do nothing. In the worst case, we won't
- // refine the type based on information in this enclosing class -- that is consistent with
- // previous behavior and is still a correct answer that can be fixed by returning the correct
- // type descriptor.
- return null;
- } finally {
- field.setAccessible(accessible);
- }
- }
-
- /**
- * Creates a {@link TypeDescriptor} representing the type parameter
- * {@code T}, which should resolve to a concrete type in the context
- * of the class {@code clazz}.
- */
- @SuppressWarnings("unchecked")
- protected TypeDescriptor(Class<?> clazz) {
- TypeToken<T> unresolvedToken = new TypeToken<T>(getClass()) {};
- token = (TypeToken<T>) TypeToken.of(clazz).resolveType(unresolvedToken.getType());
- }
-
- /**
- * Returns a {@link TypeDescriptor} representing the given type.
- */
- public static <T> TypeDescriptor<T> of(Class<T> type) {
- return new SimpleTypeDescriptor<>(TypeToken.<T>of(type));
- }
-
- /**
- * Returns a {@link TypeDescriptor} representing the given type.
- */
- @SuppressWarnings("unchecked")
- public static TypeDescriptor<?> of(Type type) {
- return new SimpleTypeDescriptor<>((TypeToken<Object>) TypeToken.of(type));
- }
-
- /**
- * Returns the {@link Type} represented by this {@link TypeDescriptor}.
- */
- public Type getType() {
- return token.getType();
- }
-
- /**
- * Returns the {@link Class} underlying the {@link Type} represented by
- * this {@link TypeDescriptor}.
- */
- public Class<? super T> getRawType() {
- return token.getRawType();
- }
-
- /**
- * Returns the component type if this type is an array type,
- * otherwise returns {@code null}.
- */
- public TypeDescriptor<?> getComponentType() {
- return new SimpleTypeDescriptor<>(token.getComponentType());
- }
-
- /**
- * Returns the generic form of a supertype.
- */
- public final TypeDescriptor<? super T> getSupertype(Class<? super T> superclass) {
- return new SimpleTypeDescriptor<>(token.getSupertype(superclass));
- }
-
- /**
- * Returns true if this type is known to be an array type.
- */
- public final boolean isArray() {
- return token.isArray();
- }
-
- /**
- * Returns a {@link TypeVariable} for the named type parameter. Throws
- * {@link IllegalArgumentException} if a type variable by the requested type parameter is not
- * found.
- *
- * <p>For example, {@code new TypeDescriptor<List>(){}.getTypeParameter("T")} returns a
- * {@code TypeVariable<? super List>} representing the formal type parameter {@code T}.
- *
- * <p>Do not mistake the type parameters (formal type argument list) with the actual
- * type arguments. For example, if a class {@code Foo} extends {@code List<String>}, it
- * does not make sense to ask for a type parameter, because {@code Foo} does not have any.
- */
- public final TypeVariable<Class<? super T>> getTypeParameter(String paramName) {
- // The wildcard in Class<? super T> is captured, so getTypeParameters() cannot be typed as
- // TypeVariable<Class<? super T>>[]; iterate over TypeVariable<?> and apply an unchecked
- // cast to the matching parameter instead.
- Class<?> rawType = getRawType();
- for (TypeVariable<?> param : rawType.getTypeParameters()) {
- if (param.getName().equals(paramName)) {
- @SuppressWarnings("unchecked")
- TypeVariable<Class<? super T>> typedParam = (TypeVariable<Class<? super T>>) param;
- return typedParam;
- }
- }
- throw new IllegalArgumentException(
- "No type parameter named " + paramName + " found on " + getRawType());
- }
-
- /**
- * Returns true if this type is assignable from the given type.
- */
- public final boolean isSupertypeOf(TypeDescriptor<?> source) {
- return token.isSupertypeOf(source.token);
- }
-
- /**
- * Return true if this type is a subtype of the given type.
- */
- public final boolean isSubtypeOf(TypeDescriptor<?> parent) {
- return token.isSubtypeOf(parent.token);
- }
-
- /**
- * Returns a list of argument types for the given method, which must
- * be a part of the class.
- */
- public List<TypeDescriptor<?>> getArgumentTypes(Method method) {
- Invokable<?, ?> typedMethod = token.method(method);
-
- List<TypeDescriptor<?>> argTypes = Lists.newArrayList();
- for (Parameter parameter : typedMethod.getParameters()) {
- argTypes.add(new SimpleTypeDescriptor<>(parameter.getType()));
- }
- return argTypes;
- }
-
- /**
- * Returns a {@link TypeDescriptor} representing the given
- * type, with type variables resolved according to the specialization
- * in this type.
- *
- * <p>For example, consider the following class:
- * <pre>
- * {@code
- * class MyList implements List<String> { ... }
- * }
- * </pre>
- *
- * <p>The {@link TypeDescriptor} returned by
- * <pre>
- * {@code
- * TypeDescriptor.of(MyList.class)
- * .resolveType(MyList.class.getMethod("get", int.class).getGenericReturnType())
- * }
- * </pre>
- * will represent the type {@code String}.
- */
- public TypeDescriptor<?> resolveType(Type type) {
- return new SimpleTypeDescriptor<>(token.resolveType(type));
- }
-
- /**
- * Returns a set of {@link TypeDescriptor}s, one for each
- * interface implemented by this class.
- */
- @SuppressWarnings("rawtypes")
- public Iterable<TypeDescriptor> getInterfaces() {
- List<TypeDescriptor> interfaces = Lists.newArrayList();
- for (TypeToken<?> interfaceToken : token.getTypes().interfaces()) {
- interfaces.add(new SimpleTypeDescriptor<>(interfaceToken));
- }
- return interfaces;
- }
-
- /**
- * Returns a set of {@link TypeDescriptor}s, one for each
- * superclass (including this class).
- */
- @SuppressWarnings("rawtypes")
- public Iterable<TypeDescriptor> getClasses() {
- List<TypeDescriptor> classes = Lists.newArrayList();
- for (TypeToken<?> classToken : token.getTypes().classes()) {
- classes.add(new SimpleTypeDescriptor<>(classToken));
- }
- return classes;
- }
-
- @Override
- public String toString() {
- return token.toString();
- }
-
- /**
- * Two type descriptors are equal if and only if they
- * represent the same type.
- */
- @Override
- public boolean equals(Object other) {
- if (!(other instanceof TypeDescriptor)) {
- return false;
- } else {
- @SuppressWarnings("unchecked")
- TypeDescriptor<?> descriptor = (TypeDescriptor<?>) other;
- return token.equals(descriptor.token);
- }
- }
-
- @Override
- public int hashCode() {
- return token.hashCode();
- }
-
- /**
- * A non-abstract {@link TypeDescriptor} for construction directly from an existing
- * {@link TypeToken}.
- */
- private static final class SimpleTypeDescriptor<T> extends TypeDescriptor<T> {
- SimpleTypeDescriptor(TypeToken<T> typeToken) {
- super(typeToken);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TypedPValue.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TypedPValue.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TypedPValue.java
deleted file mode 100644
index 29fd639..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/TypedPValue.java
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.values;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException.ReasonCode;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-
-/**
- * A {@link TypedPValue TypedPValue<T>} is the abstract base class of things that
- * store some number of values of type {@code T}.
- *
- * <p>Because we know the type {@code T}, this is the layer of the inheritance hierarchy where
- * we store a coder for objects of type {@code T}.
- *
- * @param <T> the type of the values stored in this {@link TypedPValue}
- */
-public abstract class TypedPValue<T> extends PValueBase implements PValue {
-
- /**
- * Returns the {@link Coder} used by this {@link TypedPValue} to encode and decode
- * the values stored in it.
- *
- * @throws IllegalStateException if the {@link Coder} hasn't been set, and
- * couldn't be inferred.
- */
- public Coder<T> getCoder() {
- if (coder == null) {
- coder = inferCoderOrFail();
- }
- return coder;
- }
-
- /**
- * Sets the {@link Coder} used by this {@link TypedPValue} to encode and decode the
- * values stored in it. Returns {@code this}.
- *
- * @throws IllegalStateException if this {@link TypedPValue} has already
- * been finalized and is no longer settable, e.g., by having
- * {@code apply()} called on it
- */
- public TypedPValue<T> setCoder(Coder<T> coder) {
- if (isFinishedSpecifyingInternal()) {
- throw new IllegalStateException(
- "cannot change the Coder of " + this + " once it's been used");
- }
- if (coder == null) {
- throw new IllegalArgumentException(
- "Cannot setCoder(null)");
- }
- this.coder = coder;
- return this;
- }
-
- /**
- * After building, finalizes this {@link PValue} to make it ready for
- * running. Automatically invoked whenever the {@link PValue} is "used"
- * (e.g., when apply() is called on it) and when the Pipeline is
- * run (useful if this is a {@link PValue} with no consumers).
- */
- @Override
- public void finishSpecifying() {
- if (isFinishedSpecifyingInternal()) {
- return;
- }
- super.finishSpecifying();
- // Ensure that this TypedPValue has a coder by inferring one if none has been set; if
- // inference fails, this will throw an exception.
- getCoder();
- }
-
- /////////////////////////////////////////////////////////////////////////////
- // Internal details below here.
-
- /**
- * The {@link Coder} used by this {@link TypedPValue} to encode and decode the
- * values stored in it, or null if not specified nor inferred yet.
- */
- private Coder<T> coder;
-
- protected TypedPValue(Pipeline p) {
- super(p);
- }
-
- private TypeDescriptor<T> typeDescriptor;
-
- /**
- * Returns a {@link TypeDescriptor TypeDescriptor<T>} with some reflective information
- * about {@code T}, if possible. May return {@code null} if no information
- * is available. Subclasses may override this to enable better
- * {@code Coder} inference.
- */
- public TypeDescriptor<T> getTypeDescriptor() {
- return typeDescriptor;
- }
-
- /**
- * Sets the {@link TypeDescriptor TypeDescriptor<T>} associated with this class. Better
- * reflective type information will lead to better {@link Coder}
- * inference.
- */
- public TypedPValue<T> setTypeDescriptorInternal(TypeDescriptor<T> typeDescriptor) {
- this.typeDescriptor = typeDescriptor;
- return this;
- }
-
- /**
- * Returns the explicitly set {@link Coder} if there is one; otherwise returns the best
- * coder that can be inferred, first from the known {@link TypeDescriptor} via the
- * {@link CoderRegistry} and then from the producing {@link PTransform}. Throws
- * {@link IllegalStateException} if no coder can be inferred.
- */
- @SuppressWarnings({"unchecked", "rawtypes"})
- private Coder<T> inferCoderOrFail() {
- // First option for a coder: use the Coder set on this PValue.
- if (coder != null) {
- return coder;
- }
-
- AppliedPTransform<?, ?, ?> application = getProducingTransformInternal();
-
- // Second option for a coder: Look in the coder registry.
- CoderRegistry registry = getPipeline().getCoderRegistry();
- TypeDescriptor<T> token = getTypeDescriptor();
- CannotProvideCoderException inferFromTokenException = null;
- if (token != null) {
- try {
- return registry.getDefaultCoder(token);
- } catch (CannotProvideCoderException exc) {
- inferFromTokenException = exc;
- // Attempt to detect when the token came from a TupleTag used for a ParDo side output,
- // and provide a better error message if so. Unfortunately, this information is not
- // directly available from the TypeDescriptor, so infer based on the type of the PTransform
- // and the error message itself.
- if (application.getTransform() instanceof ParDo.BoundMulti
- && exc.getReason() == ReasonCode.TYPE_ERASURE) {
- inferFromTokenException = new CannotProvideCoderException(exc.getMessage()
- + " If this error occurs for a side output of the producing ParDo, verify that the "
- + "TupleTag for this output is constructed with proper type information (see "
- + "TupleTag Javadoc) or explicitly set the Coder to use if this is not possible.");
- }
- }
- }
-
- // Third option for a coder: use the default Coder from the producing PTransform.
- CannotProvideCoderException inputCoderException;
- try {
- return ((PTransform) application.getTransform()).getDefaultOutputCoder(
- application.getInput(), this);
- } catch (CannotProvideCoderException exc) {
- inputCoderException = exc;
- }
-
- // Build up the error message and list of causes.
- StringBuilder messageBuilder = new StringBuilder()
- .append("Unable to return a default Coder for ").append(this)
- .append(". Correct one of the following root causes:");
-
- // No exception here, but tell the user that .setCoder() has not been called.
- messageBuilder.append("\n No Coder has been manually specified; ")
- .append(" you may do so using .setCoder().");
-
- if (inferFromTokenException != null) {
- messageBuilder
- .append("\n Inferring a Coder from the CoderRegistry failed: ")
- .append(inferFromTokenException.getMessage());
- }
-
- if (inputCoderException != null) {
- messageBuilder
- .append("\n Using the default output Coder from the producing PTransform failed: ")
- .append(inputCoderException.getMessage());
- }
-
- // Build and throw the exception.
- throw new IllegalStateException(messageBuilder.toString());
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/package-info.java
deleted file mode 100644
index b8ca756..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/package-info.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/**
- * Defines {@link com.google.cloud.dataflow.sdk.values.PCollection} and other classes for
- * representing data in a {@link com.google.cloud.dataflow.sdk.Pipeline}.
- *
- * <p>In particular, see these collection abstractions:
- *
- * <ul>
- * <li>{@link com.google.cloud.dataflow.sdk.values.PCollection} - an immutable collection of
- * values of type {@code T} and the main representation for data in Dataflow.</li>
- * <li>{@link com.google.cloud.dataflow.sdk.values.PCollectionView} - an immutable view of a
- * {@link com.google.cloud.dataflow.sdk.values.PCollection} that can be accessed as a
- * side input of a {@link com.google.cloud.dataflow.sdk.transforms.ParDo}
- * {@link com.google.cloud.dataflow.sdk.transforms.PTransform}.</li>
- * <li>{@link com.google.cloud.dataflow.sdk.values.PCollectionTuple} - a heterogeneous tuple of
- * {@link com.google.cloud.dataflow.sdk.values.PCollection PCollections}
- * used in cases where a {@link com.google.cloud.dataflow.sdk.transforms.PTransform} takes
- * or returns multiple
- * {@link com.google.cloud.dataflow.sdk.values.PCollection PCollections}.</li>
- * <li>{@link com.google.cloud.dataflow.sdk.values.PCollectionList} - a homogeneous list of
- * {@link com.google.cloud.dataflow.sdk.values.PCollection PCollections} used, for example,
- * as input to {@link com.google.cloud.dataflow.sdk.transforms.Flatten}.</li>
- * </ul>
- *
- * <p>And these classes for individual values play particular roles in Dataflow:
- *
- * <ul>
- * <li>{@link com.google.cloud.dataflow.sdk.values.KV} - a key/value pair that is used by
- * keyed transforms, most notably {@link com.google.cloud.dataflow.sdk.transforms.GroupByKey}.
- * </li>
- * <li>{@link com.google.cloud.dataflow.sdk.values.TimestampedValue} - a timestamp/value pair
- * that is used for windowing and handling out-of-order data in streaming execution.</li>
- * </ul>
- *
- * <p>For further details, see the documentation for each class in this package.
- */
-package com.google.cloud.dataflow.sdk.values;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/proto/README.md
----------------------------------------------------------------------
diff --git a/sdk/src/main/proto/README.md b/sdk/src/main/proto/README.md
deleted file mode 100644
index fa4e925..0000000
--- a/sdk/src/main/proto/README.md
+++ /dev/null
@@ -1,27 +0,0 @@
-## Protocol Buffers in Google Cloud Dataflow
-
-This directory contains the Protocol Buffer messages used in Google Cloud
-Dataflow.
-
-They aren't, however, used during the Maven build process, and are included here
-for completeness only. Instead, the following artifact on Maven Central contains
-the binary version of the generated code from these Protocol Buffers:
-
- <dependency>
- <groupId>com.google.cloud.dataflow</groupId>
- <artifactId>google-cloud-dataflow-java-proto-library-all</artifactId>
- <version>LATEST</version>
- </dependency>
-
-Please follow this process for testing changes:
-
-* Make changes to the Protocol Buffer messages in this directory.
-* Use `protoc` to generate the new code, and compile it into a new Java library.
-* Install that Java library into your local Maven repository.
-* Update SDK's `pom.xml` to pick up the newly installed library, instead of
-downloading it from Maven Central.
-
-Once the changes are ready for submission, please separate them into two
-commits. The first commit should update the Protocol Buffer messages only. After
-that, we need to update the generated artifact on Maven Central. Finally,
-changes that make use of the Protocol Buffer changes may be committed.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/proto/proto2_coder_test_messages.proto
----------------------------------------------------------------------
diff --git a/sdk/src/main/proto/proto2_coder_test_messages.proto b/sdk/src/main/proto/proto2_coder_test_messages.proto
deleted file mode 100644
index eb3c3df..0000000
--- a/sdk/src/main/proto/proto2_coder_test_messages.proto
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/*
- * Protocol Buffer messages used for testing Proto2Coder implementation.
- */
-
-syntax = "proto2";
-
-package proto2_coder_test_messages;
-
-option java_package = "com.google.cloud.dataflow.sdk.coders";
-
-message MessageA {
- optional string field1 = 1;
- repeated MessageB field2 = 2;
-}
-
-message MessageB {
- optional bool field1 = 1;
-}
-
-message MessageC {
- extensions 100 to 105;
-}
-
-extend MessageC {
- optional MessageA field1 = 101;
- optional MessageB field2 = 102;
-}
-
-message MessageWithMap {
- map<string, MessageA> field1 = 1;
-}
-
-message ReferencesMessageWithMap {
- repeated MessageWithMap field1 = 1;
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/resources/com/google/cloud/dataflow/sdk/sdk.properties
----------------------------------------------------------------------
diff --git a/sdk/src/main/resources/com/google/cloud/dataflow/sdk/sdk.properties b/sdk/src/main/resources/com/google/cloud/dataflow/sdk/sdk.properties
deleted file mode 100644
index 5b0a720..0000000
--- a/sdk/src/main/resources/com/google/cloud/dataflow/sdk/sdk.properties
+++ /dev/null
@@ -1,5 +0,0 @@
-# SDK source version.
-version=${pom.version}
-
-build.date=${timestamp}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/test/java/com/google/cloud/dataflow/sdk/DataflowMatchers.java
----------------------------------------------------------------------
diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/DataflowMatchers.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/DataflowMatchers.java
deleted file mode 100644
index ad21072..0000000
--- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/DataflowMatchers.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk;
-
-import com.google.protobuf.ByteString;
-
-import org.hamcrest.Description;
-import org.hamcrest.TypeSafeMatcher;
-
-import java.io.Serializable;
-
-/**
- * Hamcrest matchers that are useful when writing Dataflow tests.
- */
-public class DataflowMatchers {
-  /**
-   * Matcher for {@link ByteString} that prints both the expected and the actual value as
-   * UTF-8 text in its descriptions.
-   *
-   * <p>Matching itself is delegated to {@link ByteString#equals(Object)}. Instances are
-   * obtained through {@link #byteStringEq(ByteString)}; the constructor is private.
-   */
-  public static class ByteStringMatcher extends TypeSafeMatcher<ByteString>
-      implements Serializable {
-    // Assigned exactly once in the constructor, hence final.
-    private final ByteString expected;
-
-    private ByteStringMatcher(ByteString expected) {
-      this.expected = expected;
-    }
-
-    /**
-     * Creates a matcher that accepts a {@link ByteString} equal to {@code expected}.
-     *
-     * @param expected the value the examined {@link ByteString} is compared against
-     * @return a new matcher bound to {@code expected}
-     */
-    public static ByteStringMatcher byteStringEq(ByteString expected) {
-      return new ByteStringMatcher(expected);
-    }
-
-    /** Appends {@code ByteString(<expected as UTF-8>)} to the description. */
-    @Override
-    public void describeTo(Description description) {
-      description
-          .appendText("ByteString(")
-          .appendText(expected.toStringUtf8())
-          .appendText(")");
-    }
-
-    /** Appends {@code was ByteString(<actual as UTF-8>)} to the mismatch description. */
-    @Override
-    public void describeMismatchSafely(ByteString actual, Description description) {
-      description
-          .appendText("was ByteString(")
-          .appendText(actual.toStringUtf8())
-          .appendText(")");
-    }
-
-    /** Returns whether {@code actual} equals the expected value. */
-    @Override
-    protected boolean matchesSafely(ByteString actual) {
-      return actual.equals(expected);
-    }
-  }
-}
[56/67] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamplesTest.java
----------------------------------------------------------------------
diff --git a/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamplesTest.java b/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamplesTest.java
new file mode 100644
index 0000000..3deff2a
--- /dev/null
+++ b/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamplesTest.java
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.cookbook;
+
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.cloud.dataflow.examples.cookbook.MaxPerKeyExamples.ExtractTempFn;
+import com.google.cloud.dataflow.examples.cookbook.MaxPerKeyExamples.FormatMaxesFn;
+import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.common.collect.ImmutableList;
+
+import org.hamcrest.CoreMatchers;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.util.List;
+
+/** Exercises the element-wise functions of {@link MaxPerKeyExamples} in isolation. */
+@RunWith(JUnit4.class)
+public class MaxPerKeyExamplesTest {
+
+  // Input rows: two readings for month 6 and one for month 7.
+  private static final TableRow JUNE_21_ROW = new TableRow()
+      .set("month", "6").set("day", "21")
+      .set("year", "2014").set("mean_temp", "85.3")
+      .set("tornado", true);
+  private static final TableRow JULY_20_ROW = new TableRow()
+      .set("month", "7").set("day", "20")
+      .set("year", "2014").set("mean_temp", "75.4")
+      .set("tornado", false);
+  private static final TableRow JUNE_18_ROW = new TableRow()
+      .set("month", "6").set("day", "18")
+      .set("year", "2014").set("mean_temp", "45.3")
+      .set("tornado", true);
+  private static final List<TableRow> TEST_ROWS =
+      ImmutableList.of(JUNE_21_ROW, JULY_20_ROW, JUNE_18_ROW);
+
+  // (month, mean temperature) pairs corresponding to the rows above.
+  private static final KV<Integer, Double> JUNE_21_KV = KV.of(6, 85.3);
+  private static final KV<Integer, Double> JUNE_18_KV = KV.of(6, 45.3);
+  private static final KV<Integer, Double> JULY_20_KV = KV.of(7, 75.4);
+
+  private static final List<KV<Integer, Double>> TEST_KVS =
+      ImmutableList.of(JUNE_21_KV, JUNE_18_KV, JULY_20_KV);
+
+  // Output rows the formatting function must produce for two of the pairs above.
+  private static final TableRow JUNE_RESULT_ROW = new TableRow()
+      .set("month", 6)
+      .set("max_mean_temp", 85.3);
+  private static final TableRow JULY_RESULT_ROW = new TableRow()
+      .set("month", 7)
+      .set("max_mean_temp", 75.4);
+
+  /** Each input row must yield its (month, mean_temp) pair; ordering is not checked. */
+  @Test
+  public void testExtractTempFn() {
+    DoFnTester<TableRow, KV<Integer, Double>> tester = DoFnTester.of(new ExtractTempFn());
+    List<KV<Integer, Double>> extracted = tester.processBatch(TEST_ROWS);
+    for (KV<Integer, Double> wanted : TEST_KVS) {
+      Assert.assertThat(extracted, CoreMatchers.hasItem(wanted));
+    }
+  }
+
+  /** The formatted output must contain both expected result rows. */
+  @Test
+  public void testFormatMaxesFn() {
+    DoFnTester<KV<Integer, Double>, TableRow> tester = DoFnTester.of(new FormatMaxesFn());
+    List<TableRow> formatted = tester.processBatch(TEST_KVS);
+    for (TableRow wanted : ImmutableList.of(JUNE_RESULT_ROW, JULY_RESULT_ROW)) {
+      Assert.assertThat(formatted, CoreMatchers.hasItem(wanted));
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/TriggerExampleTest.java
----------------------------------------------------------------------
diff --git a/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/TriggerExampleTest.java b/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/TriggerExampleTest.java
new file mode 100644
index 0000000..209ea52
--- /dev/null
+++ b/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/TriggerExampleTest.java
@@ -0,0 +1,139 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.cookbook;
+
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.cloud.dataflow.examples.cookbook.TriggerExample.ExtractFlowInfo;
+import com.google.cloud.dataflow.examples.cookbook.TriggerExample.TotalFlow;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.transforms.Create;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
+import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.TimestampedValue;
+
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Unit Tests for {@link TriggerExample}.
+ * The results generated by triggers are by definition non-deterministic and hence hard to test.
+ * The unit test does not test all aspects of the example.
+ */
+@RunWith(JUnit4.class)
+public class TriggerExampleTest {
+
+  // Two raw CSV records fed directly to ExtractFlowInfo in testExtractTotalFlow.
+  private static final String[] INPUT =
+      {"01/01/2010 00:00:00,1108302,94,E,ML,36,100,29,0.0065,66,9,1,0.001,74.8,1,9,3,0.0028,71,1,9,"
+      + "12,0.0099,67.4,1,9,13,0.0121,99.0,1,,,,,0,,,,,0,,,,,0,,,,,0", "01/01/2010 00:00:00,"
+      + "1100333,5,N,FR,9,0,39,,,9,,,,0,,,,,0,,,,,0,,,,,0,,,,,0,,,,,0,,,,,0,,,,"};
+
+  // Records with explicit event timestamps, used by the windowed pipeline test.
+  private static final List<TimestampedValue<String>> TIME_STAMPED_INPUT = Arrays.asList(
+      TimestampedValue.of("01/01/2010 00:00:00,1108302,5,W,ML,36,100,30,0.0065,66,9,1,0.001,"
+          + "74.8,1,9,3,0.0028,71,1,9,12,0.0099,87.4,1,9,13,0.0121,99.0,1,,,,,0,,,,,0,,,,,0,,,"
+          + ",,0", new Instant(60000)),
+      TimestampedValue.of("01/01/2010 00:00:00,1108302,110,E,ML,36,100,40,0.0065,66,9,1,0.001,"
+          + "74.8,1,9,3,0.0028,71,1,9,12,0.0099,67.4,1,9,13,0.0121,99.0,1,,,,,0,,,,,0,,,,,0,,,"
+          + ",,0", new Instant(1)),
+      TimestampedValue.of("01/01/2010 00:00:00,1108302,110,E,ML,36,100,50,0.0065,66,9,1,"
+          + "0.001,74.8,1,9,3,0.0028,71,1,9,12,0.0099,97.4,1,9,13,0.0121,50.0,1,,,,,0,,,,,0"
+          + ",,,,,0,,,,,0", new Instant(1)));
+
+  // Expected result for freeway "5": the single record in the second one-minute window.
+  private static final TableRow OUT_ROW_1 = new TableRow()
+      .set("trigger_type", "default")
+      .set("freeway", "5").set("total_flow", 30)
+      .set("number_of_records", 1)
+      .set("isFirst", true).set("isLast", true)
+      .set("timing", "ON_TIME")
+      .set("window", "[1970-01-01T00:01:00.000Z..1970-01-01T00:02:00.000Z)");
+
+  // Expected result for freeway "110": the two records in the first one-minute window.
+  private static final TableRow OUT_ROW_2 = new TableRow()
+      .set("trigger_type", "default")
+      .set("freeway", "110").set("total_flow", 90)
+      .set("number_of_records", 2)
+      .set("isFirst", true).set("isLast", true)
+      .set("timing", "ON_TIME")
+      .set("window", "[1970-01-01T00:00:00.000Z..1970-01-01T00:01:00.000Z)");
+
+  /**
+   * Checks that {@code ExtractFlowInfo} emits exactly one (freeway, flow) pair for
+   * {@link #INPUT} and nothing for an empty line.
+   */
+  @Test
+  public void testExtractTotalFlow() {
+    DoFnTester<String, KV<String, Integer>> extractFlowInfo = DoFnTester
+        .of(new ExtractFlowInfo());
+
+    // JUnit's contract is assertEquals(expected, actual); keeping that order makes
+    // failure messages report the right value as "expected".
+    List<KV<String, Integer>> results = extractFlowInfo.processBatch(INPUT);
+    Assert.assertEquals(1, results.size());
+    Assert.assertEquals("94", results.get(0).getKey());
+    Assert.assertEquals(Integer.valueOf(29), results.get(0).getValue());
+
+    List<KV<String, Integer>> output = extractFlowInfo.processBatch("");
+    Assert.assertEquals(0, output.size());
+  }
+
+  /**
+   * Runs extraction, one-minute fixed windowing and {@code TotalFlow} end to end and checks
+   * the formatted rows against {@link #OUT_ROW_1} and {@link #OUT_ROW_2}.
+   */
+  @Test
+  @Category(RunnableOnService.class)
+  public void testTotalFlow() {
+    Pipeline pipeline = TestPipeline.create();
+    PCollection<KV<String, Integer>> flow = pipeline
+        .apply(Create.timestamped(TIME_STAMPED_INPUT))
+        .apply(ParDo.of(new ExtractFlowInfo()));
+
+    PCollection<TableRow> totalFlow = flow
+        .apply(Window.<KV<String, Integer>>into(FixedWindows.of(Duration.standardMinutes(1))))
+        .apply(new TotalFlow("default"));
+
+    PCollection<TableRow> results = totalFlow.apply(ParDo.of(new FormatResults()));
+
+    DataflowAssert.that(results).containsInAnyOrder(OUT_ROW_1, OUT_ROW_2);
+    pipeline.run();
+  }
+
+  /**
+   * Copies the eight fields compared by {@link #testTotalFlow()} into a fresh
+   * {@link TableRow}.
+   *
+   * <p>NOTE(review): presumably this drops additional fields emitted by {@code TotalFlow}
+   * that cannot be asserted on deterministically - confirm against {@link TriggerExample}.
+   */
+  static class FormatResults extends DoFn<TableRow, TableRow> {
+    @Override
+    public void processElement(ProcessContext c) throws Exception {
+      TableRow element = c.element();
+      TableRow row = new TableRow()
+          .set("trigger_type", element.get("trigger_type"))
+          .set("freeway", element.get("freeway"))
+          .set("total_flow", element.get("total_flow"))
+          .set("number_of_records", element.get("number_of_records"))
+          .set("isFirst", element.get("isFirst"))
+          .set("isLast", element.get("isLast"))
+          .set("timing", element.get("timing"))
+          .set("window", element.get("window"));
+      c.output(row);
+    }
+  }
+}
+
+
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/pom.xml
----------------------------------------------------------------------
diff --git a/examples/pom.xml b/examples/pom.xml
deleted file mode 100644
index 8b17dfe..0000000
--- a/examples/pom.xml
+++ /dev/null
@@ -1,394 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.beam</groupId>
- <artifactId>parent</artifactId>
- <version>0.1.0-incubating-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <artifactId>java-examples-all</artifactId>
- <name>Apache Beam :: Examples :: Java All</name>
- <description>Apache Beam SDK provides a simple, Java-based
- interface for processing virtually any size data. This
- artifact includes all Apache Beam Java SDK examples.</description>
-
- <packaging>jar</packaging>
-
- <profiles>
- <profile>
- <id>DataflowPipelineTests</id>
- <properties>
- <runIntegrationTestOnService>true</runIntegrationTestOnService>
- <testGroups>com.google.cloud.dataflow.sdk.testing.RunnableOnService</testGroups>
- <testParallelValue>both</testParallelValue>
- </properties>
- </profile>
- </profiles>
-
- <build>
- <plugins>
- <plugin>
- <artifactId>maven-compiler-plugin</artifactId>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-checkstyle-plugin</artifactId>
- <version>2.12</version>
- <dependencies>
- <dependency>
- <groupId>com.puppycrawl.tools</groupId>
- <artifactId>checkstyle</artifactId>
- <version>6.6</version>
- </dependency>
- </dependencies>
- <configuration>
- <configLocation>../checkstyle.xml</configLocation>
- <consoleOutput>true</consoleOutput>
- <failOnViolation>true</failOnViolation>
- <includeTestSourceDirectory>true</includeTestSourceDirectory>
- <includeResources>false</includeResources>
- </configuration>
- <executions>
- <execution>
- <goals>
- <goal>check</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
-
- <!-- Source plugin for generating source and test-source JARs. -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-source-plugin</artifactId>
- <version>2.4</version>
- <executions>
- <execution>
- <id>attach-sources</id>
- <phase>compile</phase>
- <goals>
- <goal>jar</goal>
- </goals>
- </execution>
- <execution>
- <id>attach-test-sources</id>
- <phase>test-compile</phase>
- <goals>
- <goal>test-jar</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-javadoc-plugin</artifactId>
- <configuration>
- <windowtitle>Apache Beam Examples</windowtitle>
- <doctitle>Apache Beam Examples</doctitle>
-
- <subpackages>com.google.cloud.dataflow.examples</subpackages>
- <additionalparam>-exclude com.google.cloud.dataflow.sdk.runners.worker:com.google.cloud.dataflow.sdk.runners.dataflow:com.google.cloud.dataflow.sdk.util ${dataflow.javadoc_opts}</additionalparam>
- <use>false</use>
- <quiet>true</quiet>
- <bottom><![CDATA[<br>]]></bottom>
-
- <offlineLinks>
- <!-- The Dataflow SDK docs -->
- <offlineLink>
- <url>https://cloud.google.com/dataflow/java-sdk/JavaDoc/</url>
- <location>${basedir}/../javadoc/dataflow-sdk-docs</location>
- </offlineLink>
- <!-- Other dependencies -->
- <offlineLink>
- <url>https://developers.google.com/api-client-library/java/google-api-java-client/reference/1.20.0/</url>
- <location>${basedir}/../javadoc/apiclient-docs</location>
- </offlineLink>
- <offlineLink>
- <url>http://avro.apache.org/docs/1.7.7/api/java/</url>
- <location>${basedir}/../javadoc/avro-docs</location>
- </offlineLink>
- <offlineLink>
- <url>https://developers.google.com/resources/api-libraries/documentation/bigquery/v2/java/latest/</url>
- <location>${basedir}/../javadoc/bq-docs</location>
- </offlineLink>
- <offlineLink>
- <url>https://cloud.google.com/datastore/docs/apis/javadoc/</url>
- <location>${basedir}/../javadoc/datastore-docs</location>
- </offlineLink>
- <offlineLink>
- <url>http://docs.guava-libraries.googlecode.com/git-history/release18/javadoc/</url>
- <location>${basedir}/../javadoc/guava-docs</location>
- </offlineLink>
- <offlineLink>
- <url>http://fasterxml.github.io/jackson-annotations/javadoc/2.7/</url>
- <location>${basedir}/../javadoc/jackson-annotations-docs</location>
- </offlineLink>
- <offlineLink>
- <url>http://fasterxml.github.io/jackson-databind/javadoc/2.7/</url>
- <location>${basedir}/../javadoc/jackson-databind-docs</location>
- </offlineLink>
- <offlineLink>
- <url>http://www.joda.org/joda-time/apidocs</url>
- <location>${basedir}/../javadoc/joda-docs</location>
- </offlineLink>
- <offlineLink>
- <url>https://developers.google.com/api-client-library/java/google-oauth-java-client/reference/1.20.0/</url>
- <location>${basedir}/../javadoc/oauth-docs</location>
- </offlineLink>
- </offlineLinks>
- </configuration>
- <executions>
- <execution>
- <goals>
- <goal>jar</goal>
- </goals>
- <phase>package</phase>
- </execution>
- </executions>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-shade-plugin</artifactId>
- <version>2.4.1</version>
- <executions>
- <execution>
- <phase>package</phase>
- <goals>
- <goal>shade</goal>
- </goals>
- <configuration>
- <finalName>${project.artifactId}-bundled-${project.version}</finalName>
- <artifactSet>
- <includes>
- <include>*:*</include>
- </includes>
- </artifactSet>
- <filters>
- <filter>
- <artifact>*:*</artifact>
- <excludes>
- <exclude>META-INF/*.SF</exclude>
- <exclude>META-INF/*.DSA</exclude>
- <exclude>META-INF/*.RSA</exclude>
- </excludes>
- </filter>
- </filters>
- </configuration>
- </execution>
- </executions>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- <executions>
- <execution>
- <id>default-jar</id>
- <goals>
- <goal>jar</goal>
- </goals>
- </execution>
- <execution>
- <id>default-test-jar</id>
- <goals>
- <goal>test-jar</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
-
- <!-- Coverage analysis for unit tests. -->
- <plugin>
- <groupId>org.jacoco</groupId>
- <artifactId>jacoco-maven-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
- <dependencies>
- <dependency>
- <groupId>org.apache.beam</groupId>
- <artifactId>java-sdk-all</artifactId>
- <version>${project.version}</version>
- </dependency>
-
- <dependency>
- <groupId>com.google.api-client</groupId>
- <artifactId>google-api-client</artifactId>
- <version>${google-clients.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.apis</groupId>
- <artifactId>google-api-services-dataflow</artifactId>
- <version>${dataflow.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.apis</groupId>
- <artifactId>google-api-services-bigquery</artifactId>
- <version>${bigquery.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.http-client</groupId>
- <artifactId>google-http-client</artifactId>
- <version>${google-clients.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>org.apache.avro</groupId>
- <artifactId>avro</artifactId>
- <version>${avro.version}</version>
- </dependency>
-
- <dependency>
- <groupId>com.google.apis</groupId>
- <artifactId>google-api-services-datastore-protobuf</artifactId>
- <version>${datastore.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.apis</groupId>
- <artifactId>google-api-services-pubsub</artifactId>
- <version>${pubsub.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.guava</groupId>
- <artifactId>guava</artifactId>
- <version>${guava.version}</version>
- </dependency>
-
- <dependency>
- <groupId>com.google.code.findbugs</groupId>
- <artifactId>jsr305</artifactId>
- <version>${jsr305.version}</version>
- </dependency>
-
- <dependency>
- <groupId>joda-time</groupId>
- <artifactId>joda-time</artifactId>
- <version>${joda.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- <version>${slf4j.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-jdk14</artifactId>
- <version>${slf4j.version}</version>
- <scope>runtime</scope>
- </dependency>
-
- <dependency>
- <groupId>javax.servlet</groupId>
- <artifactId>javax.servlet-api</artifactId>
- <version>3.1.0</version>
- </dependency>
-
- <!-- Hamcrest and JUnit are required dependencies of DataflowAssert,
- which is used in the main code of DebuggingWordCount example. -->
-
- <dependency>
- <groupId>org.hamcrest</groupId>
- <artifactId>hamcrest-all</artifactId>
- <version>${hamcrest.version}</version>
- </dependency>
-
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>${junit.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.mockito</groupId>
- <artifactId>mockito-all</artifactId>
- <version>1.10.19</version>
- <scope>test</scope>
- </dependency>
- </dependencies>
-</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/DebuggingWordCount.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/DebuggingWordCount.java b/examples/src/main/java/com/google/cloud/dataflow/examples/DebuggingWordCount.java
deleted file mode 100644
index 8823dbc..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/DebuggingWordCount.java
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples;
-
-import com.google.cloud.dataflow.examples.WordCount.WordCountOptions;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.Sum;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.Arrays;
-import java.util.List;
-import java.util.regex.Pattern;
-
-
-/**
- * An example that verifies word counts in Shakespeare and includes Dataflow best practices.
- *
- * <p>This class, {@link DebuggingWordCount}, is the third in a series of four successively more
- * detailed 'word count' examples. You may first want to take a look at {@link MinimalWordCount}
- * and {@link WordCount}. After you've looked at this example, then see the
- * {@link WindowedWordCount} pipeline, for introduction of additional concepts.
- *
- * <p>Basic concepts, also in the MinimalWordCount and WordCount examples:
- * Reading text files; counting a PCollection; executing a Pipeline both locally
- * and using the Dataflow service; defining DoFns.
- *
- * <p>New Concepts:
- * <pre>
- * 1. Logging to Cloud Logging
- * 2. Controlling Dataflow worker log levels
- * 3. Creating a custom aggregator
- * 4. Testing your Pipeline via DataflowAssert
- * </pre>
- *
- * <p>To execute this pipeline locally, specify general pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * }
- * </pre>
- *
- * <p>To execute this pipeline using the Dataflow service and the additional logging discussed
- * below, specify pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowPipelineRunner
- * --workerLogLevelOverrides={"com.google.cloud.dataflow.examples":"DEBUG"}
- * }
- * </pre>
- *
- * <p>Note that when you run via <code>mvn exec</code>, you may need to escape
- * the quotations as appropriate for your shell. For example, in <code>bash</code>:
- * <pre>
- * mvn compile exec:java ... \
- * -Dexec.args="... \
- * --workerLogLevelOverrides={\\\"com.google.cloud.dataflow.examples\\\":\\\"DEBUG\\\"}"
- * </pre>
- *
- * <p>Concept #2: Dataflow workers which execute user code are configured to log to Cloud
- * Logging by default at "INFO" log level and higher. One may override log levels for specific
- * logging namespaces by specifying:
- * <pre><code>
- * --workerLogLevelOverrides={"Name1":"Level1","Name2":"Level2",...}
- * </code></pre>
- * For example, by specifying:
- * <pre><code>
- * --workerLogLevelOverrides={"com.google.cloud.dataflow.examples":"DEBUG"}
- * </code></pre>
- * when executing this pipeline using the Dataflow service, Cloud Logging would contain only
- * "DEBUG" or higher level logs for the {@code com.google.cloud.dataflow.examples} package in
- * addition to the default "INFO" or higher level logs. In addition, the default Dataflow worker
- * logging configuration can be overridden by specifying
- * {@code --defaultWorkerLogLevel=<one of TRACE, DEBUG, INFO, WARN, ERROR>}. For example,
- * by specifying {@code --defaultWorkerLogLevel=DEBUG} when executing this pipeline with
- * the Dataflow service, Cloud Logging would contain all "DEBUG" or higher level logs. Note
- * that changing the default worker log level to TRACE or DEBUG will significantly increase
- * the amount of logs output.
- *
- * <p>The input file defaults to {@code gs://dataflow-samples/shakespeare/kinglear.txt} and can be
- * overridden with {@code --inputFile}.
- */
-public class DebuggingWordCount {
-  /**
-   * A DoFn that filters for a specific key based upon a regular expression.
-   *
-   * <p>Elements whose key fully matches the pattern are output unchanged; all others are
-   * dropped. Both outcomes are logged and counted in an aggregator.
-   */
-  public static class FilterTextFn extends DoFn<KV<String, Long>, KV<String, Long>> {
-    /**
-     * Concept #1: The logger below uses the fully qualified class name of FilterTextFn
-     * as the logger. All log statements emitted by this logger will be referenced by this name
-     * and will be visible in the Cloud Logging UI. Learn more at https://cloud.google.com/logging
-     * about the Cloud Logging UI.
-     */
-    private static final Logger LOG = LoggerFactory.getLogger(FilterTextFn.class);
-
-    private final Pattern filter;
-
-    /**
-     * @param pattern regular expression that a key must match in full to be kept
-     */
-    public FilterTextFn(String pattern) {
-      filter = Pattern.compile(pattern);
-    }
-
-    /**
-     * Concept #3: A custom aggregator can track values in your pipeline as it runs. Those
-     * values will be displayed in the Dataflow Monitoring UI when this pipeline is run using the
-     * Dataflow service. These aggregators below track the number of matched and unmatched words.
-     * Learn more at https://cloud.google.com/dataflow/pipelines/dataflow-monitoring-intf about
-     * the Dataflow Monitoring UI.
-     */
-    private final Aggregator<Long, Long> matchedWords =
-        createAggregator("matchedWords", new Sum.SumLongFn());
-    // NOTE(review): the aggregator name below is misspelled ("umatchedWords"). It is kept
-    // as-is because the name is externally visible; confirm nothing depends on it before
-    // renaming it to "unmatchedWords".
-    private final Aggregator<Long, Long> unmatchedWords =
-        createAggregator("umatchedWords", new Sum.SumLongFn());
-
-    @Override
-    public void processElement(ProcessContext c) {
-      if (filter.matcher(c.element().getKey()).matches()) {
-        // Log at the "DEBUG" level each element that we match. When executing this pipeline
-        // using the Dataflow service, these log lines will appear in the Cloud Logging UI
-        // only if the log level is set to "DEBUG" or lower.
-        // Parameterized form: the message is only assembled when DEBUG is enabled.
-        LOG.debug("Matched: {}", c.element().getKey());
-        matchedWords.addValue(1L);
-        c.output(c.element());
-      } else {
-        // Log at the "TRACE" level each element that is not matched. Different log levels
-        // can be used to control the verbosity of logging providing an effective mechanism
-        // to filter less important information.
-        LOG.trace("Did not match: {}", c.element().getKey());
-        unmatchedWords.addValue(1L);
-      }
-    }
-  }
-
-  /**
-   * Builds and runs the pipeline: read lines, count words, keep only the words
-   * "Flourish" and "stomach", and assert on their counts.
-   *
-   * @param args command-line pipeline options, parsed as {@code WordCountOptions}
-   */
-  public static void main(String[] args) {
-    WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
-        .as(WordCountOptions.class);
-    Pipeline p = Pipeline.create(options);
-
-    PCollection<KV<String, Long>> filteredWords =
-        p.apply(TextIO.Read.named("ReadLines").from(options.getInputFile()))
-            .apply(new WordCount.CountWords())
-            .apply(ParDo.of(new FilterTextFn("Flourish|stomach")));
-
-    /*
-     * Concept #4: DataflowAssert is a set of convenient PTransforms in the style of
-     * Hamcrest's collection matchers that can be used when writing Pipeline level tests
-     * to validate the contents of PCollections. DataflowAssert is best used in unit tests
-     * with small data sets but is demonstrated here as a teaching tool.
-     *
-     * Below we verify that the set of filtered words matches our expected counts. Note
-     * that DataflowAssert does not provide any output and that successful completion of the
-     * Pipeline implies that the expectations were met. Learn more at
-     * https://cloud.google.com/dataflow/pipelines/testing-your-pipeline on how to test
-     * your Pipeline and see DebuggingWordCountTest for an example unit test.
-     */
-    List<KV<String, Long>> expectedResults = Arrays.asList(
-        KV.of("Flourish", 3L),
-        KV.of("stomach", 1L));
-    DataflowAssert.that(filteredWords).containsInAnyOrder(expectedResults);
-
-    p.run();
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCount.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCount.java b/examples/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCount.java
deleted file mode 100644
index 4ed0520..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCount.java
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.runners.BlockingDataflowPipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.Count;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.MapElements;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.SimpleFunction;
-import com.google.cloud.dataflow.sdk.values.KV;
-
-
-/**
- * An example that counts words in Shakespeare.
- *
- * <p>This class, {@link MinimalWordCount}, is the first in a series of four successively more
- * detailed 'word count' examples. Here, for simplicity, we don't show any error-checking or
- * argument processing, and focus on construction of the pipeline, which chains together the
- * application of core transforms.
- *
- * <p>Next, see the {@link WordCount} pipeline, then the {@link DebuggingWordCount}, and finally
- * the {@link WindowedWordCount} pipeline, for more detailed examples that introduce additional
- * concepts.
- *
- * <p>Concepts:
- * <pre>
- * 1. Reading data from text files
- * 2. Specifying 'inline' transforms
- * 3. Counting a PCollection
- * 4. Writing data to Cloud Storage as text files
- * </pre>
- *
- * <p>To execute this pipeline, first edit the code to set your project ID, the staging
- * location, and the output location. The specified GCS bucket(s) must already exist.
- *
- * <p>Then, run the pipeline as described in the README. It will be deployed and run using the
- * Dataflow service. No args are required to run the pipeline. You can see the results in your
- * output bucket in the GCS browser.
- */
-public class MinimalWordCount {
-
- public static void main(String[] args) {
- // Create a DataflowPipelineOptions object. This object lets us set various execution
- // options for our pipeline, such as the associated Cloud Platform project and the location
- // in Google Cloud Storage to stage files.
- DataflowPipelineOptions options = PipelineOptionsFactory.create()
- .as(DataflowPipelineOptions.class);
- options.setRunner(BlockingDataflowPipelineRunner.class);
- // CHANGE 1/3: Your project ID is required in order to run your pipeline on the Google Cloud.
- options.setProject("SET_YOUR_PROJECT_ID_HERE");
- // CHANGE 2/3: Your Google Cloud Storage path is required for staging local files.
- options.setStagingLocation("gs://SET_YOUR_BUCKET_NAME_HERE/AND_STAGING_DIRECTORY");
-
- // Create the Pipeline object with the options we defined above.
- Pipeline p = Pipeline.create(options);
-
- // Apply the pipeline's transforms.
-
- // Concept #1: Apply a root transform to the pipeline; in this case, TextIO.Read to read a set
- // of input text files. TextIO.Read returns a PCollection where each element is one line from
- // the input text (a set of Shakespeare's texts).
- p.apply(TextIO.Read.from("gs://dataflow-samples/shakespeare/*"))
- // Concept #2: Apply a ParDo transform to our PCollection of text lines. This ParDo invokes a
- // DoFn (defined in-line) on each element that tokenizes the text line into individual words.
- // The ParDo returns a PCollection<String>, where each element is an individual word in
- // Shakespeare's collected texts.
- .apply(ParDo.named("ExtractWords").of(new DoFn<String, String>() {
- @Override
- public void processElement(ProcessContext c) {
- for (String word : c.element().split("[^a-zA-Z']+")) {
- if (!word.isEmpty()) {
- c.output(word);
- }
- }
- }
- }))
- // Concept #3: Apply the Count transform to our PCollection of individual words. The Count
- // transform returns a new PCollection of key/value pairs, where each key represents a unique
- // word in the text. The associated value is the occurrence count for that word.
- .apply(Count.<String>perElement())
- // Apply a MapElements transform that formats our PCollection of word counts into a printable
- // string, suitable for writing to an output file.
- .apply("FormatResults", MapElements.via(new SimpleFunction<KV<String, Long>, String>() {
- @Override
- public String apply(KV<String, Long> input) {
- return input.getKey() + ": " + input.getValue();
- }
- }))
- // Concept #4: Apply a write transform, TextIO.Write, at the end of the pipeline.
- // TextIO.Write writes the contents of a PCollection (in this case, our PCollection of
- // formatted strings) to a series of text files in Google Cloud Storage.
- // CHANGE 3/3: The Google Cloud Storage path is required for outputting the results to.
- .apply(TextIO.Write.to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX"));
-
- // Run the pipeline.
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/WindowedWordCount.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/WindowedWordCount.java b/examples/src/main/java/com/google/cloud/dataflow/examples/WindowedWordCount.java
deleted file mode 100644
index 2adac55..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/WindowedWordCount.java
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples;
-
-import com.google.api.services.bigquery.model.TableFieldSchema;
-import com.google.api.services.bigquery.model.TableReference;
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.cloud.dataflow.examples.common.DataflowExampleOptions;
-import com.google.cloud.dataflow.examples.common.DataflowExampleUtils;
-import com.google.cloud.dataflow.examples.common.ExampleBigQueryTableOptions;
-import com.google.cloud.dataflow.examples.common.ExamplePubsubTopicOptions;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.PipelineResult;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.io.PubsubIO;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-
-/**
- * An example that counts words in text, and can run over either unbounded or bounded input
- * collections.
- *
- * <p>This class, {@link WindowedWordCount}, is the last in a series of four successively more
- * detailed 'word count' examples. First take a look at {@link MinimalWordCount},
- * {@link WordCount}, and {@link DebuggingWordCount}.
- *
- * <p>Basic concepts, also in the MinimalWordCount, WordCount, and DebuggingWordCount examples:
- * Reading text files; counting a PCollection; writing to GCS; executing a Pipeline both locally
- * and using the Dataflow service; defining DoFns; creating a custom aggregator;
- * user-defined PTransforms; defining PipelineOptions.
- *
- * <p>New Concepts:
- * <pre>
- * 1. Unbounded and bounded pipeline input modes
- * 2. Adding timestamps to data
- * 3. PubSub topics as sources
- * 4. Windowing
- * 5. Re-using PTransforms over windowed PCollections
- * 6. Writing to BigQuery
- * </pre>
- *
- * <p>To execute this pipeline locally, specify general pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * }
- * </pre>
- *
- * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowPipelineRunner
- * }
- * </pre>
- *
- * <p>Optionally specify the input file path via:
- * {@code --inputFile=gs://INPUT_PATH},
- * which defaults to {@code gs://dataflow-samples/shakespeare/kinglear.txt}.
- *
- * <p>Specify an output BigQuery dataset and optionally, a table for the output. If you don't
- * specify the table, one will be created for you using the job name. If you don't specify the
- * dataset, a dataset called {@code dataflow-examples} must already exist in your project.
- * {@code --bigQueryDataset=YOUR-DATASET --bigQueryTable=YOUR-NEW-TABLE-NAME}.
- *
- * <p>Decide whether you want your pipeline to run with 'bounded' (such as files in GCS) or
- * 'unbounded' input (such as a PubSub topic). To run with unbounded input, set
- * {@code --unbounded=true}. Then, optionally specify the Google Cloud PubSub topic to read from
- * via {@code --pubsubTopic=projects/PROJECT_ID/topics/YOUR_TOPIC_NAME}. If the topic does not
- * exist, the pipeline will create one for you. It will delete this topic when it terminates.
- * The pipeline will automatically launch an auxiliary batch pipeline to populate the given PubSub
- * topic with the contents of the {@code --inputFile}, in order to make the example easy to run.
- * If you want to use an independently-populated PubSub topic, indicate this by setting
- * {@code --inputFile=""}. In that case, the auxiliary pipeline will not be started.
- *
- * <p>By default, the pipeline will do fixed windowing, on 1-minute windows. You can
- * change this interval by setting the {@code --windowSize} parameter, e.g. {@code --windowSize=10}
- * for 10-minute windows.
- */
-public class WindowedWordCount {
- private static final Logger LOG = LoggerFactory.getLogger(WindowedWordCount.class);
- static final int WINDOW_SIZE = 1; // Default window duration in minutes
-
- /**
- * Concept #2: A DoFn that sets the data element timestamp. This is a silly method, just for
- * this example, for the bounded data case.
- *
- * <p>Imagine that many ghosts of Shakespeare are all typing madly at the same time to recreate
- * his masterworks. Each line of the corpus will get a random associated timestamp somewhere in a
- * 2-hour period.
- */
- static class AddTimestampFn extends DoFn<String, String> {
- private static final long RAND_RANGE = 7200000; // 2 hours in ms
-
- @Override
- public void processElement(ProcessContext c) {
- // Generate a timestamp that falls somewhere in the past two hours.
- long randomTimestamp = System.currentTimeMillis()
- - (int) (Math.random() * RAND_RANGE);
- /**
- * Concept #2: Set the data element with that timestamp.
- */
- c.outputWithTimestamp(c.element(), new Instant(randomTimestamp));
- }
- }
-
- /** A DoFn that converts a Word and Count into a BigQuery table row. */
- static class FormatAsTableRowFn extends DoFn<KV<String, Long>, TableRow> {
- @Override
- public void processElement(ProcessContext c) {
- TableRow row = new TableRow()
- .set("word", c.element().getKey())
- .set("count", c.element().getValue())
- // include a field for the window timestamp
- .set("window_timestamp", c.timestamp().toString());
- c.output(row);
- }
- }
-
- /**
- * Helper method that defines the BigQuery schema used for the output.
- */
- private static TableSchema getSchema() {
- List<TableFieldSchema> fields = new ArrayList<>();
- fields.add(new TableFieldSchema().setName("word").setType("STRING"));
- fields.add(new TableFieldSchema().setName("count").setType("INTEGER"));
- fields.add(new TableFieldSchema().setName("window_timestamp").setType("TIMESTAMP"));
- TableSchema schema = new TableSchema().setFields(fields);
- return schema;
- }
-
- /**
- * Concept #6: We'll stream the results to a BigQuery table. The BigQuery output source is one
- * that supports both bounded and unbounded data. This is a helper method that creates a
- * TableReference from input options, to tell the pipeline where to write its BigQuery results.
- */
- private static TableReference getTableReference(Options options) {
- TableReference tableRef = new TableReference();
- tableRef.setProjectId(options.getProject());
- tableRef.setDatasetId(options.getBigQueryDataset());
- tableRef.setTableId(options.getBigQueryTable());
- return tableRef;
- }
-
- /**
- * Options supported by {@link WindowedWordCount}.
- *
- * <p>Inherits standard example configuration options, which allow specification of the BigQuery
- * table and the PubSub topic, as well as the {@link WordCount.WordCountOptions} support for
- * specification of the input file.
- */
- public static interface Options extends WordCount.WordCountOptions,
- DataflowExampleOptions, ExamplePubsubTopicOptions, ExampleBigQueryTableOptions {
- @Description("Fixed window duration, in minutes")
- @Default.Integer(WINDOW_SIZE)
- Integer getWindowSize();
- void setWindowSize(Integer value);
-
- @Description("Whether to run the pipeline with unbounded input")
- boolean isUnbounded();
- void setUnbounded(boolean value);
- }
-
- public static void main(String[] args) throws IOException {
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
- options.setBigQuerySchema(getSchema());
- // DataflowExampleUtils creates the necessary input sources to simplify execution of this
- // Pipeline.
- DataflowExampleUtils exampleDataflowUtils = new DataflowExampleUtils(options,
- options.isUnbounded());
-
- Pipeline pipeline = Pipeline.create(options);
-
- /**
- * Concept #1: the Dataflow SDK lets us run the same pipeline with either a bounded or
- * unbounded input source.
- */
- PCollection<String> input;
- if (options.isUnbounded()) {
- LOG.info("Reading from PubSub.");
- /**
- * Concept #3: Read from the PubSub topic. A topic will be created if it wasn't
- * specified as an argument. The data elements' timestamps will come from the pubsub
- * injection.
- */
- input = pipeline
- .apply(PubsubIO.Read.topic(options.getPubsubTopic()));
- } else {
- /** Else, this is a bounded pipeline. Read from the GCS file. */
- input = pipeline
- .apply(TextIO.Read.from(options.getInputFile()))
- // Concept #2: Add an element timestamp, using an artificial time just to show windowing.
- // See AddTimestampFn for more detail on this.
- .apply(ParDo.of(new AddTimestampFn()));
- }
-
- /**
- * Concept #4: Window into fixed windows. The fixed window size for this example defaults to 1
- * minute (you can change this with a command-line option). See the documentation for more
- * information on how fixed windows work, and for information on the other types of windowing
- * available (e.g., sliding windows).
- */
- PCollection<String> windowedWords = input
- .apply(Window.<String>into(
- FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))));
-
- /**
- * Concept #5: Re-use our existing CountWords transform that does not have knowledge of
- * windows over a PCollection containing windowed values.
- */
- PCollection<KV<String, Long>> wordCounts = windowedWords.apply(new WordCount.CountWords());
-
- /**
- * Concept #6: Format the results for a BigQuery table, then write to BigQuery.
- * The BigQuery output source supports both bounded and unbounded data.
- */
- wordCounts.apply(ParDo.of(new FormatAsTableRowFn()))
- .apply(BigQueryIO.Write
- .to(getTableReference(options))
- .withSchema(getSchema())
- .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
- .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND));
-
- PipelineResult result = pipeline.run();
-
- /**
- * To mock unbounded input from PubSub, we'll now start an auxiliary 'injector' pipeline that
- * runs for a limited time, and publishes to the input PubSub topic.
- *
- * With an unbounded input source, you will need to explicitly shut down this pipeline when you
- * are done with it, so that you do not continue to be charged for the instances. You can do
- * this via a ctrl-C from the command line, or from the developer's console UI for Dataflow
- * pipelines. The PubSub topic will also be deleted at this time.
- */
- exampleDataflowUtils.mockUnboundedSource(options.getInputFile(), result);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/WordCount.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/WordCount.java b/examples/src/main/java/com/google/cloud/dataflow/examples/WordCount.java
deleted file mode 100644
index 1086106..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/WordCount.java
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.Count;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.MapElements;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.SimpleFunction;
-import com.google.cloud.dataflow.sdk.transforms.Sum;
-import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-
-/**
- * An example that counts words in Shakespeare and includes Dataflow best practices.
- *
- * <p>This class, {@link WordCount}, is the second in a series of four successively more detailed
- * 'word count' examples. You may first want to take a look at {@link MinimalWordCount}.
- * After you've looked at this example, then see the {@link DebuggingWordCount}
- * pipeline, for introduction of additional concepts.
- *
- * <p>For a detailed walkthrough of this example, see
- * <a href="https://cloud.google.com/dataflow/java-sdk/wordcount-example">
- * https://cloud.google.com/dataflow/java-sdk/wordcount-example
- * </a>
- *
- * <p>Basic concepts, also in the MinimalWordCount example:
- * Reading text files; counting a PCollection; writing to GCS.
- *
- * <p>New Concepts:
- * <pre>
- * 1. Executing a Pipeline both locally and using the Dataflow service
- * 2. Using ParDo with static DoFns defined out-of-line
- * 3. Building a composite transform
- * 4. Defining your own pipeline options
- * </pre>
- *
- * <p>Concept #1: you can execute this pipeline either locally or using the Dataflow service.
- * These are now command-line options and not hard-coded as they were in the MinimalWordCount
- * example.
- * To execute this pipeline locally, specify general pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * }
- * </pre>
- * and a local output file or output prefix on GCS:
- * <pre>{@code
- * --output=[YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PREFIX]
- * }</pre>
- *
- * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowPipelineRunner
- * }
- * </pre>
- * and an output prefix on GCS:
- * <pre>{@code
- * --output=gs://YOUR_OUTPUT_PREFIX
- * }</pre>
- *
- * <p>The input file defaults to {@code gs://dataflow-samples/shakespeare/kinglear.txt} and can be
- * overridden with {@code --inputFile}.
- */
-public class WordCount {
-
- /**
- * Concept #2: You can make your pipeline code less verbose by defining your DoFns statically out-
- * of-line. This DoFn tokenizes lines of text into individual words; we pass it to a ParDo in the
- * pipeline.
- */
- static class ExtractWordsFn extends DoFn<String, String> {
- private final Aggregator<Long, Long> emptyLines =
- createAggregator("emptyLines", new Sum.SumLongFn());
-
- @Override
- public void processElement(ProcessContext c) {
- if (c.element().trim().isEmpty()) {
- emptyLines.addValue(1L);
- }
-
- // Split the line into words.
- String[] words = c.element().split("[^a-zA-Z']+");
-
- // Output each word encountered into the output PCollection.
- for (String word : words) {
- if (!word.isEmpty()) {
- c.output(word);
- }
- }
- }
- }
-
- /** A SimpleFunction that converts a Word and Count into a printable string. */
- public static class FormatAsTextFn extends SimpleFunction<KV<String, Long>, String> {
- @Override
- public String apply(KV<String, Long> input) {
- return input.getKey() + ": " + input.getValue();
- }
- }
-
- /**
- * A PTransform that converts a PCollection containing lines of text into a PCollection of
- * formatted word counts.
- *
- * <p>Concept #3: This is a custom composite transform that bundles two transforms (ParDo and
- * Count) as a reusable PTransform subclass. Using composite transforms allows for easy reuse,
- * modular testing, and an improved monitoring experience.
- */
- public static class CountWords extends PTransform<PCollection<String>,
- PCollection<KV<String, Long>>> {
- @Override
- public PCollection<KV<String, Long>> apply(PCollection<String> lines) {
-
- // Convert lines of text into individual words.
- PCollection<String> words = lines.apply(
- ParDo.of(new ExtractWordsFn()));
-
- // Count the number of times each word occurs.
- PCollection<KV<String, Long>> wordCounts =
- words.apply(Count.<String>perElement());
-
- return wordCounts;
- }
- }
-
- /**
- * Options supported by {@link WordCount}.
- *
- * <p>Concept #4: Defining your own configuration options. Here, you can add your own arguments
- * to be processed by the command-line parser, and specify default values for them. You can then
- * access the options values in your pipeline code.
- *
- * <p>Inherits standard configuration options.
- */
- public static interface WordCountOptions extends PipelineOptions {
- @Description("Path of the file to read from")
- @Default.String("gs://dataflow-samples/shakespeare/kinglear.txt")
- String getInputFile();
- void setInputFile(String value);
-
- @Description("Path of the file to write to")
- @Default.InstanceFactory(OutputFactory.class)
- String getOutput();
- void setOutput(String value);
-
- /**
- * Returns "gs://${YOUR_STAGING_DIRECTORY}/counts.txt" as the default destination.
- */
- public static class OutputFactory implements DefaultValueFactory<String> {
- @Override
- public String create(PipelineOptions options) {
- DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
- if (dataflowOptions.getStagingLocation() != null) {
- return GcsPath.fromUri(dataflowOptions.getStagingLocation())
- .resolve("counts.txt").toString();
- } else {
- throw new IllegalArgumentException("Must specify --output or --stagingLocation");
- }
- }
- }
-
- }
-
- public static void main(String[] args) {
- WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
- .as(WordCountOptions.class);
- Pipeline p = Pipeline.create(options);
-
- // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
- // static FormatAsTextFn() to the ParDo transform.
- p.apply(TextIO.Read.named("ReadLines").from(options.getInputFile()))
- .apply(new CountWords())
- .apply(MapElements.via(new FormatAsTextFn()))
- .apply(TextIO.Write.named("WriteCounts").to(options.getOutput()));
-
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/common/DataflowExampleOptions.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/common/DataflowExampleOptions.java b/examples/src/main/java/com/google/cloud/dataflow/examples/common/DataflowExampleOptions.java
deleted file mode 100644
index 606bfb4..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/common/DataflowExampleOptions.java
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.common;
-
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-
-/**
- * Options that can be used to configure the Dataflow examples.
- */
-public interface DataflowExampleOptions extends DataflowPipelineOptions {
- @Description("Whether to keep jobs running on the Dataflow service after local process exit")
- @Default.Boolean(false)
- boolean getKeepJobsRunning();
- void setKeepJobsRunning(boolean keepJobsRunning);
-
- @Description("Number of workers to use when executing the injector pipeline")
- @Default.Integer(1)
- int getInjectorNumWorkers();
- void setInjectorNumWorkers(int numWorkers);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/common/DataflowExampleUtils.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/common/DataflowExampleUtils.java b/examples/src/main/java/com/google/cloud/dataflow/examples/common/DataflowExampleUtils.java
deleted file mode 100644
index 4dfdd85..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/common/DataflowExampleUtils.java
+++ /dev/null
@@ -1,485 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.common;
-
-import com.google.api.client.googleapis.json.GoogleJsonResponseException;
-import com.google.api.client.googleapis.services.AbstractGoogleClientRequest;
-import com.google.api.client.util.BackOff;
-import com.google.api.client.util.BackOffUtils;
-import com.google.api.client.util.Sleeper;
-import com.google.api.services.bigquery.Bigquery;
-import com.google.api.services.bigquery.Bigquery.Datasets;
-import com.google.api.services.bigquery.Bigquery.Tables;
-import com.google.api.services.bigquery.model.Dataset;
-import com.google.api.services.bigquery.model.DatasetReference;
-import com.google.api.services.bigquery.model.Table;
-import com.google.api.services.bigquery.model.TableReference;
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.api.services.dataflow.Dataflow;
-import com.google.api.services.pubsub.Pubsub;
-import com.google.api.services.pubsub.model.Subscription;
-import com.google.api.services.pubsub.model.Topic;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.PipelineResult;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.BigQueryOptions;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineJob;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
-import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.IntraBundleParallelization;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.util.AttemptBoundedExponentialBackOff;
-import com.google.cloud.dataflow.sdk.util.MonitoringUtil;
-import com.google.cloud.dataflow.sdk.util.Transport;
-import com.google.cloud.dataflow.sdk.values.PBegin;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.common.base.Strings;
-import com.google.common.base.Throwables;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-
-import java.io.IOException;
-import java.util.Collection;
-import java.util.List;
-import java.util.Set;
-import java.util.concurrent.TimeUnit;
-
-import javax.servlet.http.HttpServletResponse;
-
-/**
- * The utility class that sets up and tears down external resources, starts the Google Cloud Pub/Sub
- * injector, and cancels the streaming and the injector pipelines once the program terminates.
- *
- * <p>It is used to run Dataflow examples, such as TrafficMaxLaneFlow and TrafficRoutes.
- */
-public class DataflowExampleUtils {
-
- private final DataflowPipelineOptions options;
- private Bigquery bigQueryClient = null;
- private Pubsub pubsubClient = null;
- private Dataflow dataflowClient = null;
- private Set<DataflowPipelineJob> jobsToCancel = Sets.newHashSet();
- private List<String> pendingMessages = Lists.newArrayList();
-
- public DataflowExampleUtils(DataflowPipelineOptions options) {
- this.options = options;
- }
-
- /**
- * Do resources and runner options setup.
- */
- public DataflowExampleUtils(DataflowPipelineOptions options, boolean isUnbounded)
- throws IOException {
- this.options = options;
- setupResourcesAndRunner(isUnbounded);
- }
-
- /**
- * Sets up external resources that are required by the example,
- * such as Pub/Sub topics and BigQuery tables.
- *
- * @throws IOException if there is a problem setting up the resources
- */
- public void setup() throws IOException {
- Sleeper sleeper = Sleeper.DEFAULT;
- BackOff backOff = new AttemptBoundedExponentialBackOff(3, 200);
- Throwable lastException = null;
- try {
- do {
- try {
- setupPubsub();
- setupBigQueryTable();
- return;
- } catch (GoogleJsonResponseException e) {
- lastException = e;
- }
- } while (BackOffUtils.next(sleeper, backOff));
- } catch (InterruptedException e) {
- // Ignore InterruptedException
- }
- Throwables.propagate(lastException);
- }
-
- /**
- * Set up external resources, and configure the runner appropriately.
- */
- public void setupResourcesAndRunner(boolean isUnbounded) throws IOException {
- if (isUnbounded) {
- options.setStreaming(true);
- }
- setup();
- setupRunner();
- }
-
- /**
- * Sets up the Google Cloud Pub/Sub topic.
- *
- * <p>If the topic doesn't exist, a new topic with the given name will be created.
- *
- * @throws IOException if there is a problem setting up the Pub/Sub topic
- */
- public void setupPubsub() throws IOException {
- ExamplePubsubTopicAndSubscriptionOptions pubsubOptions =
- options.as(ExamplePubsubTopicAndSubscriptionOptions.class);
- if (!pubsubOptions.getPubsubTopic().isEmpty()) {
- pendingMessages.add("**********************Set Up Pubsub************************");
- setupPubsubTopic(pubsubOptions.getPubsubTopic());
- pendingMessages.add("The Pub/Sub topic has been set up for this example: "
- + pubsubOptions.getPubsubTopic());
-
- if (!pubsubOptions.getPubsubSubscription().isEmpty()) {
- setupPubsubSubscription(
- pubsubOptions.getPubsubTopic(), pubsubOptions.getPubsubSubscription());
- pendingMessages.add("The Pub/Sub subscription has been set up for this example: "
- + pubsubOptions.getPubsubSubscription());
- }
- }
- }
-
- /**
- * Sets up the BigQuery table with the given schema.
- *
- * <p>If the table already exists, the schema has to match the given one. Otherwise, the example
- * will throw a RuntimeException. If the table doesn't exist, a new table with the given schema
- * will be created.
- *
- * @throws IOException if there is a problem setting up the BigQuery table
- */
- public void setupBigQueryTable() throws IOException {
- ExampleBigQueryTableOptions bigQueryTableOptions =
- options.as(ExampleBigQueryTableOptions.class);
- if (bigQueryTableOptions.getBigQueryDataset() != null
- && bigQueryTableOptions.getBigQueryTable() != null
- && bigQueryTableOptions.getBigQuerySchema() != null) {
- pendingMessages.add("******************Set Up Big Query Table*******************");
- setupBigQueryTable(bigQueryTableOptions.getProject(),
- bigQueryTableOptions.getBigQueryDataset(),
- bigQueryTableOptions.getBigQueryTable(),
- bigQueryTableOptions.getBigQuerySchema());
- pendingMessages.add("The BigQuery table has been set up for this example: "
- + bigQueryTableOptions.getProject()
- + ":" + bigQueryTableOptions.getBigQueryDataset()
- + "." + bigQueryTableOptions.getBigQueryTable());
- }
- }
-
- /**
- * Tears down external resources that can be deleted upon the example's completion.
- */
- private void tearDown() {
- pendingMessages.add("*************************Tear Down*************************");
- ExamplePubsubTopicAndSubscriptionOptions pubsubOptions =
- options.as(ExamplePubsubTopicAndSubscriptionOptions.class);
- if (!pubsubOptions.getPubsubTopic().isEmpty()) {
- try {
- deletePubsubTopic(pubsubOptions.getPubsubTopic());
- pendingMessages.add("The Pub/Sub topic has been deleted: "
- + pubsubOptions.getPubsubTopic());
- } catch (IOException e) {
- pendingMessages.add("Failed to delete the Pub/Sub topic : "
- + pubsubOptions.getPubsubTopic());
- }
- if (!pubsubOptions.getPubsubSubscription().isEmpty()) {
- try {
- deletePubsubSubscription(pubsubOptions.getPubsubSubscription());
- pendingMessages.add("The Pub/Sub subscription has been deleted: "
- + pubsubOptions.getPubsubSubscription());
- } catch (IOException e) {
- pendingMessages.add("Failed to delete the Pub/Sub subscription : "
- + pubsubOptions.getPubsubSubscription());
- }
- }
- }
-
- ExampleBigQueryTableOptions bigQueryTableOptions =
- options.as(ExampleBigQueryTableOptions.class);
- if (bigQueryTableOptions.getBigQueryDataset() != null
- && bigQueryTableOptions.getBigQueryTable() != null
- && bigQueryTableOptions.getBigQuerySchema() != null) {
- pendingMessages.add("The BigQuery table might contain the example's output, "
- + "and it is not deleted automatically: "
- + bigQueryTableOptions.getProject()
- + ":" + bigQueryTableOptions.getBigQueryDataset()
- + "." + bigQueryTableOptions.getBigQueryTable());
- pendingMessages.add("Please go to the Developers Console to delete it manually."
- + " Otherwise, you may be charged for its usage.");
- }
- }
-
- private void setupBigQueryTable(String projectId, String datasetId, String tableId,
- TableSchema schema) throws IOException {
- if (bigQueryClient == null) {
- bigQueryClient = Transport.newBigQueryClient(options.as(BigQueryOptions.class)).build();
- }
-
- Datasets datasetService = bigQueryClient.datasets();
- if (executeNullIfNotFound(datasetService.get(projectId, datasetId)) == null) {
- Dataset newDataset = new Dataset().setDatasetReference(
- new DatasetReference().setProjectId(projectId).setDatasetId(datasetId));
- datasetService.insert(projectId, newDataset).execute();
- }
-
- Tables tableService = bigQueryClient.tables();
- Table table = executeNullIfNotFound(tableService.get(projectId, datasetId, tableId));
- if (table == null) {
- Table newTable = new Table().setSchema(schema).setTableReference(
- new TableReference().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId));
- tableService.insert(projectId, datasetId, newTable).execute();
- } else if (!table.getSchema().equals(schema)) {
- throw new RuntimeException(
- "Table exists and schemas do not match, expecting: " + schema.toPrettyString()
- + ", actual: " + table.getSchema().toPrettyString());
- }
- }
-
- private void setupPubsubTopic(String topic) throws IOException {
- if (pubsubClient == null) {
- pubsubClient = Transport.newPubsubClient(options).build();
- }
- if (executeNullIfNotFound(pubsubClient.projects().topics().get(topic)) == null) {
- pubsubClient.projects().topics().create(topic, new Topic().setName(topic)).execute();
- }
- }
-
- private void setupPubsubSubscription(String topic, String subscription) throws IOException {
- if (pubsubClient == null) {
- pubsubClient = Transport.newPubsubClient(options).build();
- }
- if (executeNullIfNotFound(pubsubClient.projects().subscriptions().get(subscription)) == null) {
- Subscription subInfo = new Subscription()
- .setAckDeadlineSeconds(60)
- .setTopic(topic);
- pubsubClient.projects().subscriptions().create(subscription, subInfo).execute();
- }
- }
-
- /**
- * Deletes the Google Cloud Pub/Sub topic.
- *
- * @throws IOException if there is a problem deleting the Pub/Sub topic
- */
- private void deletePubsubTopic(String topic) throws IOException {
- if (pubsubClient == null) {
- pubsubClient = Transport.newPubsubClient(options).build();
- }
- if (executeNullIfNotFound(pubsubClient.projects().topics().get(topic)) != null) {
- pubsubClient.projects().topics().delete(topic).execute();
- }
- }
-
- /**
- * Deletes the Google Cloud Pub/Sub subscription.
- *
- * @throws IOException if there is a problem deleting the Pub/Sub subscription
- */
- private void deletePubsubSubscription(String subscription) throws IOException {
- if (pubsubClient == null) {
- pubsubClient = Transport.newPubsubClient(options).build();
- }
- if (executeNullIfNotFound(pubsubClient.projects().subscriptions().get(subscription)) != null) {
- pubsubClient.projects().subscriptions().delete(subscription).execute();
- }
- }
-
- /**
- * If this is an unbounded (streaming) pipeline, and both inputFile and pubsub topic are defined,
- * start an 'injector' pipeline that publishes the contents of the file to the given topic, first
- * creating the topic if necessary.
- */
- public void startInjectorIfNeeded(String inputFile) {
- ExamplePubsubTopicOptions pubsubTopicOptions = options.as(ExamplePubsubTopicOptions.class);
- if (pubsubTopicOptions.isStreaming()
- && !Strings.isNullOrEmpty(inputFile)
- && !Strings.isNullOrEmpty(pubsubTopicOptions.getPubsubTopic())) {
- runInjectorPipeline(inputFile, pubsubTopicOptions.getPubsubTopic());
- }
- }
-
- /**
- * Do some runner setup: check that the DirectPipelineRunner is not used in conjunction with
- * streaming, and if streaming is specified, use the DataflowPipelineRunner. Return the streaming
- * flag value.
- */
- public void setupRunner() {
- if (options.isStreaming() && options.getRunner() != DirectPipelineRunner.class) {
- // In order to cancel the pipelines automatically,
- // {@literal DataflowPipelineRunner} is forced to be used.
- options.setRunner(DataflowPipelineRunner.class);
- }
- }
-
- /**
- * Runs a batch pipeline to inject data into the PubSubIO input topic.
- *
- * <p>The injector pipeline will read from the given text file, and inject data
- * into the Google Cloud Pub/Sub topic.
- */
- public void runInjectorPipeline(String inputFile, String topic) {
- runInjectorPipeline(TextIO.Read.from(inputFile), topic, null);
- }
-
- /**
- * Runs a batch pipeline to inject data into the PubSubIO input topic.
- *
- * <p>The injector pipeline will read from the given source, and inject data
- * into the Google Cloud Pub/Sub topic.
- */
- public void runInjectorPipeline(PTransform<? super PBegin, PCollection<String>> readSource,
- String topic,
- String pubsubTimestampTabelKey) {
- PubsubFileInjector.Bound injector;
- if (Strings.isNullOrEmpty(pubsubTimestampTabelKey)) {
- injector = PubsubFileInjector.publish(topic);
- } else {
- injector = PubsubFileInjector.withTimestampLabelKey(pubsubTimestampTabelKey).publish(topic);
- }
- DataflowPipelineOptions copiedOptions = options.cloneAs(DataflowPipelineOptions.class);
- if (options.getServiceAccountName() != null) {
- copiedOptions.setServiceAccountName(options.getServiceAccountName());
- }
- if (options.getServiceAccountKeyfile() != null) {
- copiedOptions.setServiceAccountKeyfile(options.getServiceAccountKeyfile());
- }
- copiedOptions.setStreaming(false);
- copiedOptions.setNumWorkers(options.as(DataflowExampleOptions.class).getInjectorNumWorkers());
- copiedOptions.setJobName(options.getJobName() + "-injector");
- Pipeline injectorPipeline = Pipeline.create(copiedOptions);
- injectorPipeline.apply(readSource)
- .apply(IntraBundleParallelization
- .of(injector)
- .withMaxParallelism(20));
- PipelineResult result = injectorPipeline.run();
- if (result instanceof DataflowPipelineJob) {
- jobsToCancel.add(((DataflowPipelineJob) result));
- }
- }
-
- /**
- * Runs the provided pipeline to inject data into the PubSubIO input topic.
- */
- public void runInjectorPipeline(Pipeline injectorPipeline) {
- PipelineResult result = injectorPipeline.run();
- if (result instanceof DataflowPipelineJob) {
- jobsToCancel.add(((DataflowPipelineJob) result));
- }
- }
-
- /**
- * Start the auxiliary injector pipeline, then wait for this pipeline to finish.
- */
- public void mockUnboundedSource(String inputFile, PipelineResult result) {
- startInjectorIfNeeded(inputFile);
- waitToFinish(result);
- }
-
- /**
- * If {@literal DataflowPipelineRunner} or {@literal BlockingDataflowPipelineRunner} is used,
- * waits for the pipeline to finish and cancels it (and the injector) before the program exists.
- */
- public void waitToFinish(PipelineResult result) {
- if (result instanceof DataflowPipelineJob) {
- final DataflowPipelineJob job = (DataflowPipelineJob) result;
- jobsToCancel.add(job);
- if (!options.as(DataflowExampleOptions.class).getKeepJobsRunning()) {
- addShutdownHook(jobsToCancel);
- }
- try {
- job.waitToFinish(-1, TimeUnit.SECONDS, new MonitoringUtil.PrintHandler(System.out));
- } catch (Exception e) {
- throw new RuntimeException("Failed to wait for job to finish: " + job.getJobId());
- }
- } else {
- // Do nothing if the given PipelineResult doesn't support waitToFinish(),
- // such as EvaluationResults returned by DirectPipelineRunner.
- tearDown();
- printPendingMessages();
- }
- }
-
- private void addShutdownHook(final Collection<DataflowPipelineJob> jobs) {
- if (dataflowClient == null) {
- dataflowClient = options.getDataflowClient();
- }
-
- Runtime.getRuntime().addShutdownHook(new Thread() {
- @Override
- public void run() {
- tearDown();
- printPendingMessages();
- for (DataflowPipelineJob job : jobs) {
- System.out.println("Canceling example pipeline: " + job.getJobId());
- try {
- job.cancel();
- } catch (IOException e) {
- System.out.println("Failed to cancel the job,"
- + " please go to the Developers Console to cancel it manually");
- System.out.println(
- MonitoringUtil.getJobMonitoringPageURL(job.getProjectId(), job.getJobId()));
- }
- }
-
- for (DataflowPipelineJob job : jobs) {
- boolean cancellationVerified = false;
- for (int retryAttempts = 6; retryAttempts > 0; retryAttempts--) {
- if (job.getState().isTerminal()) {
- cancellationVerified = true;
- System.out.println("Canceled example pipeline: " + job.getJobId());
- break;
- } else {
- System.out.println(
- "The example pipeline is still running. Verifying the cancellation.");
- }
- try {
- Thread.sleep(10000);
- } catch (InterruptedException e) {
- // Ignore
- }
- }
- if (!cancellationVerified) {
- System.out.println("Failed to verify the cancellation for job: " + job.getJobId());
- System.out.println("Please go to the Developers Console to verify manually:");
- System.out.println(
- MonitoringUtil.getJobMonitoringPageURL(job.getProjectId(), job.getJobId()));
- }
- }
- }
- });
- }
-
- private void printPendingMessages() {
- System.out.println();
- System.out.println("***********************************************************");
- System.out.println("***********************************************************");
- for (String message : pendingMessages) {
- System.out.println(message);
- }
- System.out.println("***********************************************************");
- System.out.println("***********************************************************");
- }
-
- private static <T> T executeNullIfNotFound(
- AbstractGoogleClientRequest<T> request) throws IOException {
- try {
- return request.execute();
- } catch (GoogleJsonResponseException e) {
- if (e.getStatusCode() == HttpServletResponse.SC_NOT_FOUND) {
- return null;
- } else {
- throw e;
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/common/ExampleBigQueryTableOptions.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/common/ExampleBigQueryTableOptions.java b/examples/src/main/java/com/google/cloud/dataflow/examples/common/ExampleBigQueryTableOptions.java
deleted file mode 100644
index 7c213b5..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/common/ExampleBigQueryTableOptions.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.common;
-
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-
-/**
- * Options that can be used to configure BigQuery tables in Dataflow examples.
- * The project defaults to the project being used to run the example.
- */
-public interface ExampleBigQueryTableOptions extends DataflowPipelineOptions {
- @Description("BigQuery dataset name")
- @Default.String("dataflow_examples")
- String getBigQueryDataset();
- void setBigQueryDataset(String dataset);
-
- @Description("BigQuery table name")
- @Default.InstanceFactory(BigQueryTableFactory.class)
- String getBigQueryTable();
- void setBigQueryTable(String table);
-
- @Description("BigQuery table schema")
- TableSchema getBigQuerySchema();
- void setBigQuerySchema(TableSchema schema);
-
- /**
- * Returns the job name as the default BigQuery table name.
- */
- static class BigQueryTableFactory implements DefaultValueFactory<String> {
- @Override
- public String create(PipelineOptions options) {
- return options.as(DataflowPipelineOptions.class).getJobName()
- .replace('-', '_');
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/common/ExamplePubsubTopicAndSubscriptionOptions.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/common/ExamplePubsubTopicAndSubscriptionOptions.java b/examples/src/main/java/com/google/cloud/dataflow/examples/common/ExamplePubsubTopicAndSubscriptionOptions.java
deleted file mode 100644
index d7bd4b8..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/common/ExamplePubsubTopicAndSubscriptionOptions.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.common;
-
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-
-/**
- * Options that can be used to configure Pub/Sub topic/subscription in Dataflow examples.
- */
-public interface ExamplePubsubTopicAndSubscriptionOptions extends ExamplePubsubTopicOptions {
- @Description("Pub/Sub subscription")
- @Default.InstanceFactory(PubsubSubscriptionFactory.class)
- String getPubsubSubscription();
- void setPubsubSubscription(String subscription);
-
- /**
- * Returns a default Pub/Sub subscription based on the project and the job names.
- */
- static class PubsubSubscriptionFactory implements DefaultValueFactory<String> {
- @Override
- public String create(PipelineOptions options) {
- DataflowPipelineOptions dataflowPipelineOptions =
- options.as(DataflowPipelineOptions.class);
- return "projects/" + dataflowPipelineOptions.getProject()
- + "/subscriptions/" + dataflowPipelineOptions.getJobName();
- }
- }
-}
[55/67] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/common/ExamplePubsubTopicOptions.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/common/ExamplePubsubTopicOptions.java b/examples/src/main/java/com/google/cloud/dataflow/examples/common/ExamplePubsubTopicOptions.java
deleted file mode 100644
index 4bedf31..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/common/ExamplePubsubTopicOptions.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.common;
-
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-
-/**
- * Options that can be used to configure Pub/Sub topic in Dataflow examples.
- */
-public interface ExamplePubsubTopicOptions extends DataflowPipelineOptions {
- @Description("Pub/Sub topic")
- @Default.InstanceFactory(PubsubTopicFactory.class)
- String getPubsubTopic();
- void setPubsubTopic(String topic);
-
- /**
- * Returns a default Pub/Sub topic based on the project and the job names.
- */
- static class PubsubTopicFactory implements DefaultValueFactory<String> {
- @Override
- public String create(PipelineOptions options) {
- DataflowPipelineOptions dataflowPipelineOptions =
- options.as(DataflowPipelineOptions.class);
- return "projects/" + dataflowPipelineOptions.getProject()
- + "/topics/" + dataflowPipelineOptions.getJobName();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/common/PubsubFileInjector.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/common/PubsubFileInjector.java b/examples/src/main/java/com/google/cloud/dataflow/examples/common/PubsubFileInjector.java
deleted file mode 100644
index 4a82ae6..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/common/PubsubFileInjector.java
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.common;
-
-import com.google.api.services.pubsub.Pubsub;
-import com.google.api.services.pubsub.model.PublishRequest;
-import com.google.api.services.pubsub.model.PubsubMessage;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.options.Validation;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.IntraBundleParallelization;
-import com.google.cloud.dataflow.sdk.util.Transport;
-import com.google.common.collect.ImmutableMap;
-
-import java.io.IOException;
-import java.util.Arrays;
-
-/**
- * A batch Dataflow pipeline for injecting a set of GCS files into
- * a PubSub topic line by line. Empty lines are skipped.
- *
- * <p>This is useful for testing streaming
- * pipelines. Note that since batch pipelines might retry chunks, this
- * does _not_ guarantee exactly-once injection of file data. Some lines may
- * be published multiple times.
- * </p>
- */
-public class PubsubFileInjector {
-
- /**
- * An incomplete {@code PubsubFileInjector} transform with unbound output topic.
- */
- public static class Unbound {
- private final String timestampLabelKey;
-
- Unbound() {
- this.timestampLabelKey = null;
- }
-
- Unbound(String timestampLabelKey) {
- this.timestampLabelKey = timestampLabelKey;
- }
-
- Unbound withTimestampLabelKey(String timestampLabelKey) {
- return new Unbound(timestampLabelKey);
- }
-
- public Bound publish(String outputTopic) {
- return new Bound(outputTopic, timestampLabelKey);
- }
- }
-
- /** A DoFn that publishes non-empty lines to Google Cloud PubSub. */
- public static class Bound extends DoFn<String, Void> {
- private final String outputTopic;
- private final String timestampLabelKey;
- public transient Pubsub pubsub;
-
- public Bound(String outputTopic, String timestampLabelKey) {
- this.outputTopic = outputTopic;
- this.timestampLabelKey = timestampLabelKey;
- }
-
- @Override
- public void startBundle(Context context) {
- this.pubsub =
- Transport.newPubsubClient(context.getPipelineOptions().as(DataflowPipelineOptions.class))
- .build();
- }
-
- @Override
- public void processElement(ProcessContext c) throws IOException {
- if (c.element().isEmpty()) {
- return;
- }
- PubsubMessage pubsubMessage = new PubsubMessage();
- pubsubMessage.encodeData(c.element().getBytes());
- if (timestampLabelKey != null) {
- pubsubMessage.setAttributes(
- ImmutableMap.of(timestampLabelKey, Long.toString(c.timestamp().getMillis())));
- }
- PublishRequest publishRequest = new PublishRequest();
- publishRequest.setMessages(Arrays.asList(pubsubMessage));
- this.pubsub.projects().topics().publish(outputTopic, publishRequest).execute();
- }
- }
-
- /**
- * Creates a {@code PubsubFileInjector} transform with the given timestamp label key.
- */
- public static Unbound withTimestampLabelKey(String timestampLabelKey) {
- return new Unbound(timestampLabelKey);
- }
-
- /**
- * Creates a {@code PubsubFileInjector} transform that publishes to the given output topic.
- */
- public static Bound publish(String outputTopic) {
- return new Unbound().publish(outputTopic);
- }
-
- /**
- * Command line parameter options.
- */
- private interface PubsubFileInjectorOptions extends PipelineOptions {
- @Description("GCS location of files.")
- @Validation.Required
- String getInput();
- void setInput(String value);
-
- @Description("Topic to publish on.")
- @Validation.Required
- String getOutputTopic();
- void setOutputTopic(String value);
- }
-
- /**
- * Sets up and starts streaming pipeline.
- */
- public static void main(String[] args) {
- PubsubFileInjectorOptions options = PipelineOptionsFactory.fromArgs(args)
- .withValidation()
- .as(PubsubFileInjectorOptions.class);
-
- Pipeline pipeline = Pipeline.create(options);
-
- pipeline
- .apply(TextIO.Read.from(options.getInput()))
- .apply(IntraBundleParallelization.of(PubsubFileInjector.publish(options.getOutputTopic()))
- .withMaxParallelism(20));
-
- pipeline.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/complete/AutoComplete.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/complete/AutoComplete.java b/examples/src/main/java/com/google/cloud/dataflow/examples/complete/AutoComplete.java
deleted file mode 100644
index f897338..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/complete/AutoComplete.java
+++ /dev/null
@@ -1,516 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete;
-
-import com.google.api.services.bigquery.model.TableFieldSchema;
-import com.google.api.services.bigquery.model.TableReference;
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.api.services.datastore.DatastoreV1.Entity;
-import com.google.api.services.datastore.DatastoreV1.Key;
-import com.google.api.services.datastore.DatastoreV1.Value;
-import com.google.api.services.datastore.client.DatastoreHelper;
-import com.google.cloud.dataflow.examples.common.DataflowExampleUtils;
-import com.google.cloud.dataflow.examples.common.ExampleBigQueryTableOptions;
-import com.google.cloud.dataflow.examples.common.ExamplePubsubTopicOptions;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.PipelineResult;
-import com.google.cloud.dataflow.sdk.coders.AvroCoder;
-import com.google.cloud.dataflow.sdk.coders.DefaultCoder;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.io.DatastoreIO;
-import com.google.cloud.dataflow.sdk.io.PubsubIO;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.Count;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.Filter;
-import com.google.cloud.dataflow.sdk.transforms.Flatten;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.Partition;
-import com.google.cloud.dataflow.sdk.transforms.Partition.PartitionFn;
-import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
-import com.google.cloud.dataflow.sdk.transforms.Top;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.SlidingWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PBegin;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionList;
-import com.google.common.base.MoreObjects;
-import com.google.common.base.Preconditions;
-
-import org.joda.time.Duration;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * An example that computes the most popular hash tags
- * for every prefix, which can be used for auto-completion.
- *
- * <p>Concepts: Using the same pipeline in both streaming and batch, combiners,
- * composite transforms.
- *
- * <p>To execute this pipeline using the Dataflow service in batch mode,
- * specify pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=DataflowPipelineRunner
- * --inputFile=gs://path/to/input*.txt
- * }</pre>
- *
- * <p>To execute this pipeline using the Dataflow service in streaming mode,
- * specify pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=DataflowPipelineRunner
- * --inputFile=gs://YOUR_INPUT_DIRECTORY/*.txt
- * --streaming
- * }</pre>
- *
- * <p>This will update the datastore every 10 seconds based on the last
- * 30 minutes of data received.
- */
-public class AutoComplete {
-
- /**
- * A PTransform that takes as input a list of tokens and returns
- * the most common tokens per prefix.
- */
- public static class ComputeTopCompletions
- extends PTransform<PCollection<String>, PCollection<KV<String, List<CompletionCandidate>>>> {
- private final int candidatesPerPrefix;
- private final boolean recursive;
-
- protected ComputeTopCompletions(int candidatesPerPrefix, boolean recursive) {
- this.candidatesPerPrefix = candidatesPerPrefix;
- this.recursive = recursive;
- }
-
- public static ComputeTopCompletions top(int candidatesPerPrefix, boolean recursive) {
- return new ComputeTopCompletions(candidatesPerPrefix, recursive);
- }
-
- @Override
- public PCollection<KV<String, List<CompletionCandidate>>> apply(PCollection<String> input) {
- PCollection<CompletionCandidate> candidates = input
- // First count how often each token appears.
- .apply(new Count.PerElement<String>())
-
- // Map the KV outputs of Count into our own CompletionCandiate class.
- .apply(ParDo.named("CreateCompletionCandidates").of(
- new DoFn<KV<String, Long>, CompletionCandidate>() {
- @Override
- public void processElement(ProcessContext c) {
- c.output(new CompletionCandidate(c.element().getKey(), c.element().getValue()));
- }
- }));
-
- // Compute the top via either a flat or recursive algorithm.
- if (recursive) {
- return candidates
- .apply(new ComputeTopRecursive(candidatesPerPrefix, 1))
- .apply(Flatten.<KV<String, List<CompletionCandidate>>>pCollections());
- } else {
- return candidates
- .apply(new ComputeTopFlat(candidatesPerPrefix, 1));
- }
- }
- }
-
- /**
- * Lower latency, but more expensive.
- */
- private static class ComputeTopFlat
- extends PTransform<PCollection<CompletionCandidate>,
- PCollection<KV<String, List<CompletionCandidate>>>> {
- private final int candidatesPerPrefix;
- private final int minPrefix;
-
- public ComputeTopFlat(int candidatesPerPrefix, int minPrefix) {
- this.candidatesPerPrefix = candidatesPerPrefix;
- this.minPrefix = minPrefix;
- }
-
- @Override
- public PCollection<KV<String, List<CompletionCandidate>>> apply(
- PCollection<CompletionCandidate> input) {
- return input
- // For each completion candidate, map it to all prefixes.
- .apply(ParDo.of(new AllPrefixes(minPrefix)))
-
- // Find and return the top candiates for each prefix.
- .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix)
- .withHotKeyFanout(new HotKeyFanout()));
- }
-
- private static class HotKeyFanout implements SerializableFunction<String, Integer> {
- @Override
- public Integer apply(String input) {
- return (int) Math.pow(4, 5 - input.length());
- }
- }
- }
-
- /**
- * Cheaper but higher latency.
- *
- * <p>Returns two PCollections, the first is top prefixes of size greater
- * than minPrefix, and the second is top prefixes of size exactly
- * minPrefix.
- */
- private static class ComputeTopRecursive
- extends PTransform<PCollection<CompletionCandidate>,
- PCollectionList<KV<String, List<CompletionCandidate>>>> {
- private final int candidatesPerPrefix;
- private final int minPrefix;
-
- public ComputeTopRecursive(int candidatesPerPrefix, int minPrefix) {
- this.candidatesPerPrefix = candidatesPerPrefix;
- this.minPrefix = minPrefix;
- }
-
- private class KeySizePartitionFn implements PartitionFn<KV<String, List<CompletionCandidate>>> {
- @Override
- public int partitionFor(KV<String, List<CompletionCandidate>> elem, int numPartitions) {
- return elem.getKey().length() > minPrefix ? 0 : 1;
- }
- }
-
- private static class FlattenTops
- extends DoFn<KV<String, List<CompletionCandidate>>, CompletionCandidate> {
- @Override
- public void processElement(ProcessContext c) {
- for (CompletionCandidate cc : c.element().getValue()) {
- c.output(cc);
- }
- }
- }
-
- @Override
- public PCollectionList<KV<String, List<CompletionCandidate>>> apply(
- PCollection<CompletionCandidate> input) {
- if (minPrefix > 10) {
- // Base case, partitioning to return the output in the expected format.
- return input
- .apply(new ComputeTopFlat(candidatesPerPrefix, minPrefix))
- .apply(Partition.of(2, new KeySizePartitionFn()));
- } else {
- // If a candidate is in the top N for prefix a...b, it must also be in the top
- // N for a...bX for every X, which is typlically a much smaller set to consider.
- // First, compute the top candidate for prefixes of size at least minPrefix + 1.
- PCollectionList<KV<String, List<CompletionCandidate>>> larger = input
- .apply(new ComputeTopRecursive(candidatesPerPrefix, minPrefix + 1));
- // Consider the top candidates for each prefix of length minPrefix + 1...
- PCollection<KV<String, List<CompletionCandidate>>> small =
- PCollectionList
- .of(larger.get(1).apply(ParDo.of(new FlattenTops())))
- // ...together with those (previously excluded) candidates of length
- // exactly minPrefix...
- .and(input.apply(Filter.byPredicate(
- new SerializableFunction<CompletionCandidate, Boolean>() {
- @Override
- public Boolean apply(CompletionCandidate c) {
- return c.getValue().length() == minPrefix;
- }
- })))
- .apply("FlattenSmall", Flatten.<CompletionCandidate>pCollections())
- // ...set the key to be the minPrefix-length prefix...
- .apply(ParDo.of(new AllPrefixes(minPrefix, minPrefix)))
- // ...and (re)apply the Top operator to all of them together.
- .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix));
-
- PCollection<KV<String, List<CompletionCandidate>>> flattenLarger = larger
- .apply("FlattenLarge", Flatten.<KV<String, List<CompletionCandidate>>>pCollections());
-
- return PCollectionList.of(flattenLarger).and(small);
- }
- }
- }
-
- /**
- * A DoFn that keys each candidate by all its prefixes.
- */
- private static class AllPrefixes
- extends DoFn<CompletionCandidate, KV<String, CompletionCandidate>> {
- private final int minPrefix;
- private final int maxPrefix;
- public AllPrefixes(int minPrefix) {
- this(minPrefix, Integer.MAX_VALUE);
- }
- public AllPrefixes(int minPrefix, int maxPrefix) {
- this.minPrefix = minPrefix;
- this.maxPrefix = maxPrefix;
- }
- @Override
- public void processElement(ProcessContext c) {
- String word = c.element().value;
- for (int i = minPrefix; i <= Math.min(word.length(), maxPrefix); i++) {
- c.output(KV.of(word.substring(0, i), c.element()));
- }
- }
- }
-
- /**
- * Class used to store tag-count pairs.
- */
- @DefaultCoder(AvroCoder.class)
- static class CompletionCandidate implements Comparable<CompletionCandidate> {
- private long count;
- private String value;
-
- public CompletionCandidate(String value, long count) {
- this.value = value;
- this.count = count;
- }
-
- public long getCount() {
- return count;
- }
-
- public String getValue() {
- return value;
- }
-
- // Empty constructor required for Avro decoding.
- public CompletionCandidate() {}
-
- @Override
- public int compareTo(CompletionCandidate o) {
- if (this.count < o.count) {
- return -1;
- } else if (this.count == o.count) {
- return this.value.compareTo(o.value);
- } else {
- return 1;
- }
- }
-
- @Override
- public boolean equals(Object other) {
- if (other instanceof CompletionCandidate) {
- CompletionCandidate that = (CompletionCandidate) other;
- return this.count == that.count && this.value.equals(that.value);
- } else {
- return false;
- }
- }
-
- @Override
- public int hashCode() {
- return Long.valueOf(count).hashCode() ^ value.hashCode();
- }
-
- @Override
- public String toString() {
- return "CompletionCandidate[" + value + ", " + count + "]";
- }
- }
-
- /**
- * Takes as input a set of strings, and emits each #hashtag found therein.
- */
- static class ExtractHashtags extends DoFn<String, String> {
- @Override
- public void processElement(ProcessContext c) {
- Matcher m = Pattern.compile("#\\S+").matcher(c.element());
- while (m.find()) {
- c.output(m.group().substring(1));
- }
- }
- }
-
- static class FormatForBigquery extends DoFn<KV<String, List<CompletionCandidate>>, TableRow> {
- @Override
- public void processElement(ProcessContext c) {
- List<TableRow> completions = new ArrayList<>();
- for (CompletionCandidate cc : c.element().getValue()) {
- completions.add(new TableRow()
- .set("count", cc.getCount())
- .set("tag", cc.getValue()));
- }
- TableRow row = new TableRow()
- .set("prefix", c.element().getKey())
- .set("tags", completions);
- c.output(row);
- }
-
- /**
- * Defines the BigQuery schema used for the output.
- */
- static TableSchema getSchema() {
- List<TableFieldSchema> tagFields = new ArrayList<>();
- tagFields.add(new TableFieldSchema().setName("count").setType("INTEGER"));
- tagFields.add(new TableFieldSchema().setName("tag").setType("STRING"));
- List<TableFieldSchema> fields = new ArrayList<>();
- fields.add(new TableFieldSchema().setName("prefix").setType("STRING"));
- fields.add(new TableFieldSchema()
- .setName("tags").setType("RECORD").setMode("REPEATED").setFields(tagFields));
- return new TableSchema().setFields(fields);
- }
- }
-
- /**
- * Takes as input a the top candidates per prefix, and emits an entity
- * suitable for writing to Datastore.
- */
- static class FormatForDatastore extends DoFn<KV<String, List<CompletionCandidate>>, Entity> {
- private String kind;
-
- public FormatForDatastore(String kind) {
- this.kind = kind;
- }
-
- @Override
- public void processElement(ProcessContext c) {
- Entity.Builder entityBuilder = Entity.newBuilder();
- Key key = DatastoreHelper.makeKey(kind, c.element().getKey()).build();
-
- entityBuilder.setKey(key);
- List<Value> candidates = new ArrayList<>();
- for (CompletionCandidate tag : c.element().getValue()) {
- Entity.Builder tagEntity = Entity.newBuilder();
- tagEntity.addProperty(
- DatastoreHelper.makeProperty("tag", DatastoreHelper.makeValue(tag.value)));
- tagEntity.addProperty(
- DatastoreHelper.makeProperty("count", DatastoreHelper.makeValue(tag.count)));
- candidates.add(DatastoreHelper.makeValue(tagEntity).setIndexed(false).build());
- }
- entityBuilder.addProperty(
- DatastoreHelper.makeProperty("candidates", DatastoreHelper.makeValue(candidates)));
- c.output(entityBuilder.build());
- }
- }
-
- /**
- * Options supported by this class.
- *
- * <p>Inherits standard Dataflow configuration options.
- */
- private static interface Options extends ExamplePubsubTopicOptions, ExampleBigQueryTableOptions {
- @Description("Input text file")
- String getInputFile();
- void setInputFile(String value);
-
- @Description("Whether to use the recursive algorithm")
- @Default.Boolean(true)
- Boolean getRecursive();
- void setRecursive(Boolean value);
-
- @Description("Dataset entity kind")
- @Default.String("autocomplete-demo")
- String getKind();
- void setKind(String value);
-
- @Description("Whether output to BigQuery")
- @Default.Boolean(true)
- Boolean getOutputToBigQuery();
- void setOutputToBigQuery(Boolean value);
-
- @Description("Whether output to Datastore")
- @Default.Boolean(false)
- Boolean getOutputToDatastore();
- void setOutputToDatastore(Boolean value);
-
- @Description("Datastore output dataset ID, defaults to project ID")
- String getOutputDataset();
- void setOutputDataset(String value);
- }
-
- public static void main(String[] args) throws IOException {
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
-
- if (options.isStreaming()) {
- // In order to cancel the pipelines automatically,
- // {@literal DataflowPipelineRunner} is forced to be used.
- options.setRunner(DataflowPipelineRunner.class);
- }
-
- options.setBigQuerySchema(FormatForBigquery.getSchema());
- DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options);
-
- // We support running the same pipeline in either
- // batch or windowed streaming mode.
- PTransform<? super PBegin, PCollection<String>> readSource;
- WindowFn<Object, ?> windowFn;
- if (options.isStreaming()) {
- Preconditions.checkArgument(
- !options.getOutputToDatastore(), "DatastoreIO is not supported in streaming.");
- dataflowUtils.setupPubsub();
-
- readSource = PubsubIO.Read.topic(options.getPubsubTopic());
- windowFn = SlidingWindows.of(Duration.standardMinutes(30)).every(Duration.standardSeconds(5));
- } else {
- readSource = TextIO.Read.from(options.getInputFile());
- windowFn = new GlobalWindows();
- }
-
- // Create the pipeline.
- Pipeline p = Pipeline.create(options);
- PCollection<KV<String, List<CompletionCandidate>>> toWrite = p
- .apply(readSource)
- .apply(ParDo.of(new ExtractHashtags()))
- .apply(Window.<String>into(windowFn))
- .apply(ComputeTopCompletions.top(10, options.getRecursive()));
-
- if (options.getOutputToDatastore()) {
- toWrite
- .apply(ParDo.named("FormatForDatastore").of(new FormatForDatastore(options.getKind())))
- .apply(DatastoreIO.writeTo(MoreObjects.firstNonNull(
- options.getOutputDataset(), options.getProject())));
- }
- if (options.getOutputToBigQuery()) {
- dataflowUtils.setupBigQueryTable();
-
- TableReference tableRef = new TableReference();
- tableRef.setProjectId(options.getProject());
- tableRef.setDatasetId(options.getBigQueryDataset());
- tableRef.setTableId(options.getBigQueryTable());
-
- toWrite
- .apply(ParDo.of(new FormatForBigquery()))
- .apply(BigQueryIO.Write
- .to(tableRef)
- .withSchema(FormatForBigquery.getSchema())
- .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
- .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
- }
-
- // Run the pipeline.
- PipelineResult result = p.run();
-
- if (options.isStreaming() && !options.getInputFile().isEmpty()) {
- // Inject the data into the Pub/Sub topic with a Dataflow batch pipeline.
- dataflowUtils.runInjectorPipeline(options.getInputFile(), options.getPubsubTopic());
- }
-
- // dataflowUtils will try to cancel the pipeline and the injector before the program exists.
- dataflowUtils.waitToFinish(result);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/complete/README.md
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/complete/README.md b/examples/src/main/java/com/google/cloud/dataflow/examples/complete/README.md
deleted file mode 100644
index 5fba154..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/complete/README.md
+++ /dev/null
@@ -1,44 +0,0 @@
-
-# "Complete" Examples
-
-This directory contains end-to-end example pipelines that perform complex data processing tasks. They include:
-
-<ul>
- <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/complete/AutoComplete.java">AutoComplete</a>
- — An example that computes the most popular hash tags for every
- prefix, which can be used for auto-completion. Demonstrates how to use the
- same pipeline in both streaming and batch, combiners, and composite
- transforms.</li>
- <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/complete/StreamingWordExtract.java">StreamingWordExtract</a>
- — A streaming pipeline example that inputs lines of text from a Cloud
- Pub/Sub topic, splits each line into individual words, capitalizes those
- words, and writes the output to a BigQuery table.
- </li>
- <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TfIdf.java">TfIdf</a>
- — An example that computes a basic TF-IDF search table for a directory or
- Cloud Storage prefix. Demonstrates joining data, side inputs, and logging.
- </li>
- <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessions.java">TopWikipediaSessions</a>
- — An example that reads Wikipedia edit data from Cloud Storage and
- computes the user with the longest string of edits separated by no more than
- an hour within each month. Demonstrates using Cloud Dataflow
- <code>Windowing</code> to perform time-based aggregations of data.
- </li>
- <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficMaxLaneFlow.java">TrafficMaxLaneFlow</a>
- — A streaming Cloud Dataflow example using BigQuery output in the
- <code>traffic sensor</code> domain. Demonstrates the Cloud Dataflow streaming
- runner, sliding windows, Cloud Pub/Sub topic ingestion, the use of the
- <code>AvroCoder</code> to encode a custom class, and custom
- <code>Combine</code> transforms.
- </li>
- <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficRoutes.java">TrafficRoutes</a>
- — A streaming Cloud Dataflow example using BigQuery output in the
- <code>traffic sensor</code> domain. Demonstrates the Cloud Dataflow streaming
- runner, <code>GroupByKey</code>, keyed state, sliding windows, and Cloud
- Pub/Sub topic ingestion.
- </li>
- </ul>
-
-See the [documentation](https://cloud.google.com/dataflow/getting-started) and the [Examples
-README](../../../../../../../../../README.md) for
-information about how to run these examples.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/complete/StreamingWordExtract.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/complete/StreamingWordExtract.java b/examples/src/main/java/com/google/cloud/dataflow/examples/complete/StreamingWordExtract.java
deleted file mode 100644
index 99c5249..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/complete/StreamingWordExtract.java
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete;
-
-import com.google.api.services.bigquery.model.TableFieldSchema;
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.cloud.dataflow.examples.common.DataflowExampleUtils;
-import com.google.cloud.dataflow.examples.common.ExampleBigQueryTableOptions;
-import com.google.cloud.dataflow.examples.common.ExamplePubsubTopicOptions;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.PipelineResult;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.io.PubsubIO;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-
-import java.io.IOException;
-import java.util.ArrayList;
-
-/**
- * A streaming Dataflow Example using BigQuery output.
- *
- * <p>This pipeline example reads lines of text from a PubSub topic, splits each line
- * into individual words, capitalizes those words, and writes the output to
- * a BigQuery table.
- *
- * <p>By default, the example will run a separate pipeline to inject the data from the default
- * {@literal --inputFile} to the Pub/Sub {@literal --pubsubTopic}. It will make it available for
- * the streaming pipeline to process. You may override the default {@literal --inputFile} with the
- * file of your choosing. You may also set {@literal --inputFile} to an empty string, which will
- * disable the automatic Pub/Sub injection, and allow you to use separate tool to control the input
- * to this example.
- *
- * <p>The example is configured to use the default Pub/Sub topic and the default BigQuery table
- * from the example common package (there are no defaults for a general Dataflow pipeline).
- * You can override them by using the {@literal --pubsubTopic}, {@literal --bigQueryDataset}, and
- * {@literal --bigQueryTable} options. If the Pub/Sub topic or the BigQuery table do not exist,
- * the example will try to create them.
- *
- * <p>The example will try to cancel the pipelines on the signal to terminate the process (CTRL-C)
- * and then exits.
- */
-public class StreamingWordExtract {
-
- /** A DoFn that tokenizes lines of text into individual words. */
- static class ExtractWords extends DoFn<String, String> {
- @Override
- public void processElement(ProcessContext c) {
- String[] words = c.element().split("[^a-zA-Z']+");
- for (String word : words) {
- if (!word.isEmpty()) {
- c.output(word);
- }
- }
- }
- }
-
- /** A DoFn that uppercases a word. */
- static class Uppercase extends DoFn<String, String> {
- @Override
- public void processElement(ProcessContext c) {
- c.output(c.element().toUpperCase());
- }
- }
-
- /**
- * Converts strings into BigQuery rows.
- */
- static class StringToRowConverter extends DoFn<String, TableRow> {
- /**
- * In this example, put the whole string into single BigQuery field.
- */
- @Override
- public void processElement(ProcessContext c) {
- c.output(new TableRow().set("string_field", c.element()));
- }
-
- static TableSchema getSchema() {
- return new TableSchema().setFields(new ArrayList<TableFieldSchema>() {
- // Compose the list of TableFieldSchema from tableSchema.
- {
- add(new TableFieldSchema().setName("string_field").setType("STRING"));
- }
- });
- }
- }
-
- /**
- * Options supported by {@link StreamingWordExtract}.
- *
- * <p>Inherits standard configuration options.
- */
- private interface StreamingWordExtractOptions
- extends ExamplePubsubTopicOptions, ExampleBigQueryTableOptions {
- @Description("Input file to inject to Pub/Sub topic")
- @Default.String("gs://dataflow-samples/shakespeare/kinglear.txt")
- String getInputFile();
- void setInputFile(String value);
- }
-
- /**
- * Sets up and starts streaming pipeline.
- *
- * @throws IOException if there is a problem setting up resources
- */
- public static void main(String[] args) throws IOException {
- StreamingWordExtractOptions options = PipelineOptionsFactory.fromArgs(args)
- .withValidation()
- .as(StreamingWordExtractOptions.class);
- options.setStreaming(true);
- // In order to cancel the pipelines automatically,
- // {@literal DataflowPipelineRunner} is forced to be used.
- options.setRunner(DataflowPipelineRunner.class);
-
- options.setBigQuerySchema(StringToRowConverter.getSchema());
- DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options);
- dataflowUtils.setup();
-
- Pipeline pipeline = Pipeline.create(options);
-
- String tableSpec = new StringBuilder()
- .append(options.getProject()).append(":")
- .append(options.getBigQueryDataset()).append(".")
- .append(options.getBigQueryTable())
- .toString();
- pipeline
- .apply(PubsubIO.Read.topic(options.getPubsubTopic()))
- .apply(ParDo.of(new ExtractWords()))
- .apply(ParDo.of(new Uppercase()))
- .apply(ParDo.of(new StringToRowConverter()))
- .apply(BigQueryIO.Write.to(tableSpec)
- .withSchema(StringToRowConverter.getSchema()));
-
- PipelineResult result = pipeline.run();
-
- if (!options.getInputFile().isEmpty()) {
- // Inject the data into the Pub/Sub topic with a Dataflow batch pipeline.
- dataflowUtils.runInjectorPipeline(options.getInputFile(), options.getPubsubTopic());
- }
-
- // dataflowUtils will try to cancel the pipeline and the injector before the program exists.
- dataflowUtils.waitToFinish(result);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TfIdf.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TfIdf.java b/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TfIdf.java
deleted file mode 100644
index 65ac753..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TfIdf.java
+++ /dev/null
@@ -1,431 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.KvCoder;
-import com.google.cloud.dataflow.sdk.coders.StringDelegateCoder;
-import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.GcsOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.options.Validation;
-import com.google.cloud.dataflow.sdk.transforms.Count;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.Flatten;
-import com.google.cloud.dataflow.sdk.transforms.Keys;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.RemoveDuplicates;
-import com.google.cloud.dataflow.sdk.transforms.Values;
-import com.google.cloud.dataflow.sdk.transforms.View;
-import com.google.cloud.dataflow.sdk.transforms.WithKeys;
-import com.google.cloud.dataflow.sdk.transforms.join.CoGbkResult;
-import com.google.cloud.dataflow.sdk.transforms.join.CoGroupByKey;
-import com.google.cloud.dataflow.sdk.transforms.join.KeyedPCollectionTuple;
-import com.google.cloud.dataflow.sdk.util.GcsUtil;
-import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionList;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.PDone;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.io.IOException;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.util.HashSet;
-import java.util.Set;
-
-/**
- * An example that computes a basic TF-IDF search table for a directory or GCS prefix.
- *
- * <p>Concepts: joining data; side inputs; logging
- *
- * <p>To execute this pipeline locally, specify general pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * }</pre>
- * and a local output file or output prefix on GCS:
- * <pre>{@code
- * --output=[YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PREFIX]
- * }</pre>
- *
- * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowPipelineRunner
- * }</pre>
- * and an output prefix on GCS:
- * <pre>{@code
- * --output=gs://YOUR_OUTPUT_PREFIX
- * }</pre>
- *
- * <p>The default input is {@code gs://dataflow-samples/shakespeare/} and can be overridden with
- * {@code --input}.
- */
-public class TfIdf {
- /**
- * Options supported by {@link TfIdf}.
- *
- * <p>Inherits standard configuration options.
- */
- private static interface Options extends PipelineOptions {
- @Description("Path to the directory or GCS prefix containing files to read from")
- @Default.String("gs://dataflow-samples/shakespeare/")
- String getInput();
- void setInput(String value);
-
- @Description("Prefix of output URI to write to")
- @Validation.Required
- String getOutput();
- void setOutput(String value);
- }
-
- /**
- * Lists documents contained beneath the {@code options.input} prefix/directory.
- */
- public static Set<URI> listInputDocuments(Options options)
- throws URISyntaxException, IOException {
- URI baseUri = new URI(options.getInput());
-
- // List all documents in the directory or GCS prefix.
- URI absoluteUri;
- if (baseUri.getScheme() != null) {
- absoluteUri = baseUri;
- } else {
- absoluteUri = new URI(
- "file",
- baseUri.getAuthority(),
- baseUri.getPath(),
- baseUri.getQuery(),
- baseUri.getFragment());
- }
-
- Set<URI> uris = new HashSet<>();
- if (absoluteUri.getScheme().equals("file")) {
- File directory = new File(absoluteUri);
- for (String entry : directory.list()) {
- File path = new File(directory, entry);
- uris.add(path.toURI());
- }
- } else if (absoluteUri.getScheme().equals("gs")) {
- GcsUtil gcsUtil = options.as(GcsOptions.class).getGcsUtil();
- URI gcsUriGlob = new URI(
- absoluteUri.getScheme(),
- absoluteUri.getAuthority(),
- absoluteUri.getPath() + "*",
- absoluteUri.getQuery(),
- absoluteUri.getFragment());
- for (GcsPath entry : gcsUtil.expand(GcsPath.fromUri(gcsUriGlob))) {
- uris.add(entry.toUri());
- }
- }
-
- return uris;
- }
-
- /**
- * Reads the documents at the provided uris and returns all lines
- * from the documents tagged with which document they are from.
- */
- public static class ReadDocuments
- extends PTransform<PInput, PCollection<KV<URI, String>>> {
- private Iterable<URI> uris;
-
- public ReadDocuments(Iterable<URI> uris) {
- this.uris = uris;
- }
-
- @Override
- public Coder<?> getDefaultOutputCoder() {
- return KvCoder.of(StringDelegateCoder.of(URI.class), StringUtf8Coder.of());
- }
-
- @Override
- public PCollection<KV<URI, String>> apply(PInput input) {
- Pipeline pipeline = input.getPipeline();
-
- // Create one TextIO.Read transform for each document
- // and add its output to a PCollectionList
- PCollectionList<KV<URI, String>> urisToLines =
- PCollectionList.empty(pipeline);
-
- // TextIO.Read supports:
- // - file: URIs and paths locally
- // - gs: URIs on the service
- for (final URI uri : uris) {
- String uriString;
- if (uri.getScheme().equals("file")) {
- uriString = new File(uri).getPath();
- } else {
- uriString = uri.toString();
- }
-
- PCollection<KV<URI, String>> oneUriToLines = pipeline
- .apply(TextIO.Read.from(uriString)
- .named("TextIO.Read(" + uriString + ")"))
- .apply("WithKeys(" + uriString + ")", WithKeys.<URI, String>of(uri));
-
- urisToLines = urisToLines.and(oneUriToLines);
- }
-
- return urisToLines.apply(Flatten.<KV<URI, String>>pCollections());
- }
- }
-
- /**
- * A transform containing a basic TF-IDF pipeline. The input consists of KV objects
- * where the key is the document's URI and the value is a piece
- * of the document's content. The output is a mapping from terms to
- * scores for each document URI.
- */
- public static class ComputeTfIdf
- extends PTransform<PCollection<KV<URI, String>>, PCollection<KV<String, KV<URI, Double>>>> {
- public ComputeTfIdf() { }
-
- @Override
- public PCollection<KV<String, KV<URI, Double>>> apply(
- PCollection<KV<URI, String>> uriToContent) {
-
- // Compute the total number of documents, and
- // prepare this singleton PCollectionView for
- // use as a side input.
- final PCollectionView<Long> totalDocuments =
- uriToContent
- .apply("GetURIs", Keys.<URI>create())
- .apply("RemoveDuplicateDocs", RemoveDuplicates.<URI>create())
- .apply(Count.<URI>globally())
- .apply(View.<Long>asSingleton());
-
- // Create a collection of pairs mapping a URI to each
- // of the words in the document associated with that URI.
- PCollection<KV<URI, String>> uriToWords = uriToContent
- .apply(ParDo.named("SplitWords").of(
- new DoFn<KV<URI, String>, KV<URI, String>>() {
- @Override
- public void processElement(ProcessContext c) {
- URI uri = c.element().getKey();
- String line = c.element().getValue();
- for (String word : line.split("\\W+")) {
- // Log INFO messages when the word “love” is found.
- if (word.toLowerCase().equals("love")) {
- LOG.info("Found {}", word.toLowerCase());
- }
-
- if (!word.isEmpty()) {
- c.output(KV.of(uri, word.toLowerCase()));
- }
- }
- }
- }));
-
- // Compute a mapping from each word to the total
- // number of documents in which it appears.
- PCollection<KV<String, Long>> wordToDocCount = uriToWords
- .apply("RemoveDuplicateWords", RemoveDuplicates.<KV<URI, String>>create())
- .apply(Values.<String>create())
- .apply("CountDocs", Count.<String>perElement());
-
- // Compute a mapping from each URI to the total
- // number of words in the document associated with that URI.
- PCollection<KV<URI, Long>> uriToWordTotal = uriToWords
- .apply("GetURIs2", Keys.<URI>create())
- .apply("CountWords", Count.<URI>perElement());
-
- // Count, for each (URI, word) pair, the number of
- // occurrences of that word in the document associated
- // with the URI.
- PCollection<KV<KV<URI, String>, Long>> uriAndWordToCount = uriToWords
- .apply("CountWordDocPairs", Count.<KV<URI, String>>perElement());
-
- // Adjust the above collection to a mapping from
- // (URI, word) pairs to counts into an isomorphic mapping
- // from URI to (word, count) pairs, to prepare for a join
- // by the URI key.
- PCollection<KV<URI, KV<String, Long>>> uriToWordAndCount = uriAndWordToCount
- .apply(ParDo.named("ShiftKeys").of(
- new DoFn<KV<KV<URI, String>, Long>, KV<URI, KV<String, Long>>>() {
- @Override
- public void processElement(ProcessContext c) {
- URI uri = c.element().getKey().getKey();
- String word = c.element().getKey().getValue();
- Long occurrences = c.element().getValue();
- c.output(KV.of(uri, KV.of(word, occurrences)));
- }
- }));
-
- // Prepare to join the mapping of URI to (word, count) pairs with
- // the mapping of URI to total word counts, by associating
- // each of the input PCollection<KV<URI, ...>> with
- // a tuple tag. Each input must have the same key type, URI
- // in this case. The type parameter of the tuple tag matches
- // the types of the values for each collection.
- final TupleTag<Long> wordTotalsTag = new TupleTag<Long>();
- final TupleTag<KV<String, Long>> wordCountsTag = new TupleTag<KV<String, Long>>();
- KeyedPCollectionTuple<URI> coGbkInput = KeyedPCollectionTuple
- .of(wordTotalsTag, uriToWordTotal)
- .and(wordCountsTag, uriToWordAndCount);
-
- // Perform a CoGroupByKey (a sort of pre-join) on the prepared
- // inputs. This yields a mapping from URI to a CoGbkResult
- // (CoGroupByKey Result). The CoGbkResult is a mapping
- // from the above tuple tags to the values in each input
- // associated with a particular URI. In this case, each
- // KV<URI, CoGbkResult> groups a URI with the total number of
- // words in that document as well as all the (word, count)
- // pairs for particular words.
- PCollection<KV<URI, CoGbkResult>> uriToWordAndCountAndTotal = coGbkInput
- .apply("CoGroupByUri", CoGroupByKey.<URI>create());
-
- // Compute a mapping from each word to a (URI, term frequency)
- // pair for each URI. A word's term frequency for a document
- // is simply the number of times that word occurs in the document
- // divided by the total number of words in the document.
- PCollection<KV<String, KV<URI, Double>>> wordToUriAndTf = uriToWordAndCountAndTotal
- .apply(ParDo.named("ComputeTermFrequencies").of(
- new DoFn<KV<URI, CoGbkResult>, KV<String, KV<URI, Double>>>() {
- @Override
- public void processElement(ProcessContext c) {
- URI uri = c.element().getKey();
- Long wordTotal = c.element().getValue().getOnly(wordTotalsTag);
-
- for (KV<String, Long> wordAndCount
- : c.element().getValue().getAll(wordCountsTag)) {
- String word = wordAndCount.getKey();
- Long wordCount = wordAndCount.getValue();
- Double termFrequency = wordCount.doubleValue() / wordTotal.doubleValue();
- c.output(KV.of(word, KV.of(uri, termFrequency)));
- }
- }
- }));
-
- // Compute a mapping from each word to its document frequency.
- // A word's document frequency in a corpus is the number of
- // documents in which the word appears divided by the total
- // number of documents in the corpus. Note how the total number of
- // documents is passed as a side input; the same value is
- // presented to each invocation of the DoFn.
- PCollection<KV<String, Double>> wordToDf = wordToDocCount
- .apply(ParDo
- .named("ComputeDocFrequencies")
- .withSideInputs(totalDocuments)
- .of(new DoFn<KV<String, Long>, KV<String, Double>>() {
- @Override
- public void processElement(ProcessContext c) {
- String word = c.element().getKey();
- Long documentCount = c.element().getValue();
- Long documentTotal = c.sideInput(totalDocuments);
- Double documentFrequency = documentCount.doubleValue()
- / documentTotal.doubleValue();
-
- c.output(KV.of(word, documentFrequency));
- }
- }));
-
- // Join the term frequency and document frequency
- // collections, each keyed on the word.
- final TupleTag<KV<URI, Double>> tfTag = new TupleTag<KV<URI, Double>>();
- final TupleTag<Double> dfTag = new TupleTag<Double>();
- PCollection<KV<String, CoGbkResult>> wordToUriAndTfAndDf = KeyedPCollectionTuple
- .of(tfTag, wordToUriAndTf)
- .and(dfTag, wordToDf)
- .apply(CoGroupByKey.<String>create());
-
- // Compute a mapping from each word to a (URI, TF-IDF) score
- // for each URI. There are a variety of definitions of TF-IDF
- // ("term frequency - inverse document frequency") score;
- // here we use a basic version that is the term frequency
- // multiplied by the log of the inverse document frequency (1 / df).
- PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf = wordToUriAndTfAndDf
- .apply(ParDo.named("ComputeTfIdf").of(
- new DoFn<KV<String, CoGbkResult>, KV<String, KV<URI, Double>>>() {
- @Override
- public void processElement(ProcessContext c) {
- String word = c.element().getKey();
- Double df = c.element().getValue().getOnly(dfTag);
-
- for (KV<URI, Double> uriAndTf : c.element().getValue().getAll(tfTag)) {
- URI uri = uriAndTf.getKey();
- Double tf = uriAndTf.getValue();
- Double tfIdf = tf * Math.log(1 / df);
- c.output(KV.of(word, KV.of(uri, tfIdf)));
- }
- }
- }));
-
- return wordToUriAndTfIdf;
- }
-
- // Instantiate Logger.
- // It is suggested that the user specify the class name of the containing class
- // (in this case ComputeTfIdf).
- private static final Logger LOG = LoggerFactory.getLogger(ComputeTfIdf.class);
- }
-
- /**
- * A {@link PTransform} to write, in CSV format, a mapping from term and URI
- * to score.
- */
- public static class WriteTfIdf
- extends PTransform<PCollection<KV<String, KV<URI, Double>>>, PDone> {
- private String output;
-
- public WriteTfIdf(String output) {
- this.output = output;
- }
-
- @Override
- public PDone apply(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
- return wordToUriAndTfIdf
- .apply(ParDo.named("Format").of(new DoFn<KV<String, KV<URI, Double>>, String>() {
- @Override
- public void processElement(ProcessContext c) {
- c.output(String.format("%s,\t%s,\t%f",
- c.element().getKey(),
- c.element().getValue().getKey(),
- c.element().getValue().getValue()));
- }
- }))
- .apply(TextIO.Write
- .to(output)
- .withSuffix(".csv"));
- }
- }
-
- public static void main(String[] args) throws Exception {
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
- Pipeline pipeline = Pipeline.create(options);
- pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));
-
- pipeline
- .apply(new ReadDocuments(listInputDocuments(options)))
- .apply(new ComputeTfIdf())
- .apply(new WriteTfIdf(options.getOutput()));
-
- pipeline.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessions.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessions.java b/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessions.java
deleted file mode 100644
index c57a5f2..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessions.java
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete;
-
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.TableRowJsonCoder;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.options.Validation;
-import com.google.cloud.dataflow.sdk.transforms.Count;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.DoFn.RequiresWindowAccess;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.SerializableComparator;
-import com.google.cloud.dataflow.sdk.transforms.Top;
-import com.google.cloud.dataflow.sdk.transforms.windowing.CalendarWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.IntervalWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Sessions;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-
-import java.util.List;
-
-/**
- * An example that reads Wikipedia edit data from Cloud Storage and computes the user with
- * the longest string of edits separated by no more than an hour within each month.
- *
- * <p>Concepts: Using Windowing to perform time-based aggregations of data.
- *
- * <p>It is not recommended to execute this pipeline locally, given the size of the default input
- * data.
- *
- * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowPipelineRunner
- * }
- * </pre>
- * and an output prefix on GCS:
- * <pre>{@code
- * --output=gs://YOUR_OUTPUT_PREFIX
- * }</pre>
- *
- * <p>The default input is {@code gs://dataflow-samples/wikipedia_edits/*.json} and can be
- * overridden with {@code --input}.
- *
- * <p>The input for this example is large enough that it's a good place to enable (experimental)
- * autoscaling:
- * <pre>{@code
- * --autoscalingAlgorithm=BASIC
- * --maxNumWorkers=20
- * }
- * </pre>
- * This will automatically scale the number of workers up over time until the job completes.
- */
-public class TopWikipediaSessions {
- private static final String EXPORTED_WIKI_TABLE = "gs://dataflow-samples/wikipedia_edits/*.json";
-
- /**
- * Extracts user and timestamp from a TableRow representing a Wikipedia edit.
- */
- static class ExtractUserAndTimestamp extends DoFn<TableRow, String> {
- @Override
- public void processElement(ProcessContext c) {
- TableRow row = c.element();
- int timestamp = (Integer) row.get("timestamp");
- String userName = (String) row.get("contributor_username");
- if (userName != null) {
- // Sets the implicit timestamp field to be used in windowing.
- c.outputWithTimestamp(userName, new Instant(timestamp * 1000L));
- }
- }
- }
-
- /**
- * Computes the number of edits in each user session. A session is defined as
- * a string of edits where each is separated from the next by less than an hour.
- */
- static class ComputeSessions
- extends PTransform<PCollection<String>, PCollection<KV<String, Long>>> {
- @Override
- public PCollection<KV<String, Long>> apply(PCollection<String> actions) {
- return actions
- .apply(Window.<String>into(Sessions.withGapDuration(Duration.standardHours(1))))
-
- .apply(Count.<String>perElement());
- }
- }
-
- /**
- * Computes the longest session ending in each month.
- */
- private static class TopPerMonth
- extends PTransform<PCollection<KV<String, Long>>, PCollection<List<KV<String, Long>>>> {
- @Override
- public PCollection<List<KV<String, Long>>> apply(PCollection<KV<String, Long>> sessions) {
- return sessions
- .apply(Window.<KV<String, Long>>into(CalendarWindows.months(1)))
-
- .apply(Top.of(1, new SerializableComparator<KV<String, Long>>() {
- @Override
- public int compare(KV<String, Long> o1, KV<String, Long> o2) {
- return Long.compare(o1.getValue(), o2.getValue());
- }
- }).withoutDefaults());
- }
- }
-
- static class SessionsToStringsDoFn extends DoFn<KV<String, Long>, KV<String, Long>>
- implements RequiresWindowAccess {
-
- @Override
- public void processElement(ProcessContext c) {
- c.output(KV.of(
- c.element().getKey() + " : " + c.window(), c.element().getValue()));
- }
- }
-
- static class FormatOutputDoFn extends DoFn<List<KV<String, Long>>, String>
- implements RequiresWindowAccess {
- @Override
- public void processElement(ProcessContext c) {
- for (KV<String, Long> item : c.element()) {
- String session = item.getKey();
- long count = item.getValue();
- c.output(session + " : " + count + " : " + ((IntervalWindow) c.window()).start());
- }
- }
- }
-
- static class ComputeTopSessions extends PTransform<PCollection<TableRow>, PCollection<String>> {
-
- private final double samplingThreshold;
-
- public ComputeTopSessions(double samplingThreshold) {
- this.samplingThreshold = samplingThreshold;
- }
-
- @Override
- public PCollection<String> apply(PCollection<TableRow> input) {
- return input
- .apply(ParDo.of(new ExtractUserAndTimestamp()))
-
- .apply(ParDo.named("SampleUsers").of(
- new DoFn<String, String>() {
- @Override
- public void processElement(ProcessContext c) {
- if (Math.abs(c.element().hashCode()) <= Integer.MAX_VALUE * samplingThreshold) {
- c.output(c.element());
- }
- }
- }))
-
- .apply(new ComputeSessions())
-
- .apply(ParDo.named("SessionsToStrings").of(new SessionsToStringsDoFn()))
- .apply(new TopPerMonth())
- .apply(ParDo.named("FormatOutput").of(new FormatOutputDoFn()));
- }
- }
-
- /**
- * Options supported by this class.
- *
- * <p>Inherits standard Dataflow configuration options.
- */
- private static interface Options extends PipelineOptions {
- @Description(
- "Input specified as a GCS path containing a BigQuery table exported as json")
- @Default.String(EXPORTED_WIKI_TABLE)
- String getInput();
- void setInput(String value);
-
- @Description("File to output results to")
- @Validation.Required
- String getOutput();
- void setOutput(String value);
- }
-
- public static void main(String[] args) {
- Options options = PipelineOptionsFactory.fromArgs(args)
- .withValidation()
- .as(Options.class);
- DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
-
- Pipeline p = Pipeline.create(dataflowOptions);
-
- double samplingThreshold = 0.1;
-
- p.apply(TextIO.Read
- .from(options.getInput())
- .withCoder(TableRowJsonCoder.of()))
- .apply(new ComputeTopSessions(samplingThreshold))
- .apply(TextIO.Write.named("Write").withoutSharding().to(options.getOutput()));
-
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficMaxLaneFlow.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficMaxLaneFlow.java b/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficMaxLaneFlow.java
deleted file mode 100644
index 2d54252..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/complete/TrafficMaxLaneFlow.java
+++ /dev/null
@@ -1,425 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete;
-
-import com.google.api.services.bigquery.model.TableFieldSchema;
-import com.google.api.services.bigquery.model.TableReference;
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.cloud.dataflow.examples.common.DataflowExampleOptions;
-import com.google.cloud.dataflow.examples.common.DataflowExampleUtils;
-import com.google.cloud.dataflow.examples.common.ExampleBigQueryTableOptions;
-import com.google.cloud.dataflow.examples.common.ExamplePubsubTopicAndSubscriptionOptions;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.PipelineResult;
-import com.google.cloud.dataflow.sdk.coders.AvroCoder;
-import com.google.cloud.dataflow.sdk.coders.DefaultCoder;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.io.PubsubIO;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.transforms.Combine;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
-import com.google.cloud.dataflow.sdk.transforms.windowing.SlidingWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PBegin;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.common.base.Strings;
-
-import org.apache.avro.reflect.Nullable;
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-import org.joda.time.format.DateTimeFormat;
-import org.joda.time.format.DateTimeFormatter;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * A Dataflow Example that runs in both batch and streaming modes with traffic sensor data.
- * You can configure the running mode by setting {@literal --streaming} to true or false.
- *
- * <p>Concepts: The batch and streaming runners, sliding windows, Google Cloud Pub/Sub
- * topic injection, use of the AvroCoder to encode a custom class, and custom Combine transforms.
- *
- * <p>This example analyzes traffic sensor data using SlidingWindows. For each window,
- * it finds the lane that had the highest flow recorded, for each sensor station. It writes
- * those max values along with auxiliary info to a BigQuery table.
- *
- * <p>In batch mode, the pipeline reads traffic sensor data from {@literal --inputFile}.
- *
- * <p>In streaming mode, the pipeline reads the data from a Pub/Sub topic.
- * By default, the example will run a separate pipeline to inject the data from the default
- * {@literal --inputFile} to the Pub/Sub {@literal --pubsubTopic}. It will make it available for
- * the streaming pipeline to process. You may override the default {@literal --inputFile} with the
- * file of your choosing. You may also set {@literal --inputFile} to an empty string, which will
- * disable the automatic Pub/Sub injection and allow you to use a separate tool to control the
- * input to this example. Example code that publishes traffic sensor data to a Pub/Sub topic
- * is provided in
- * <a href="https://github.com/GoogleCloudPlatform/cloud-pubsub-samples-python/tree/master/gce-cmdline-publisher">cloud-pubsub-samples-python/gce-cmdline-publisher</a>.
- *
- * <p>The example is configured to use the default Pub/Sub topic and the default BigQuery table
- * from the example common package (there are no defaults for a general Dataflow pipeline).
- * You can override them by using the {@literal --pubsubTopic}, {@literal --bigQueryDataset}, and
- * {@literal --bigQueryTable} options. If the Pub/Sub topic or the BigQuery table do not exist,
- * the example will try to create them.
- *
- * <p>The example will try to cancel the pipelines on the signal to terminate the process (CTRL-C)
- * and then exit.
- */
-public class TrafficMaxLaneFlow {
-
- private static final String PUBSUB_TIMESTAMP_LABEL_KEY = "timestamp_ms";
- private static final Integer VALID_INPUTS = 4999;
-
- static final int WINDOW_DURATION = 60; // Default sliding window duration in minutes
- static final int WINDOW_SLIDE_EVERY = 5; // Default window 'slide every' setting in minutes
-
- /**
- * This class holds information about each lane in a station reading, along with some general
- * information from the reading.
- */
- @DefaultCoder(AvroCoder.class)
- static class LaneInfo {
- @Nullable String stationId;
- @Nullable String lane;
- @Nullable String direction;
- @Nullable String freeway;
- @Nullable String recordedTimestamp;
- @Nullable Integer laneFlow;
- @Nullable Integer totalFlow;
- @Nullable Double laneAO;
- @Nullable Double laneAS;
-
- public LaneInfo() {}
-
- public LaneInfo(String stationId, String lane, String direction, String freeway,
- String timestamp, Integer laneFlow, Double laneAO,
- Double laneAS, Integer totalFlow) {
- this.stationId = stationId;
- this.lane = lane;
- this.direction = direction;
- this.freeway = freeway;
- this.recordedTimestamp = timestamp;
- this.laneFlow = laneFlow;
- this.laneAO = laneAO;
- this.laneAS = laneAS;
- this.totalFlow = totalFlow;
- }
-
- public String getStationId() {
- return this.stationId;
- }
- public String getLane() {
- return this.lane;
- }
- public String getDirection() {
- return this.direction;
- }
- public String getFreeway() {
- return this.freeway;
- }
- public String getRecordedTimestamp() {
- return this.recordedTimestamp;
- }
- public Integer getLaneFlow() {
- return this.laneFlow;
- }
- public Double getLaneAO() {
- return this.laneAO;
- }
- public Double getLaneAS() {
- return this.laneAS;
- }
- public Integer getTotalFlow() {
- return this.totalFlow;
- }
- }
-
-  /**
-   * Re-emits each input line with its element timestamp taken from the line's first
-   * comma-separated field, which is parsed with the pattern {@code MM/dd/yyyy HH:mm:ss}.
-   * Lines for which that fails with an {@link IllegalArgumentException} are dropped.
-   */
-  static class ExtractTimestamps extends DoFn<String, String> {
-    private static final DateTimeFormatter dateTimeFormat =
-        DateTimeFormat.forPattern("MM/dd/yyyy HH:mm:ss");
-
-    @Override
-    public void processElement(DoFn<String, String>.ProcessContext c) throws Exception {
-      String line = c.element();
-      String[] fields = line.split(",");
-      if (fields.length == 0) {
-        // split() can return an empty array (for example when the line holds only commas),
-        // in which case there is no first field to read.
-        return;
-      }
-      try {
-        long millis = dateTimeFormat.parseMillis(fields[0]);
-        // Emitting stays inside the try block so that an IllegalArgumentException raised
-        // while outputting is handled the same way as a parse failure.
-        c.outputWithTimestamp(line, new Instant(millis));
-      } catch (IllegalArgumentException e) {
-        // Skip the invalid input.
-      }
-    }
-  }
-
-  /**
-   * Extract flow information for each of the 8 lanes in a reading, and output as separate tuples.
-   * This will let us determine which lane has the max flow for that station over the span of the
-   * window, and output not only the max flow from that calculation, but other associated
-   * information. The number of lanes for which data is present depends upon which freeway the data
-   * point comes from.
-   *
-   * <p>Each output is keyed by the station id. Lines with too few fields are skipped, and
-   * processing of a line stops at the first lane whose flow, occupancy or speed cannot be parsed.
-   */
-  static class ExtractFlowInfoFn extends DoFn<String, KV<String, LaneInfo>> {
-    /** Number of lanes read from one line. */
-    private static final int LANE_COUNT = 8;
-
-    /**
-     * Fewest comma-separated fields a line must have. Lane {@code i} (1-based) is read from
-     * indices {@code 6 + 5i}, {@code 7 + 5i} and {@code 8 + 5i}, so the last lane ends at index
-     * {@code 8 + 5 * LANE_COUNT} and the line needs one more field than that index.
-     */
-    private static final int MIN_FIELD_COUNT = 8 + 5 * LANE_COUNT + 1;
-
-    @Override
-    public void processElement(ProcessContext c) {
-      String[] items = c.element().split(",");
-      if (items.length < MIN_FIELD_COUNT) {
-        // Skip the invalid input. Checking against the highest index actually read keeps a
-        // short line from failing with ArrayIndexOutOfBoundsException on the last lane.
-        return;
-      }
-      // extract the sensor information for the lanes from the input string fields.
-      String timestamp = items[0];
-      String stationId = items[1];
-      String freeway = items[2];
-      String direction = items[3];
-      Integer totalFlow = tryIntParse(items[7]);
-      for (int i = 1; i <= LANE_COUNT; ++i) {
-        int laneBase = 6 + 5 * i;
-        Integer laneFlow = tryIntParse(items[laneBase]);
-        Double laneAvgOccupancy = tryDoubleParse(items[laneBase + 1]);
-        Double laneAvgSpeed = tryDoubleParse(items[laneBase + 2]);
-        if (laneFlow == null || laneAvgOccupancy == null || laneAvgSpeed == null) {
-          // Stop at the first lane without usable data; later lanes are not emitted.
-          return;
-        }
-        LaneInfo laneInfo = new LaneInfo(stationId, "lane" + i, direction, freeway, timestamp,
-            laneFlow, laneAvgOccupancy, laneAvgSpeed, totalFlow);
-        c.output(KV.of(stationId, laneInfo));
-      }
-    }
-  }
-
- /**
- * A custom 'combine function' used with the Combine.perKey transform. Used to find the max lane
- * flow over all the data points in the Window. Extracts the lane flow from the input string and
- * determines whether it's the max seen so far. We're using a custom combiner instead of the Max
- * transform because we want to retain the additional information we've associated with the flow
- * value.
- */
- public static class MaxFlow implements SerializableFunction<Iterable<LaneInfo>, LaneInfo> {
- @Override
- public LaneInfo apply(Iterable<LaneInfo> input) {
- Integer max = 0;
- LaneInfo maxInfo = new LaneInfo();
- for (LaneInfo item : input) {
- Integer flow = item.getLaneFlow();
- if (flow != null && (flow >= max)) {
- max = flow;
- maxInfo = item;
- }
- }
- return maxInfo;
- }
- }
-
- /**
- * Format the results of the Max Lane flow calculation to a TableRow, to save to BigQuery.
- * Add the timestamp from the window context.
- */
- static class FormatMaxesFn extends DoFn<KV<String, LaneInfo>, TableRow> {
- @Override
- public void processElement(ProcessContext c) {
-
- LaneInfo laneInfo = c.element().getValue();
- TableRow row = new TableRow()
- .set("station_id", c.element().getKey())
- .set("direction", laneInfo.getDirection())
- .set("freeway", laneInfo.getFreeway())
- .set("lane_max_flow", laneInfo.getLaneFlow())
- .set("lane", laneInfo.getLane())
- .set("avg_occ", laneInfo.getLaneAO())
- .set("avg_speed", laneInfo.getLaneAS())
- .set("total_flow", laneInfo.getTotalFlow())
- .set("recorded_timestamp", laneInfo.getRecordedTimestamp())
- .set("window_timestamp", c.timestamp().toString());
- c.output(row);
- }
-
- /** Defines the BigQuery schema used for the output. */
- static TableSchema getSchema() {
- List<TableFieldSchema> fields = new ArrayList<>();
- fields.add(new TableFieldSchema().setName("station_id").setType("STRING"));
- fields.add(new TableFieldSchema().setName("direction").setType("STRING"));
- fields.add(new TableFieldSchema().setName("freeway").setType("STRING"));
- fields.add(new TableFieldSchema().setName("lane_max_flow").setType("INTEGER"));
- fields.add(new TableFieldSchema().setName("lane").setType("STRING"));
- fields.add(new TableFieldSchema().setName("avg_occ").setType("FLOAT"));
- fields.add(new TableFieldSchema().setName("avg_speed").setType("FLOAT"));
- fields.add(new TableFieldSchema().setName("total_flow").setType("INTEGER"));
- fields.add(new TableFieldSchema().setName("window_timestamp").setType("TIMESTAMP"));
- fields.add(new TableFieldSchema().setName("recorded_timestamp").setType("STRING"));
- TableSchema schema = new TableSchema().setFields(fields);
- return schema;
- }
- }
-
-  /**
-   * Composite transform over keyed lane readings: combines the values of each station with
-   * {@link MaxFlow} to keep the reading with the largest lane flow, then converts every result
-   * to a BigQuery row with {@link FormatMaxesFn}.
-   */
-  static class MaxLaneFlow
-      extends PTransform<PCollection<KV<String, LaneInfo>>, PCollection<TableRow>> {
-    @Override
-    public PCollection<TableRow> apply(PCollection<KV<String, LaneInfo>> flowInfo) {
-      return flowInfo
-          // stationId, LaneInfo => stationId + max lane flow info
-          .apply(Combine.<String, LaneInfo>perKey(new MaxFlow()))
-          // <stationId, max lane flow info>... => row...
-          .apply(ParDo.of(new FormatMaxesFn()));
-    }
-  }
-
-  /**
-   * Root transform that reads the lines of a text file and passes them through
-   * {@link ExtractTimestamps}.
-   */
-  static class ReadFileAndExtractTimestamps extends PTransform<PBegin, PCollection<String>> {
-    private final String inputFile;
-
-    /**
-     * @param inputFile location handed to {@code TextIO.Read.from}
-     */
-    public ReadFileAndExtractTimestamps(String inputFile) {
-      this.inputFile = inputFile;
-    }
-
-    @Override
-    public PCollection<String> apply(PBegin begin) {
-      PCollection<String> lines = begin.apply(TextIO.Read.from(inputFile));
-      return lines.apply(ParDo.of(new ExtractTimestamps()));
-    }
-  }
-
- /**
- * Options supported by {@link TrafficMaxLaneFlow}.
- *
- * <p>Inherits standard configuration options.
- *
- * <p>Adds four options on top of the inherited ones:
- * <ul>
- * <li>{@code inputFile}: file whose lines {@code main} injects into the Pub/Sub topic when
- * both this option and the topic are non-empty.
- * <li>{@code windowDuration} and {@code windowSlideEvery}: size and period, in minutes, of the
- * sliding windows applied by {@code main}.
- * <li>{@code unbounded}: when {@code false}, {@code main} caps the Pub/Sub read with
- * {@code maxNumRecords}.
- * </ul>
- */
- private interface TrafficMaxLaneFlowOptions extends DataflowExampleOptions,
- ExamplePubsubTopicAndSubscriptionOptions, ExampleBigQueryTableOptions {
- @Description("Input file to inject to Pub/Sub topic")
- @Default.String("gs://dataflow-samples/traffic_sensor/"
- + "Freeways-5Minaa2010-01-01_to_2010-02-15_test2.csv")
- String getInputFile();
- void setInputFile(String value);
-
- @Description("Numeric value of sliding window duration, in minutes")
- @Default.Integer(WINDOW_DURATION) // constant declared elsewhere in the enclosing class
- Integer getWindowDuration();
- void setWindowDuration(Integer value);
-
- @Description("Numeric value of window 'slide every' setting, in minutes")
- @Default.Integer(WINDOW_SLIDE_EVERY) // constant declared elsewhere in the enclosing class
- Integer getWindowSlideEvery();
- void setWindowSlideEvery(Integer value);
-
- @Description("Whether to run the pipeline with unbounded input")
- @Default.Boolean(false)
- boolean isUnbounded();
- void setUnbounded(boolean value);
- }
-
- /**
- * Sets up and starts streaming pipeline.
- *
- * @throws IOException if there is a problem setting up resources
- */
- public static void main(String[] args) throws IOException {
- TrafficMaxLaneFlowOptions options = PipelineOptionsFactory.fromArgs(args)
- .withValidation()
- .as(TrafficMaxLaneFlowOptions.class);
- options.setBigQuerySchema(FormatMaxesFn.getSchema());
- // Using DataflowExampleUtils to set up required resources.
- DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options, options.isUnbounded());
-
- Pipeline pipeline = Pipeline.create(options);
- TableReference tableRef = new TableReference(); // destination BigQuery table, built from the options
- tableRef.setProjectId(options.getProject());
- tableRef.setDatasetId(options.getBigQueryDataset());
- tableRef.setTableId(options.getBigQueryTable());
-
- PCollection<String> input;
- if (options.isUnbounded()) {
- // Read from the Pub/Sub subscription without a record limit (unbounded input).
- input = pipeline.apply(PubsubIO.Read
- .timestampLabel(PUBSUB_TIMESTAMP_LABEL_KEY)
- .subscription(options.getPubsubSubscription()));
- } else {
- // Read from the same Pub/Sub subscription, but stop after VALID_INPUTS records (bounded input).
- input = pipeline.apply(PubsubIO.Read
- .timestampLabel(PUBSUB_TIMESTAMP_LABEL_KEY)
- .subscription(options.getPubsubSubscription()).maxNumRecords(VALID_INPUTS));
-
- // To read bounded TextIO files, use:
- // input = pipeline.apply(new ReadFileAndExtractTimestamps(options.getInputFile()));
- }
- input
- // row... => <stationId, LaneInfo> ... (one output per lane of the reading)
- .apply(ParDo.of(new ExtractFlowInfoFn()))
- // map the incoming data stream into sliding windows. The default window duration values
- // work well if you're running the accompanying Pub/Sub generator script with the
- // --replay flag, which simulates pauses in the sensor data publication. You may want to
- // adjust them otherwise.
- .apply(Window.<KV<String, LaneInfo>>into(SlidingWindows.of(
- Duration.standardMinutes(options.getWindowDuration())).
- every(Duration.standardMinutes(options.getWindowSlideEvery()))))
- .apply(new MaxLaneFlow())
- .apply(BigQueryIO.Write.to(tableRef)
- .withSchema(FormatMaxesFn.getSchema()));
-
- // Inject the data into the Pub/Sub topic with a Dataflow batch pipeline.
- if (!Strings.isNullOrEmpty(options.getInputFile())
- && !Strings.isNullOrEmpty(options.getPubsubTopic())) {
- dataflowUtils.runInjectorPipeline(
- new ReadFileAndExtractTimestamps(options.getInputFile()),
- options.getPubsubTopic(),
- PUBSUB_TIMESTAMP_LABEL_KEY);
- }
-
- // Run the pipeline.
- PipelineResult result = pipeline.run();
-
- // dataflowUtils will try to cancel the pipeline and the injector before the program exits.
- dataflowUtils.waitToFinish(result);
- }
-
-  /**
-   * Parses {@code number} as a decimal integer.
-   *
-   * @return the parsed value, or {@code null} if {@code number} is not a valid integer
-   */
-  private static Integer tryIntParse(String number) {
-    Integer parsed;
-    try {
-      parsed = Integer.valueOf(number);
-    } catch (NumberFormatException ignored) {
-      // Not a number: report it as "no value" instead of failing.
-      parsed = null;
-    }
-    return parsed;
-  }
-
-  /**
-   * Parses {@code number} as a double.
-   *
-   * @return the parsed value, or {@code null} if {@code number} is {@code null} or is not a
-   *     valid double
-   */
-  private static Double tryDoubleParse(String number) {
-    if (number == null) {
-      // Double.parseDouble throws NullPointerException (not NumberFormatException) for null,
-      // so handle it here to keep the "null on bad input" contract.
-      return null;
-    }
-    try {
-      return Double.parseDouble(number);
-    } catch (NumberFormatException e) {
-      return null;
-    }
-  }
-}
[22/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/DoFnWithContext.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/DoFnWithContext.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/DoFnWithContext.java
deleted file mode 100644
index 4f131ad..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/DoFnWithContext.java
+++ /dev/null
@@ -1,416 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkNotNull;
-import static com.google.common.base.Preconditions.checkState;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.transforms.DoFn.DelegatingAggregator;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo;
-import com.google.cloud.dataflow.sdk.util.WindowingInternals;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-
-import java.io.Serializable;
-import java.lang.annotation.Documented;
-import java.lang.annotation.ElementType;
-import java.lang.annotation.Retention;
-import java.lang.annotation.RetentionPolicy;
-import java.lang.annotation.Target;
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * The argument to {@link ParDo} providing the code to use to process
- * elements of the input
- * {@link com.google.cloud.dataflow.sdk.values.PCollection}.
- *
- * <p>See {@link ParDo} for more explanation, examples of use, and
- * discussion of constraints on {@code DoFnWithContext}s, including their
- * serializability, lack of access to global shared mutable state,
- * requirements for failure tolerance, and benefits of optimization.
- *
- * <p>{@code DoFnWithContext}s can be tested in a particular
- * {@code Pipeline} by running that {@code Pipeline} on sample input
- * and then checking its output. Unit testing of a {@code DoFnWithContext},
- * separately from any {@code ParDo} transform or {@code Pipeline},
- * can be done via the {@link DoFnTester} harness.
- *
- * <p>Implementations must define a method annotated with {@link ProcessElement}
- * that satisfies the requirements described there. See the {@link ProcessElement}
- * for details.
- *
- * <p>This functionality is experimental and likely to change.
- *
- * <p>Example usage:
- *
- * <pre> {@code
- * PCollection<String> lines = ... ;
- * PCollection<String> words =
- * lines.apply(ParDo.of(new DoFnWithContext<String, String>() {
- * @ProcessElement
- * public void processElement(ProcessContext c, BoundedWindow window) {
- *
- * }}));
- * } </pre>
- *
- * @param <InputT> the type of the (main) input elements
- * @param <OutputT> the type of the (main) output elements
- */
-@Experimental
-public abstract class DoFnWithContext<InputT, OutputT> implements Serializable {
-
- /** Information accessible to all methods in this {@code DoFnWithContext}. */
- public abstract class Context {
-
- /**
- * Returns the {@code PipelineOptions} specified with the
- * {@link com.google.cloud.dataflow.sdk.runners.PipelineRunner}
- * invoking this {@code DoFnWithContext}. The {@code PipelineOptions} will
- * be the default ones when running via {@link DoFnTester}.
- */
- public abstract PipelineOptions getPipelineOptions();
-
- /**
- * Adds the given element to the main output {@code PCollection}.
- *
- * <p>Once passed to {@code output} the element should not be modified in
- * any way.
- *
- * <p>If invoked from {@link ProcessElement}, the output
- * element will have the same timestamp and be in the same windows
- * as the input element passed to the method annotated with
- * {@code @ProcessElement}.
- *
- * <p>If invoked from {@link StartBundle} or {@link FinishBundle},
- * this will attempt to use the
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * of the input {@code PCollection} to determine what windows the element
- * should be in, throwing an exception if the {@code WindowFn} attempts
- * to access any information about the input element. The output element
- * will have a timestamp of negative infinity.
- */
- public abstract void output(OutputT output);
-
- /**
- * Adds the given element to the main output {@code PCollection},
- * with the given timestamp.
- *
- * <p>Once passed to {@code outputWithTimestamp} the element should not be
- * modified in any way.
- *
- * <p>If invoked from {@link ProcessElement}, the timestamp
- * must not be older than the input element's timestamp minus
- * {@link DoFnWithContext#getAllowedTimestampSkew}. The output element will
- * be in the same windows as the input element.
- *
- * <p>If invoked from {@link StartBundle} or {@link FinishBundle},
- * this will attempt to use the
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * of the input {@code PCollection} to determine what windows the element
- * should be in, throwing an exception if the {@code WindowFn} attempts
- * to access any information about the input element except for the
- * timestamp.
- */
- public abstract void outputWithTimestamp(OutputT output, Instant timestamp);
-
- /**
- * Adds the given element to the side output {@code PCollection} with the
- * given tag.
- *
- * <p>Once passed to {@code sideOutput} the element should not be modified
- * in any way.
- *
- * <p>The caller of {@code ParDo} uses {@link ParDo#withOutputTags} to
- * specify the tags of side outputs that it consumes. Non-consumed side
- * outputs, e.g., outputs for monitoring purposes only, don't necessarily
- * need to be specified.
- *
- * <p>If invoked from {@link ProcessElement}, the output element will have
- * the same timestamp and be in the same windows as the input element
- * passed to the method annotated with {@code @ProcessElement}.
- *
- * <p>If invoked from {@link StartBundle} or {@link FinishBundle},
- * this will attempt to use the
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * of the input {@code PCollection} to determine what windows the element
- * should be in, throwing an exception if the {@code WindowFn} attempts
- * to access any information about the input element. The output element
- * will have a timestamp of negative infinity.
- *
- * @see ParDo#withOutputTags
- */
- public abstract <T> void sideOutput(TupleTag<T> tag, T output);
-
- /**
- * Adds the given element to the specified side output {@code PCollection},
- * with the given timestamp.
- *
- * <p>Once passed to {@code sideOutputWithTimestamp} the element should not be
- * modified in any way.
- *
- * <p>If invoked from {@link ProcessElement}, the timestamp
- * must not be older than the input element's timestamp minus
- * {@link DoFnWithContext#getAllowedTimestampSkew}. The output element will
- * be in the same windows as the input element.
- *
- * <p>If invoked from {@link StartBundle} or {@link FinishBundle},
- * this will attempt to use the
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * of the input {@code PCollection} to determine what windows the element
- * should be in, throwing an exception if the {@code WindowFn} attempts
- * to access any information about the input element except for the
- * timestamp.
- *
- * @see ParDo#withOutputTags
- */
- public abstract <T> void sideOutputWithTimestamp(
- TupleTag<T> tag, T output, Instant timestamp);
- }
-
- /**
- * Information accessible when running the method annotated with
- * {@link ProcessElement}, in addition to everything available through
- * {@link Context}.
- */
- public abstract class ProcessContext extends Context {
-
- /**
- * Returns the input element to be processed.
- *
- * <p>The element will not be changed -- it is safe to cache, etc.
- * without copying.
- */
- public abstract InputT element();
-
-
- /**
- * Returns the value of the side input.
- *
- * @throws IllegalArgumentException if this is not a side input
- * @see ParDo#withSideInputs
- */
- public abstract <T> T sideInput(PCollectionView<T> view);
-
- /**
- * Returns the timestamp of the input element.
- *
- * <p>See {@link com.google.cloud.dataflow.sdk.transforms.windowing.Window}
- * for more information.
- */
- public abstract Instant timestamp();
-
- /**
- * Returns information about the pane within this window into which the
- * input element has been assigned.
- *
- * <p>Generally all data is in a single, uninteresting pane unless custom
- * triggering and/or late data has been explicitly requested.
- * See {@link com.google.cloud.dataflow.sdk.transforms.windowing.Window}
- * for more information.
- */
- public abstract PaneInfo pane();
- }
-
- /**
- * Returns the allowed timestamp skew duration, which is the maximum
- * duration that timestamps can be shifted backward in
- * {@link DoFnWithContext.Context#outputWithTimestamp}.
- *
- * <p>The default value is {@code Duration.ZERO}, in which case
- * timestamps can only be shifted forward, into the future. For infinite
- * skew, return {@code Duration.millis(Long.MAX_VALUE)}.
- */
- public Duration getAllowedTimestampSkew() {
- return Duration.ZERO;
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- Map<String, DelegatingAggregator<?, ?>> aggregators = new HashMap<>();
-
- /**
- * Protects aggregators from being created after initialization.
- */
- private boolean aggregatorsAreFinal;
-
- /**
- * Returns a {@link TypeDescriptor} capturing what is known statically
- * about the input type of this {@code DoFnWithContext} instance's most-derived
- * class.
- *
- * <p>The descriptor is created from {@code getClass()}, i.e. the runtime class
- * of this instance.
- *
- * <p>See {@link #getOutputTypeDescriptor} for more discussion.
- */
- protected TypeDescriptor<InputT> getInputTypeDescriptor() {
- return new TypeDescriptor<InputT>(getClass()) {};
- }
-
- /**
- * Returns a {@link TypeDescriptor} capturing what is known statically
- * about the output type of this {@code DoFnWithContext} instance's
- * most-derived class.
- *
- * <p>In the normal case of a concrete {@code DoFnWithContext} subclass with
- * no generic type parameters of its own (including anonymous inner
- * classes), this will be a complete non-generic type, which is good
- * for choosing a default output {@code Coder<OutputT>} for the output
- * {@code PCollection<OutputT>}.
- */
- protected TypeDescriptor<OutputT> getOutputTypeDescriptor() {
- return new TypeDescriptor<OutputT>(getClass()) {};
- }
-
- /**
- * Interface for runner implementors to provide implementations of extra context information.
- *
- * <p>The methods on this interface are called by {@link DoFnReflector} before invoking an
- * annotated {@link StartBundle}, {@link ProcessElement} or {@link FinishBundle} method that
- * has indicated it needs the given extra context.
- *
- * <p>In the case of {@link ProcessElement} it is called once per invocation of
- * {@link ProcessElement}.
- *
- * <p>The type parameters are the ones used for the {@link WindowingInternals} returned by
- * {@link #windowingInternals()}.
- */
- public interface ExtraContextFactory<InputT, OutputT> {
- /**
- * Construct the {@link BoundedWindow} to use within a {@link DoFnWithContext} that
- * needs it. This is called if the {@link ProcessElement} method has a parameter of type
- * {@link BoundedWindow}.
- *
- * @return {@link BoundedWindow} of the element currently being processed.
- */
- BoundedWindow window();
-
- /**
- * Construct the {@link WindowingInternals} to use within a {@link DoFnWithContext} that
- * needs it. This is called if the {@link ProcessElement} method has a parameter of type
- * {@link WindowingInternals}.
- */
- WindowingInternals<InputT, OutputT> windowingInternals();
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * Annotation for the method to use to prepare an instance for processing a batch of elements.
- * The method annotated with this must satisfy the following constraints:
- * <ul>
- * <li>It must have exactly one argument.
- * <li>That argument must be a {@link DoFnWithContext.Context}.
- * </ul>
- */
- @Documented
- @Retention(RetentionPolicy.RUNTIME)
- @Target(ElementType.METHOD)
- public @interface StartBundle {}
-
- /**
- * Annotation for the method to use for processing elements. A subclass of
- * {@link DoFnWithContext} must have a method with this annotation satisfying
- * the following constraints in order for it to be executable:
- * <ul>
- * <li>It must have at least one argument.
- * <li>Its first argument must be a {@link DoFnWithContext.ProcessContext}.
- * <li>Its remaining arguments must be {@link BoundedWindow}, or
- * {@link WindowingInternals WindowingInternals&lt;InputT, OutputT&gt;}.
- * </ul>
- */
- @Documented
- @Retention(RetentionPolicy.RUNTIME)
- @Target(ElementType.METHOD)
- public @interface ProcessElement {}
-
- /**
- * Annotation for the method to use to finish processing a batch of elements.
- * The method annotated with this must satisfy the following constraints:
- * <ul>
- * <li>It must have exactly one argument.
- * <li>That argument must be a {@link DoFnWithContext.Context}.
- * </ul>
- */
- @Documented
- @Retention(RetentionPolicy.RUNTIME)
- @Target(ElementType.METHOD)
- public @interface FinishBundle {}
-
- /**
- * Returns an {@link Aggregator} with aggregation logic specified by the
- * {@link CombineFn} argument. The name provided must be unique across
- * {@link Aggregator}s created within the DoFn. Aggregators can only be created
- * during pipeline construction.
- *
- * @param name the name of the aggregator
- * @param combiner the {@link CombineFn} to use in the aggregator
- * @return an aggregator for the provided name and combiner in the scope of
- * this DoFn
- * @throws NullPointerException if the name or combiner is null
- * @throws IllegalArgumentException if the given name collides with another
- * aggregator in this scope
- * @throws IllegalStateException if called during pipeline execution.
- */
-  public final <AggInputT, AggOutputT> Aggregator<AggInputT, AggOutputT>
-      createAggregator(String name, Combine.CombineFn<? super AggInputT, ?, AggOutputT> combiner) {
-    // Argument checks run first, then the name-collision check, then the lifecycle check.
-    checkNotNull(name, "name cannot be null");
-    checkNotNull(combiner, "combiner cannot be null");
-
-    boolean nameIsFree = !aggregators.containsKey(name);
-    checkArgument(nameIsFree,
-        "Cannot create aggregator with name %s."
-        + " An Aggregator with that name already exists within this scope.",
-        name);
-
-    boolean stillConstructing = !aggregatorsAreFinal;
-    checkState(stillConstructing,
-        "Cannot create an aggregator during pipeline execution."
-        + " Aggregators should be registered during pipeline construction.");
-
-    // Register the new aggregator under its name before handing it back.
-    DelegatingAggregator<AggInputT, AggOutputT> created =
-        new DelegatingAggregator<>(name, combiner);
-    aggregators.put(name, created);
-    return created;
-  }
-
- /**
- * Returns an {@link Aggregator} with the aggregation logic specified by the
- * {@link SerializableFunction} argument. The name provided must be unique
- * across {@link Aggregator}s created within the DoFn. Aggregators can only be
- * created during pipeline construction.
- *
- * @param name the name of the aggregator
- * @param combiner the {@link SerializableFunction} to use in the aggregator
- * @return an aggregator for the provided name and combiner in the scope of
- * this DoFn
- * @throws NullPointerException if the name or combiner is null
- * @throws IllegalArgumentException if the given name collides with another
- * aggregator in this scope
- * @throws IllegalStateException if called during pipeline execution.
- */
-  public final <AggInputT> Aggregator<AggInputT, AggInputT> createAggregator(
-      String name, SerializableFunction<Iterable<AggInputT>, AggInputT> combiner) {
-    // Reject a null function here, before it is wrapped in a CombineFn.
-    checkNotNull(combiner, "combiner cannot be null.");
-    Combine.CombineFn<AggInputT, ?, AggInputT> wrapped = Combine.IterableCombineFn.of(combiner);
-    return createAggregator(name, wrapped);
-  }
-
- /**
- * Finalize the {@link DoFnWithContext} construction to prepare for processing.
- * This method should be called by runners before any processing methods.
- *
- * <p>It sets the flag checked by {@code createAggregator}, so creating an aggregator
- * after this call fails with an {@link IllegalStateException}.
- */
- void prepareForProcessing() {
- aggregatorsAreFinal = true;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Filter.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Filter.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Filter.java
deleted file mode 100644
index 9e123a1..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Filter.java
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-/**
- * {@code PTransform}s for filtering from a {@code PCollection} the
- * elements satisfying a predicate, or satisfying an inequality with
- * a given value based on the elements' natural ordering.
- *
- * @param <T> the type of the values in the input {@code PCollection},
- * and the type of the elements in the output {@code PCollection}
- */
-public class Filter<T> extends PTransform<PCollection<T>, PCollection<T>> {
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<T>} and returns a {@code PCollection<T>} with
- * elements that satisfy the given predicate. The predicate must be
- * a {@code SerializableFunction<T, Boolean>}.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<String> wordList = ...;
- * PCollection<String> longWords =
- * wordList.apply(Filter.byPredicate(new MatchIfWordLengthGT(6)));
- * } </pre>
- *
- * <p>See also {@link #lessThan}, {@link #lessThanEq},
- * {@link #greaterThan}, {@link #greaterThanEq}, which return elements
- * satisfying various inequalities with the specified value based on
- * the elements' natural ordering.
- */
-  public static <T, PredicateT extends SerializableFunction<T, Boolean>> Filter<T>
-      byPredicate(PredicateT predicate) {
-    // "Filter" is the name given to the returned transform.
-    Filter<T> filter = new Filter<T>("Filter", predicate);
-    return filter;
-  }
-
- /**
- * @deprecated use {@link #byPredicate}, which returns a {@link Filter} transform instead of
- * a {@link ParDo.Bound}.
- */
- @Deprecated
- public static <T, PredicateT extends SerializableFunction<T, Boolean>> ParDo.Bound<T, T>
- by(final PredicateT filterPred) {
- return ParDo.named("Filter").of(new DoFn<T, T>() {
- @Override
- public void processElement(ProcessContext c) {
- if (filterPred.apply(c.element()) == true) {
- c.output(c.element());
- }
- }
- });
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@link PCollection} and returns a {@link PCollection} with
- * elements that are less than a given value, based on the
- * elements' natural ordering. Elements must be {@code Comparable}.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<Integer> listOfNumbers = ...;
- * PCollection<Integer> smallNumbers =
- * listOfNumbers.apply(Filter.lessThan(10));
- * } </pre>
- *
- * <p>See also {@link #lessThanEq}, {@link #greaterThanEq},
- * and {@link #greaterThan}, which return elements satisfying various
- * inequalities with the specified value based on the elements'
- * natural ordering.
- *
- * <p>See also {@link #byPredicate}, which returns elements
- * that satisfy the given predicate.
- */
- public static <T extends Comparable<T>> ParDo.Bound<T, T> lessThan(final T value) {
- return ParDo.named("Filter.lessThan").of(new DoFn<T, T>() {
- @Override
- public void processElement(ProcessContext c) {
- if (c.element().compareTo(value) < 0) {
- c.output(c.element());
- }
- }
- });
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<T>} and returns a {@code PCollection<T>} with
- * elements that are greater than a given value, based on the
- * elements' natural ordering. Elements must be {@code Comparable}.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<Integer> listOfNumbers = ...;
- * PCollection<Integer> largeNumbers =
- * listOfNumbers.apply(Filter.greaterThan(1000));
- * } </pre>
- *
- * <p>See also {@link #greaterThanEq}, {@link #lessThan},
- * and {@link #lessThanEq}, which return elements satisfying various
- * inequalities with the specified value based on the elements'
- * natural ordering.
- *
- * <p>See also {@link #byPredicate}, which returns elements
- * that satisfy the given predicate.
- */
- public static <T extends Comparable<T>> ParDo.Bound<T, T> greaterThan(final T value) {
- return ParDo.named("Filter.greaterThan").of(new DoFn<T, T>() {
- @Override
- public void processElement(ProcessContext c) {
- if (c.element().compareTo(value) > 0) {
- c.output(c.element());
- }
- }
- });
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<T>} and returns a {@code PCollection<T>} with
- * elements that are less than or equal to a given value, based on the
- * elements' natural ordering. Elements must be {@code Comparable}.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<Integer> listOfNumbers = ...;
- * PCollection<Integer> smallOrEqualNumbers =
- * listOfNumbers.apply(Filter.lessThanEq(10));
- * } </pre>
- *
- * <p>See also {@link #lessThan}, {@link #greaterThanEq},
- * and {@link #greaterThan}, which return elements satisfying various
- * inequalities with the specified value based on the elements'
- * natural ordering.
- *
- * <p>See also {@link #byPredicate}, which returns elements
- * that satisfy the given predicate.
- */
- public static <T extends Comparable<T>> ParDo.Bound<T, T> lessThanEq(final T value) {
- return ParDo.named("Filter.lessThanEq").of(new DoFn<T, T>() {
- @Override
- public void processElement(ProcessContext c) {
- if (c.element().compareTo(value) <= 0) {
- c.output(c.element());
- }
- }
- });
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<T>} and returns a {@code PCollection<T>} with
- * elements that are greater than or equal to a given value, based on
- * the elements' natural ordering. Elements must be {@code Comparable}.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<Integer> listOfNumbers = ...;
- * PCollection<Integer> largeOrEqualNumbers =
- * listOfNumbers.apply(Filter.greaterThanEq(1000));
- * } </pre>
- *
- * <p>See also {@link #greaterThan}, {@link #lessThan},
- * and {@link #lessThanEq}, which return elements satisfying various
- * inequalities with the specified value based on the elements'
- * natural ordering.
- *
- * <p>See also {@link #byPredicate}, which returns elements
- * that satisfy the given predicate.
- */
- public static <T extends Comparable<T>> ParDo.Bound<T, T> greaterThanEq(final T value) {
- return ParDo.named("Filter.greaterThanEq").of(new DoFn<T, T>() {
- @Override
- public void processElement(ProcessContext c) {
- if (c.element().compareTo(value) >= 0) {
- c.output(c.element());
- }
- }
- });
- }
-
- ///////////////////////////////////////////////////////////////////////////////
-
- private SerializableFunction<T, Boolean> predicate; // test applied to every element in apply()
-
- private Filter(SerializableFunction<T, Boolean> predicate) { // NOTE(review): no caller visible in this class -- confirm before removing
- this.predicate = predicate;
- }
-
- private Filter(String name, SerializableFunction<T, Boolean> predicate) {
- super(name); // the name is handed to the PTransform superclass
- this.predicate = predicate;
- }
-
- public Filter<T> named(String name) {
- return new Filter<>(name, predicate); // returns a new instance with the same predicate; this one is left unchanged
- }
-
- @Override
- public PCollection<T> apply(PCollection<T> input) {
- PCollection<T> output = input.apply(ParDo.named("Filter").of(new DoFn<T, T>() {
- @Override
- public void processElement(ProcessContext c) {
- if (predicate.apply(c.element()) == true) {
- c.output(c.element());
- }
- }
- }));
- return output;
- }
-
- @Override
- protected Coder<T> getDefaultOutputCoder(PCollection<T> input) {
- return input.getCoder(); // output elements have the input's type T, so the input's coder is reused
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/FlatMapElements.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/FlatMapElements.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/FlatMapElements.java
deleted file mode 100644
index fbaad5b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/FlatMapElements.java
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-import java.lang.reflect.ParameterizedType;
-
-/**
- * {@code PTransform}s for mapping a simple function that returns iterables over the elements of a
- * {@link PCollection} and merging the results.
- */
-public class FlatMapElements<InputT, OutputT>
-extends PTransform<PCollection<InputT>, PCollection<OutputT>> {
- /**
- * For a {@code SerializableFunction<InputT, ? extends Iterable<OutputT>>} {@code fn},
- * returns a {@link PTransform} that applies {@code fn} to every element of the input
- * {@code PCollection<InputT>} and outputs all of the elements to the output
- * {@code PCollection<OutputT>}.
- *
- * <p>Example of use in Java 8:
- * <pre>{@code
- * PCollection<String> words = lines.apply(
- * FlatMapElements.via((String line) -> Arrays.asList(line.split(" ")))
- * .withOutputType(new TypeDescriptor<String>(){});
- * }</pre>
- *
- * <p>In Java 7, the overload {@link #via(SimpleFunction)} is more concise as the output type
- * descriptor need not be provided.
- */
- public static <InputT, OutputT> MissingOutputTypeDescriptor<InputT, OutputT>
- via(SerializableFunction<InputT, ? extends Iterable<OutputT>> fn) {
- return new MissingOutputTypeDescriptor<>(fn);
- }
-
- /**
- * For a {@code SimpleFunction<InputT, ? extends Iterable<OutputT>>} {@code fn},
- * return a {@link PTransform} that applies {@code fn} to every element of the input
- * {@code PCollection<InputT>} and outputs all of the elements to the output
- * {@code PCollection<OutputT>}.
- *
- * <p>This overload is intended primarily for use in Java 7. In Java 8, the overload
- * {@link #via(SerializableFunction)} supports use of lambda for greater concision.
- *
- * <p>Example of use in Java 7:
- * <pre>{@code
- * PCollection<String> lines = ...;
- * PCollection<String> words = lines.apply(FlatMapElements.via(
- * new SimpleFunction<String, List<String>>() {
- * public Integer apply(String line) {
- * return Arrays.asList(line.split(" "));
- * }
- * });
- * }</pre>
- *
- * <p>To use a Java 8 lambda, see {@link #via(SerializableFunction)}.
- */
- public static <InputT, OutputT> FlatMapElements<InputT, OutputT>
- via(SimpleFunction<InputT, ? extends Iterable<OutputT>> fn) {
-
- @SuppressWarnings({"rawtypes", "unchecked"}) // safe by static typing
- TypeDescriptor<Iterable<?>> iterableType = (TypeDescriptor) fn.getOutputTypeDescriptor();
-
- @SuppressWarnings("unchecked") // safe by correctness of getIterableElementType
- TypeDescriptor<OutputT> outputType =
- (TypeDescriptor<OutputT>) getIterableElementType(iterableType);
-
- return new FlatMapElements<>(fn, outputType);
- }
-
- /**
- * An intermediate builder for a {@link FlatMapElements} transform. To complete the transform,
- * provide an output type descriptor to {@link MissingOutputTypeDescriptor#withOutputType}. See
- * {@link #via(SerializableFunction)} for a full example of use.
- */
- public static final class MissingOutputTypeDescriptor<InputT, OutputT> {
-
- private final SerializableFunction<InputT, ? extends Iterable<OutputT>> fn;
-
- private MissingOutputTypeDescriptor(
- SerializableFunction<InputT, ? extends Iterable<OutputT>> fn) {
- this.fn = fn;
- }
-
- public FlatMapElements<InputT, OutputT> withOutputType(TypeDescriptor<OutputT> outputType) {
- return new FlatMapElements<>(fn, outputType);
- }
- }
-
- private static TypeDescriptor<?> getIterableElementType(
- TypeDescriptor<Iterable<?>> iterableTypeDescriptor) {
-
- // If a rawtype was used, the type token may be for Object, not a subtype of Iterable.
- // In this case, we rely on static typing of the function elsewhere to ensure it is
- // at least some kind of iterable, and grossly overapproximate the element type to be Object.
- if (!iterableTypeDescriptor.isSubtypeOf(new TypeDescriptor<Iterable<?>>() {})) {
- return new TypeDescriptor<Object>() {};
- }
-
- // Otherwise we can do the proper thing and get the actual type parameter.
- ParameterizedType iterableType =
- (ParameterizedType) iterableTypeDescriptor.getSupertype(Iterable.class).getType();
- return TypeDescriptor.of(iterableType.getActualTypeArguments()[0]);
- }
-
- //////////////////////////////////////////////////////////////////////////////////////////////////
-
- private final SerializableFunction<InputT, ? extends Iterable<OutputT>> fn;
- private final transient TypeDescriptor<OutputT> outputType;
-
- private FlatMapElements(
- SerializableFunction<InputT, ? extends Iterable<OutputT>> fn,
- TypeDescriptor<OutputT> outputType) {
- this.fn = fn;
- this.outputType = outputType;
- }
-
- @Override
- public PCollection<OutputT> apply(PCollection<InputT> input) {
- return input.apply(ParDo.named("Map").of(new DoFn<InputT, OutputT>() {
- private static final long serialVersionUID = 0L;
- @Override
- public void processElement(ProcessContext c) {
- for (OutputT element : fn.apply(c.element())) {
- c.output(element);
- }
- }
- })).setTypeDescriptorInternal(outputType);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Flatten.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Flatten.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Flatten.java
deleted file mode 100644
index de6add0..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Flatten.java
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.IterableLikeCoder;
-import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded;
-import com.google.cloud.dataflow.sdk.values.PCollectionList;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * {@code Flatten<T>} takes multiple {@code PCollection<T>}s bundled
- * into a {@code PCollectionList<T>} and returns a single
- * {@code PCollection<T>} containing all the elements in all the input
- * {@code PCollection}s. The name "Flatten" suggests taking a list of
- * lists and flattening them into a single list.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<String> pc1 = ...;
- * PCollection<String> pc2 = ...;
- * PCollection<String> pc3 = ...;
- * PCollectionList<String> pcs = PCollectionList.of(pc1).and(pc2).and(pc3);
- * PCollection<String> merged = pcs.apply(Flatten.<String>pCollections());
- * } </pre>
- *
- * <p>By default, the {@code Coder} of the output {@code PCollection}
- * is the same as the {@code Coder} of the first {@code PCollection}
- * in the input {@code PCollectionList} (if the
- * {@code PCollectionList} is non-empty).
- *
- */
-public class Flatten {
-
- /**
- * Returns a {@link PTransform} that flattens a {@link PCollectionList}
- * into a {@link PCollection} containing all the elements of all
- * the {@link PCollection}s in its input.
- *
- * <p>All inputs must have equal {@link WindowFn}s.
- * The output elements of {@code Flatten<T>} are in the same windows and
- * have the same timestamps as their corresponding input elements. The output
- * {@code PCollection} will have the same
- * {@link WindowFn} as all of the inputs.
- *
- * @param <T> the type of the elements in the input and output
- * {@code PCollection}s.
- */
- public static <T> FlattenPCollectionList<T> pCollections() {
- return new FlattenPCollectionList<>();
- }
-
- /**
- * Returns a {@code PTransform} that takes a {@code PCollection<Iterable<T>>}
- * and returns a {@code PCollection<T>} containing all the elements from
- * all the {@code Iterable}s.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<Iterable<Integer>> pcOfIterables = ...;
- * PCollection<Integer> pc = pcOfIterables.apply(Flatten.<Integer>iterables());
- * } </pre>
- *
- * <p>By default, the output {@code PCollection} encodes its elements
- * using the same {@code Coder} that the input uses for
- * the elements in its {@code Iterable}.
- *
- * @param <T> the type of the elements of the input {@code Iterable} and
- * the output {@code PCollection}
- */
- public static <T> FlattenIterables<T> iterables() {
- return new FlattenIterables<>();
- }
-
- /**
- * A {@link PTransform} that flattens a {@link PCollectionList}
- * into a {@link PCollection} containing all the elements of all
- * the {@link PCollection}s in its input.
- * Implements {@link #pCollections}.
- *
- * @param <T> the type of the elements in the input and output
- * {@code PCollection}s.
- */
- public static class FlattenPCollectionList<T>
- extends PTransform<PCollectionList<T>, PCollection<T>> {
-
- private FlattenPCollectionList() { }
-
- @Override
- public PCollection<T> apply(PCollectionList<T> inputs) {
- WindowingStrategy<?, ?> windowingStrategy;
- IsBounded isBounded = IsBounded.BOUNDED;
- if (!inputs.getAll().isEmpty()) {
- windowingStrategy = inputs.get(0).getWindowingStrategy();
- for (PCollection<?> input : inputs.getAll()) {
- WindowingStrategy<?, ?> other = input.getWindowingStrategy();
- if (!windowingStrategy.getWindowFn().isCompatible(other.getWindowFn())) {
- throw new IllegalStateException(
- "Inputs to Flatten had incompatible window windowFns: "
- + windowingStrategy.getWindowFn() + ", " + other.getWindowFn());
- }
-
- if (!windowingStrategy.getTrigger().getSpec()
- .isCompatible(other.getTrigger().getSpec())) {
- throw new IllegalStateException(
- "Inputs to Flatten had incompatible triggers: "
- + windowingStrategy.getTrigger() + ", " + other.getTrigger());
- }
- isBounded = isBounded.and(input.isBounded());
- }
- } else {
- windowingStrategy = WindowingStrategy.globalDefault();
- }
-
- return PCollection.<T>createPrimitiveOutputInternal(
- inputs.getPipeline(),
- windowingStrategy,
- isBounded);
- }
-
- @Override
- protected Coder<?> getDefaultOutputCoder(PCollectionList<T> input)
- throws CannotProvideCoderException {
-
- // Take coder from first collection
- for (PCollection<T> pCollection : input.getAll()) {
- return pCollection.getCoder();
- }
-
- // No inputs
- throw new CannotProvideCoderException(
- this.getClass().getSimpleName() + " cannot provide a Coder for"
- + " empty " + PCollectionList.class.getSimpleName());
- }
- }
-
- /**
- * {@code FlattenIterables<T>} takes a {@code PCollection<Iterable<T>>} and returns a
- * {@code PCollection<T>} that contains all the elements from each iterable.
- * Implements {@link #iterables}.
- *
- * @param <T> the type of the elements of the input {@code Iterable}s and
- * the output {@code PCollection}
- */
- public static class FlattenIterables<T>
- extends PTransform<PCollection<? extends Iterable<T>>, PCollection<T>> {
-
- @Override
- public PCollection<T> apply(PCollection<? extends Iterable<T>> in) {
- Coder<? extends Iterable<T>> inCoder = in.getCoder();
- if (!(inCoder instanceof IterableLikeCoder)) {
- throw new IllegalArgumentException(
- "expecting the input Coder<Iterable> to be an IterableLikeCoder");
- }
- @SuppressWarnings("unchecked")
- Coder<T> elemCoder = ((IterableLikeCoder<T, ?>) inCoder).getElemCoder();
-
- return in.apply(ParDo.named("FlattenIterables").of(
- new DoFn<Iterable<T>, T>() {
- @Override
- public void processElement(ProcessContext c) {
- for (T i : c.element()) {
- c.output(i);
- }
- }
- }))
- .setCoder(elemCoder);
- }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- static {
- DirectPipelineRunner.registerDefaultTransformEvaluator(
- FlattenPCollectionList.class,
- new DirectPipelineRunner.TransformEvaluator<FlattenPCollectionList>() {
- @Override
- public void evaluate(
- FlattenPCollectionList transform,
- DirectPipelineRunner.EvaluationContext context) {
- evaluateHelper(transform, context);
- }
- });
- }
-
- private static <T> void evaluateHelper(
- FlattenPCollectionList<T> transform,
- DirectPipelineRunner.EvaluationContext context) {
- List<DirectPipelineRunner.ValueWithMetadata<T>> outputElems = new ArrayList<>();
- PCollectionList<T> inputs = context.getInput(transform);
-
- for (PCollection<T> input : inputs.getAll()) {
- outputElems.addAll(context.getPCollectionValuesWithMetadata(input));
- }
-
- context.setPCollectionValuesWithMetadata(context.getOutput(transform), outputElems);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/GroupByKey.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/GroupByKey.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/GroupByKey.java
deleted file mode 100644
index 8fde3e0..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/GroupByKey.java
+++ /dev/null
@@ -1,575 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import static com.google.cloud.dataflow.sdk.util.CoderUtils.encodeToByteArray;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.Coder.NonDeterministicException;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.IterableCoder;
-import com.google.cloud.dataflow.sdk.coders.KvCoder;
-import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
-import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner.ValueWithMetadata;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.DefaultTrigger;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.InvalidWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
-import com.google.cloud.dataflow.sdk.util.GroupAlsoByWindowsViaOutputBufferDoFn;
-import com.google.cloud.dataflow.sdk.util.ReifyTimestampAndWindowsDoFn;
-import com.google.cloud.dataflow.sdk.util.SystemReduceFn;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.util.WindowedValue.FullWindowedValueCoder;
-import com.google.cloud.dataflow.sdk.util.WindowedValue.WindowedValueCoder;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-/**
- * {@code GroupByKey<K, V>} takes a {@code PCollection<KV<K, V>>},
- * groups the values by key and windows, and returns a
- * {@code PCollection<KV<K, Iterable<V>>>} representing a map from
- * each distinct key and window of the input {@code PCollection} to an
- * {@code Iterable} over all the values associated with that key in
- * the input per window. Absent repeatedly-firing
- * {@link Window#triggering triggering}, each key in the output
- * {@code PCollection} is unique within each window.
- *
- * <p>{@code GroupByKey} is analogous to converting a multi-map into
- * a uni-map, and related to {@code GROUP BY} in SQL. It corresponds
- * to the "shuffle" step between the Mapper and the Reducer in the
- * MapReduce framework.
- *
- * <p>Two keys of type {@code K} are compared for equality
- * <b>not</b> by regular Java {@link Object#equals}, but instead by
- * first encoding each of the keys using the {@code Coder} of the
- * keys of the input {@code PCollection}, and then comparing the
- * encoded bytes. This admits efficient parallel evaluation. Note that
- * this requires that the {@code Coder} of the keys be deterministic (see
- * {@link Coder#verifyDeterministic()}). If the key {@code Coder} is not
- * deterministic, an exception is thrown at pipeline construction time.
- *
- * <p>By default, the {@code Coder} of the keys of the output
- * {@code PCollection} is the same as that of the keys of the input,
- * and the {@code Coder} of the elements of the {@code Iterable}
- * values of the output {@code PCollection} is the same as the
- * {@code Coder} of the values of the input.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<KV<String, Doc>> urlDocPairs = ...;
- * PCollection<KV<String, Iterable<Doc>>> urlToDocs =
- * urlDocPairs.apply(GroupByKey.<String, Doc>create());
- * PCollection<R> results =
- * urlToDocs.apply(ParDo.of(new DoFn<KV<String, Iterable<Doc>>, R>() {
- * public void processElement(ProcessContext c) {
- * String url = c.element().getKey();
- * Iterable<Doc> docsWithThatUrl = c.element().getValue();
- * ... process all docs having that url ...
- * }}));
- * } </pre>
- *
- * <p>{@code GroupByKey} is a key primitive in data-parallel
- * processing, since it is the main way to efficiently bring
- * associated data together into one location. It is also a key
- * determiner of the performance of a data-parallel pipeline.
- *
- * <p>See {@link com.google.cloud.dataflow.sdk.transforms.join.CoGroupByKey}
- * for a way to group multiple input PCollections by a common key at once.
- *
- * <p>See {@link Combine.PerKey} for a common pattern of
- * {@code GroupByKey} followed by {@link Combine.GroupedValues}.
- *
- * <p>When grouping, windows that can be merged according to the {@link WindowFn}
- * of the input {@code PCollection} will be merged together, and a window pane
- * corresponding to the new, merged window will be created. The items in this pane
- * will be emitted when a trigger fires. By default this will be when the input
- * sources estimate there will be no more data for the window. See
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.AfterWatermark}
- * for details on the estimation.
- *
- * <p>The timestamp for each emitted pane is determined by the
- * {@link Window.Bound#withOutputTimeFn windowing operation}.
- * The output {@code PCollection} will have the same {@link WindowFn}
- * as the input.
- *
- * <p>If the input {@code PCollection} contains late data (see
- * {@link com.google.cloud.dataflow.sdk.io.PubsubIO.Read.Bound#timestampLabel}
- * for an example of how this can occur) or the
- * {@link Window#triggering requested TriggerFn} can fire before
- * the watermark, then there may be multiple elements
- * output by a {@code GroupByKey} that correspond to the same key and window.
- *
- * <p>If the {@link WindowFn} of the input requires merging, it is not
- * valid to apply another {@code GroupByKey} without first applying a new
- * {@link WindowFn} or applying {@link Window#remerge()}.
- *
- * @param <K> the type of the keys of the input and output
- * {@code PCollection}s
- * @param <V> the type of the values of the input {@code PCollection}
- * and the elements of the {@code Iterable}s in the output
- * {@code PCollection}
- */
-public class GroupByKey<K, V>
- extends PTransform<PCollection<KV<K, V>>,
- PCollection<KV<K, Iterable<V>>>> {
-
- private final boolean fewKeys;
-
- private GroupByKey(boolean fewKeys) {
- this.fewKeys = fewKeys;
- }
-
- /**
- * Returns a {@code GroupByKey<K, V>} {@code PTransform}.
- *
- * @param <K> the type of the keys of the input and output
- * {@code PCollection}s
- * @param <V> the type of the values of the input {@code PCollection}
- * and the elements of the {@code Iterable}s in the output
- * {@code PCollection}
- */
- public static <K, V> GroupByKey<K, V> create() {
- return new GroupByKey<>(false);
- }
-
- /**
- * Returns a {@code GroupByKey<K, V>} {@code PTransform}.
- *
- * @param <K> the type of the keys of the input and output
- * {@code PCollection}s
- * @param <V> the type of the values of the input {@code PCollection}
- * and the elements of the {@code Iterable}s in the output
- * {@code PCollection}
- * @param fewKeys whether it groups just few keys.
- */
- static <K, V> GroupByKey<K, V> create(boolean fewKeys) {
- return new GroupByKey<>(fewKeys);
- }
-
- /**
- * Returns whether it groups just few keys.
- */
- public boolean fewKeys() {
- return fewKeys;
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- public static void applicableTo(PCollection<?> input) {
- WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
- // Verify that the input PCollection is bounded, or that there is windowing/triggering being
- // used. Without this, the watermark (at end of global window) will never be reached.
- if (windowingStrategy.getWindowFn() instanceof GlobalWindows
- && windowingStrategy.getTrigger().getSpec() instanceof DefaultTrigger
- && input.isBounded() != IsBounded.BOUNDED) {
- throw new IllegalStateException("GroupByKey cannot be applied to non-bounded PCollection in "
- + "the GlobalWindow without a trigger. Use a Window.into or Window.triggering transform "
- + "prior to GroupByKey.");
- }
-
- // Validate the window merge function.
- if (windowingStrategy.getWindowFn() instanceof InvalidWindows) {
- String cause = ((InvalidWindows<?>) windowingStrategy.getWindowFn()).getCause();
- throw new IllegalStateException(
- "GroupByKey must have a valid Window merge function. "
- + "Invalid because: " + cause);
- }
- }
-
- @Override
- public void validate(PCollection<KV<K, V>> input) {
- applicableTo(input);
-
- // Verify that the input Coder<KV<K, V>> is a KvCoder<K, V>, and that
- // the key coder is deterministic.
- Coder<K> keyCoder = getKeyCoder(input.getCoder());
- try {
- keyCoder.verifyDeterministic();
- } catch (NonDeterministicException e) {
- throw new IllegalStateException(
- "the keyCoder of a GroupByKey must be deterministic", e);
- }
- }
-
- public WindowingStrategy<?, ?> updateWindowingStrategy(WindowingStrategy<?, ?> inputStrategy) {
- WindowFn<?, ?> inputWindowFn = inputStrategy.getWindowFn();
- if (!inputWindowFn.isNonMerging()) {
- // Prevent merging windows again, without explicit user
- // involvement, e.g., by Window.into() or Window.remerge().
- inputWindowFn = new InvalidWindows<>(
- "WindowFn has already been consumed by previous GroupByKey", inputWindowFn);
- }
-
- // We also switch to the continuation trigger associated with the current trigger.
- return inputStrategy
- .withWindowFn(inputWindowFn)
- .withTrigger(inputStrategy.getTrigger().getSpec().getContinuationTrigger());
- }
-
- @Override
- public PCollection<KV<K, Iterable<V>>> apply(PCollection<KV<K, V>> input) {
- // This operation groups by the combination of key and window,
- // merging windows as needed, using the windows assigned to the
- // key/value input elements and the window merge operation of the
- // window function associated with the input PCollection.
- WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
-
- // By default, implement GroupByKey[AndWindow] via a series of lower-level
- // operations.
- return input
- // Make each input element's timestamp and assigned windows
- // explicit, in the value part.
- .apply(new ReifyTimestampsAndWindows<K, V>())
-
- // Group by just the key.
- // Combiner lifting will not happen regardless of the disallowCombinerLifting value.
- // There will be no combiners right after the GroupByKeyOnly because of the two ParDos
- // introduced in here.
- .apply(new GroupByKeyOnly<K, WindowedValue<V>>())
-
- // Sort each key's values by timestamp. GroupAlsoByWindow requires
- // its input to be sorted by timestamp.
- .apply(new SortValuesByTimestamp<K, V>())
-
- // Group each key's values by window, merging windows as needed.
- .apply(new GroupAlsoByWindow<K, V>(windowingStrategy))
-
- // And update the windowing strategy as appropriate.
- .setWindowingStrategyInternal(updateWindowingStrategy(windowingStrategy));
- }
-
- @Override
- protected Coder<KV<K, Iterable<V>>> getDefaultOutputCoder(PCollection<KV<K, V>> input) {
- return getOutputKvCoder(input.getCoder());
- }
-
- /**
- * Returns the {@code Coder} of the input to this transform, which
- * should be a {@code KvCoder}.
- */
- @SuppressWarnings("unchecked")
- static <K, V> KvCoder<K, V> getInputKvCoder(Coder<KV<K, V>> inputCoder) {
- if (!(inputCoder instanceof KvCoder)) {
- throw new IllegalStateException(
- "GroupByKey requires its input to use KvCoder");
- }
- return (KvCoder<K, V>) inputCoder;
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * Returns the {@code Coder} of the keys of the input to this
- * transform, which is also used as the {@code Coder} of the keys of
- * the output of this transform.
- */
- static <K, V> Coder<K> getKeyCoder(Coder<KV<K, V>> inputCoder) {
- return getInputKvCoder(inputCoder).getKeyCoder();
- }
-
- /**
- * Returns the {@code Coder} of the values of the input to this transform.
- */
- public static <K, V> Coder<V> getInputValueCoder(Coder<KV<K, V>> inputCoder) {
- return getInputKvCoder(inputCoder).getValueCoder();
- }
-
- /**
- * Returns the {@code Coder} of the {@code Iterable} values of the
- * output of this transform.
- */
- static <K, V> Coder<Iterable<V>> getOutputValueCoder(Coder<KV<K, V>> inputCoder) {
- return IterableCoder.of(getInputValueCoder(inputCoder));
- }
-
- /**
- * Returns the {@code Coder} of the output of this transform.
- */
- static <K, V> KvCoder<K, Iterable<V>> getOutputKvCoder(Coder<KV<K, V>> inputCoder) {
- return KvCoder.of(getKeyCoder(inputCoder), getOutputValueCoder(inputCoder));
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * Helper transform that makes timestamps and window assignments
- * explicit in the value part of each key/value pair.
- */
- public static class ReifyTimestampsAndWindows<K, V>
- extends PTransform<PCollection<KV<K, V>>,
- PCollection<KV<K, WindowedValue<V>>>> {
- @Override
- public PCollection<KV<K, WindowedValue<V>>> apply(
- PCollection<KV<K, V>> input) {
- @SuppressWarnings("unchecked")
- KvCoder<K, V> inputKvCoder = (KvCoder<K, V>) input.getCoder();
- Coder<K> keyCoder = inputKvCoder.getKeyCoder();
- Coder<V> inputValueCoder = inputKvCoder.getValueCoder();
- Coder<WindowedValue<V>> outputValueCoder = FullWindowedValueCoder.of(
- inputValueCoder, input.getWindowingStrategy().getWindowFn().windowCoder());
- Coder<KV<K, WindowedValue<V>>> outputKvCoder =
- KvCoder.of(keyCoder, outputValueCoder);
- return input.apply(ParDo.of(new ReifyTimestampAndWindowsDoFn<K, V>()))
- .setCoder(outputKvCoder);
- }
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * Helper transform that sorts the values associated with each key
- * by timestamp.
- */
- public static class SortValuesByTimestamp<K, V>
- extends PTransform<PCollection<KV<K, Iterable<WindowedValue<V>>>>,
- PCollection<KV<K, Iterable<WindowedValue<V>>>>> {
- @Override
- public PCollection<KV<K, Iterable<WindowedValue<V>>>> apply(
- PCollection<KV<K, Iterable<WindowedValue<V>>>> input) {
- return input.apply(ParDo.of(
- new DoFn<KV<K, Iterable<WindowedValue<V>>>,
- KV<K, Iterable<WindowedValue<V>>>>() {
- @Override
- public void processElement(ProcessContext c) {
- KV<K, Iterable<WindowedValue<V>>> kvs = c.element();
- K key = kvs.getKey();
- Iterable<WindowedValue<V>> unsortedValues = kvs.getValue();
- List<WindowedValue<V>> sortedValues = new ArrayList<>();
- for (WindowedValue<V> value : unsortedValues) {
- sortedValues.add(value);
- }
- Collections.sort(sortedValues,
- new Comparator<WindowedValue<V>>() {
- @Override
- public int compare(WindowedValue<V> e1, WindowedValue<V> e2) {
- return e1.getTimestamp().compareTo(e2.getTimestamp());
- }
- });
- c.output(KV.<K, Iterable<WindowedValue<V>>>of(key, sortedValues));
- }}))
- .setCoder(input.getCoder());
- }
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * Helper transform that takes a collection of timestamp-ordered
- * values associated with each key, groups the values by window,
- * combines windows as needed, and for each window in each key,
- * outputs a collection of key/value-list pairs implicitly assigned
- * to the window and with the timestamp derived from that window.
- */
- public static class GroupAlsoByWindow<K, V>
- extends PTransform<PCollection<KV<K, Iterable<WindowedValue<V>>>>,
- PCollection<KV<K, Iterable<V>>>> {
- private final WindowingStrategy<?, ?> windowingStrategy;
-
- public GroupAlsoByWindow(WindowingStrategy<?, ?> windowingStrategy) {
- this.windowingStrategy = windowingStrategy;
- }
-
- @Override
- @SuppressWarnings("unchecked")
- public PCollection<KV<K, Iterable<V>>> apply(
- PCollection<KV<K, Iterable<WindowedValue<V>>>> input) {
- @SuppressWarnings("unchecked")
- KvCoder<K, Iterable<WindowedValue<V>>> inputKvCoder =
- (KvCoder<K, Iterable<WindowedValue<V>>>) input.getCoder();
-
- Coder<K> keyCoder = inputKvCoder.getKeyCoder();
- Coder<Iterable<WindowedValue<V>>> inputValueCoder =
- inputKvCoder.getValueCoder();
-
- IterableCoder<WindowedValue<V>> inputIterableValueCoder =
- (IterableCoder<WindowedValue<V>>) inputValueCoder;
- Coder<WindowedValue<V>> inputIterableElementCoder =
- inputIterableValueCoder.getElemCoder();
- WindowedValueCoder<V> inputIterableWindowedValueCoder =
- (WindowedValueCoder<V>) inputIterableElementCoder;
-
- Coder<V> inputIterableElementValueCoder =
- inputIterableWindowedValueCoder.getValueCoder();
- Coder<Iterable<V>> outputValueCoder =
- IterableCoder.of(inputIterableElementValueCoder);
- Coder<KV<K, Iterable<V>>> outputKvCoder = KvCoder.of(keyCoder, outputValueCoder);
-
- return input
- .apply(ParDo.of(groupAlsoByWindowsFn(windowingStrategy, inputIterableElementValueCoder)))
- .setCoder(outputKvCoder);
- }
-
- private <W extends BoundedWindow> GroupAlsoByWindowsViaOutputBufferDoFn<K, V, Iterable<V>, W>
- groupAlsoByWindowsFn(
- WindowingStrategy<?, W> strategy, Coder<V> inputIterableElementValueCoder) {
- return new GroupAlsoByWindowsViaOutputBufferDoFn<K, V, Iterable<V>, W>(
- strategy, SystemReduceFn.<K, V, W>buffering(inputIterableElementValueCoder));
- }
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * Primitive helper transform that groups by key only, ignoring any
- * window assignments.
- */
- public static class GroupByKeyOnly<K, V>
- extends PTransform<PCollection<KV<K, V>>,
- PCollection<KV<K, Iterable<V>>>> {
-
- @SuppressWarnings({"rawtypes", "unchecked"})
- @Override
- public PCollection<KV<K, Iterable<V>>> apply(PCollection<KV<K, V>> input) {
- return PCollection.<KV<K, Iterable<V>>>createPrimitiveOutputInternal(
- input.getPipeline(), input.getWindowingStrategy(), input.isBounded());
- }
-
- /**
- * Returns the {@code Coder} of the input to this transform, which
- * should be a {@code KvCoder}.
- */
- @SuppressWarnings("unchecked")
- KvCoder<K, V> getInputKvCoder(Coder<KV<K, V>> inputCoder) {
- if (!(inputCoder instanceof KvCoder)) {
- throw new IllegalStateException(
- "GroupByKey requires its input to use KvCoder");
- }
- return (KvCoder<K, V>) inputCoder;
- }
-
- @Override
- protected Coder<KV<K, Iterable<V>>> getDefaultOutputCoder(PCollection<KV<K, V>> input) {
- return GroupByKey.getOutputKvCoder(input.getCoder());
- }
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- static {
- registerWithDirectPipelineRunner();
- }
-
- @SuppressWarnings({"rawtypes", "unchecked"})
- private static <K, V> void registerWithDirectPipelineRunner() {
- DirectPipelineRunner.registerDefaultTransformEvaluator(
- GroupByKeyOnly.class,
- new DirectPipelineRunner.TransformEvaluator<GroupByKeyOnly>() {
- @Override
- public void evaluate(
- GroupByKeyOnly transform,
- DirectPipelineRunner.EvaluationContext context) {
- evaluateHelper(transform, context);
- }
- });
- }
-
- private static <K, V> void evaluateHelper(
- GroupByKeyOnly<K, V> transform,
- DirectPipelineRunner.EvaluationContext context) {
- PCollection<KV<K, V>> input = context.getInput(transform);
-
- List<ValueWithMetadata<KV<K, V>>> inputElems =
- context.getPCollectionValuesWithMetadata(input);
-
- Coder<K> keyCoder = GroupByKey.getKeyCoder(input.getCoder());
-
- Map<GroupingKey<K>, List<V>> groupingMap = new HashMap<>();
-
- for (ValueWithMetadata<KV<K, V>> elem : inputElems) {
- K key = elem.getValue().getKey();
- V value = elem.getValue().getValue();
- byte[] encodedKey;
- try {
- encodedKey = encodeToByteArray(keyCoder, key);
- } catch (CoderException exn) {
- // TODO: Put in better element printing:
- // truncate if too long.
- throw new IllegalArgumentException(
- "unable to encode key " + key + " of input to " + transform +
- " using " + keyCoder,
- exn);
- }
- GroupingKey<K> groupingKey = new GroupingKey<>(key, encodedKey);
- List<V> values = groupingMap.get(groupingKey);
- if (values == null) {
- values = new ArrayList<V>();
- groupingMap.put(groupingKey, values);
- }
- values.add(value);
- }
-
- List<ValueWithMetadata<KV<K, Iterable<V>>>> outputElems =
- new ArrayList<>();
- for (Map.Entry<GroupingKey<K>, List<V>> entry : groupingMap.entrySet()) {
- GroupingKey<K> groupingKey = entry.getKey();
- K key = groupingKey.getKey();
- List<V> values = entry.getValue();
- values = context.randomizeIfUnordered(values, true /* inPlaceAllowed */);
- outputElems.add(ValueWithMetadata
- .of(WindowedValue.valueInEmptyWindows(KV.<K, Iterable<V>>of(key, values)))
- .withKey(key));
- }
-
- context.setPCollectionValuesWithMetadata(context.getOutput(transform),
- outputElems);
- }
-
- private static class GroupingKey<K> {
- private K key;
- private byte[] encodedKey;
-
- public GroupingKey(K key, byte[] encodedKey) {
- this.key = key;
- this.encodedKey = encodedKey;
- }
-
- public K getKey() {
- return key;
- }
-
- @Override
- public boolean equals(Object o) {
- if (o instanceof GroupingKey) {
- GroupingKey<?> that = (GroupingKey<?>) o;
- return Arrays.equals(this.encodedKey, that.encodedKey);
- } else {
- return false;
- }
- }
-
- @Override
- public int hashCode() {
- return Arrays.hashCode(encodedKey);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/IntraBundleParallelization.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/IntraBundleParallelization.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/IntraBundleParallelization.java
deleted file mode 100644
index b6497b7..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/IntraBundleParallelization.java
+++ /dev/null
@@ -1,346 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.options.GcsOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo;
-import com.google.cloud.dataflow.sdk.util.WindowingInternals;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-import com.google.common.base.Preconditions;
-import com.google.common.base.Throwables;
-
-import org.joda.time.Instant;
-
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Semaphore;
-import java.util.concurrent.atomic.AtomicReference;
-
-/**
- * Provides multi-threading of {@link DoFn}s, using threaded execution to
- * process multiple elements concurrently within a bundle.
- *
- * <p>Note that each Dataflow worker will already process multiple bundles
- * concurrently and usage of this class is meant only for cases where processing
- * elements from within a bundle is limited by blocking calls.
- *
- * <p>CPU intensive or IO intensive tasks are in general a poor fit for parallelization.
- * This is because a limited resource that is already maximally utilized does not
- * benefit from sub-division of work. The parallelization will increase the amount of time
- * to process each element yet the throughput for processing will remain relatively the same.
- * For example, if the local disk (an IO resource) has a maximum write rate of 10 MiB/s,
- * and processing each element requires writing 20 MiB to disk, then processing one element
- * to disk will take 2 seconds. Yet processing 3 elements concurrently (each getting an equal
- * share of the maximum write rate) will take at least 6 seconds to complete (there is additional
- * overhead in the extra parallelization).
- *
- * <p>To parallelize a {@link DoFn} to 10 threads:
- * <pre>{@code
- * PCollection<T> data = ...;
- * data.apply(
- * IntraBundleParallelization.of(new MyDoFn())
- * .withMaxParallelism(10));
- * }</pre>
- *
- * <p>An uncaught exception from the wrapped {@link DoFn} will result in the exception
- * being rethrown in later calls to {@link MultiThreadedIntraBundleProcessingDoFn#processElement}
- * or a call to {@link MultiThreadedIntraBundleProcessingDoFn#finishBundle}.
- */
-public class IntraBundleParallelization {
- /**
- * Creates an {@link IntraBundleParallelization} {@link PTransform} for the given
- * {@link DoFn} that processes elements using multiple threads.
- *
- * <p>Note that the specified {@code doFn} needs to be thread safe.
- */
- public static <InputT, OutputT> Bound<InputT, OutputT> of(DoFn<InputT, OutputT> doFn) {
- return new Unbound().of(doFn);
- }
-
- /**
- * Creates an {@link IntraBundleParallelization} {@link PTransform} with the specified
- * maximum concurrency level.
- */
- public static Unbound withMaxParallelism(int maxParallelism) {
- return new Unbound().withMaxParallelism(maxParallelism);
- }
-
- /**
- * An incomplete {@code IntraBundleParallelization} transform, with unbound input/output types.
- *
- * <p>Before being applied, {@link IntraBundleParallelization.Unbound#of} must be
- * invoked to specify the {@link DoFn} to invoke, which will also
- * bind the input/output types of this {@code PTransform}.
- */
- public static class Unbound {
- private final int maxParallelism;
-
- Unbound() {
- this(DEFAULT_MAX_PARALLELISM);
- }
-
- Unbound(int maxParallelism) {
- Preconditions.checkArgument(maxParallelism > 0,
- "Expected parallelism factor greater than zero, received %s.", maxParallelism);
- this.maxParallelism = maxParallelism;
- }
-
- /**
- * Returns a new {@link IntraBundleParallelization} {@link PTransform} like this one
- * with the specified maximum concurrency level.
- */
- public Unbound withMaxParallelism(int maxParallelism) {
- return new Unbound(maxParallelism);
- }
-
- /**
- * Returns a new {@link IntraBundleParallelization} {@link PTransform} like this one
- * with the specified {@link DoFn}.
- *
- * <p>Note that the specified {@code doFn} needs to be thread safe.
- */
- public <InputT, OutputT> Bound<InputT, OutputT> of(DoFn<InputT, OutputT> doFn) {
- return new Bound<>(doFn, maxParallelism);
- }
- }
-
- /**
- * A {@code PTransform} that, when applied to a {@code PCollection<InputT>},
- * invokes a user-specified {@code DoFn<InputT, OutputT>} on all its elements,
- * with all its outputs collected into an output
- * {@code PCollection<OutputT>}.
- *
- * <p>Note that the specified {@code doFn} needs to be thread safe.
- *
- * @param <InputT> the type of the (main) input {@code PCollection} elements
- * @param <OutputT> the type of the (main) output {@code PCollection} elements
- */
- public static class Bound<InputT, OutputT>
- extends PTransform<PCollection<? extends InputT>, PCollection<OutputT>> {
- private final DoFn<InputT, OutputT> doFn;
- private final int maxParallelism;
-
- Bound(DoFn<InputT, OutputT> doFn, int maxParallelism) {
- Preconditions.checkArgument(maxParallelism > 0,
- "Expected parallelism factor greater than zero, received %s.", maxParallelism);
- this.doFn = doFn;
- this.maxParallelism = maxParallelism;
- }
-
- /**
- * Returns a new {@link IntraBundleParallelization} {@link PTransform} like this one
- * with the specified maximum concurrency level.
- */
- public Bound<InputT, OutputT> withMaxParallelism(int maxParallelism) {
- return new Bound<>(doFn, maxParallelism);
- }
-
- /**
- * Returns a new {@link IntraBundleParallelization} {@link PTransform} like this one
- * with the specified {@link DoFn}.
- *
- * <p>Note that the specified {@code doFn} needs to be thread safe.
- */
- public <NewInputT, NewOutputT> Bound<NewInputT, NewOutputT>
- of(DoFn<NewInputT, NewOutputT> doFn) {
- return new Bound<>(doFn, maxParallelism);
- }
-
- @Override
- public PCollection<OutputT> apply(PCollection<? extends InputT> input) {
- return input.apply(
- ParDo.of(new MultiThreadedIntraBundleProcessingDoFn<>(doFn, maxParallelism)));
- }
- }
-
- /**
- * A multi-threaded {@code DoFn} wrapper.
- *
- * @see IntraBundleParallelization#of(DoFn)
- *
- * @param <InputT> the type of the (main) input elements
- * @param <OutputT> the type of the (main) output elements
- */
- public static class MultiThreadedIntraBundleProcessingDoFn<InputT, OutputT>
- extends DoFn<InputT, OutputT> {
-
- public MultiThreadedIntraBundleProcessingDoFn(DoFn<InputT, OutputT> doFn, int maxParallelism) {
- Preconditions.checkArgument(maxParallelism > 0,
- "Expected parallelism factor greater than zero, received %s.", maxParallelism);
- this.doFn = doFn;
- this.maxParallelism = maxParallelism;
- }
-
- @Override
- public void startBundle(Context c) throws Exception {
- doFn.startBundle(c);
-
- executor = c.getPipelineOptions().as(GcsOptions.class).getExecutorService();
- workTickets = new Semaphore(maxParallelism);
- failure = new AtomicReference<>();
- }
-
- @Override
- public void processElement(final ProcessContext c) throws Exception {
- try {
- workTickets.acquire();
- } catch (InterruptedException e) {
- throw new RuntimeException("Interrupted while scheduling work", e);
- }
-
- if (failure.get() != null) {
- throw Throwables.propagate(failure.get());
- }
-
- executor.submit(new Runnable() {
- @Override
- public void run() {
- try {
- doFn.processElement(new WrappedContext(c));
- } catch (Throwable t) {
- failure.compareAndSet(null, t);
- Throwables.propagateIfPossible(t);
- throw new AssertionError("Unexpected checked exception: " + t);
- } finally {
- workTickets.release();
- }
- }
- });
- }
-
- @Override
- public void finishBundle(Context c) throws Exception {
- // Acquire all the work tickets to guarantee that all the previous
- // processElement calls have finished.
- workTickets.acquire(maxParallelism);
- if (failure.get() != null) {
- throw Throwables.propagate(failure.get());
- }
- doFn.finishBundle(c);
- }
-
- @Override
- protected TypeDescriptor<InputT> getInputTypeDescriptor() {
- return doFn.getInputTypeDescriptor();
- }
-
- @Override
- protected TypeDescriptor<OutputT> getOutputTypeDescriptor() {
- return doFn.getOutputTypeDescriptor();
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * Wraps a DoFn context, forcing single-thread output so that threads don't
- * propagate through to downstream functions.
- */
- private class WrappedContext extends ProcessContext {
- private final ProcessContext context;
-
- WrappedContext(ProcessContext context) {
- this.context = context;
- }
-
- @Override
- public InputT element() {
- return context.element();
- }
-
- @Override
- public PipelineOptions getPipelineOptions() {
- return context.getPipelineOptions();
- }
-
- @Override
- public <T> T sideInput(PCollectionView<T> view) {
- return context.sideInput(view);
- }
-
- @Override
- public void output(OutputT output) {
- synchronized (MultiThreadedIntraBundleProcessingDoFn.this) {
- context.output(output);
- }
- }
-
- @Override
- public void outputWithTimestamp(OutputT output, Instant timestamp) {
- synchronized (MultiThreadedIntraBundleProcessingDoFn.this) {
- context.outputWithTimestamp(output, timestamp);
- }
- }
-
- @Override
- public <T> void sideOutput(TupleTag<T> tag, T output) {
- synchronized (MultiThreadedIntraBundleProcessingDoFn.this) {
- context.sideOutput(tag, output);
- }
- }
-
- @Override
- public <T> void sideOutputWithTimestamp(TupleTag<T> tag, T output, Instant timestamp) {
- synchronized (MultiThreadedIntraBundleProcessingDoFn.this) {
- context.sideOutputWithTimestamp(tag, output, timestamp);
- }
- }
-
- @Override
- public Instant timestamp() {
- return context.timestamp();
- }
-
- @Override
- public BoundedWindow window() {
- return context.window();
- }
-
- @Override
- public PaneInfo pane() {
- return context.pane();
- }
-
- @Override
- public WindowingInternals<InputT, OutputT> windowingInternals() {
- return context.windowingInternals();
- }
-
- @Override
- protected <AggInputT, AggOutputT> Aggregator<AggInputT, AggOutputT> createAggregatorInternal(
- String name, CombineFn<AggInputT, ?, AggOutputT> combiner) {
- return context.createAggregatorInternal(name, combiner);
- }
- }
-
- private final DoFn<InputT, OutputT> doFn;
- private int maxParallelism;
-
- private transient ExecutorService executor;
- private transient Semaphore workTickets;
- private transient AtomicReference<Throwable> failure;
- }
-
- /**
- * Default maximum for number of concurrent elements to process.
- */
- private static final int DEFAULT_MAX_PARALLELISM = 16;
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Keys.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Keys.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Keys.java
deleted file mode 100644
index 370d43d..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Keys.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-/**
- * {@code Keys<K>} takes a {@code PCollection} of {@code KV<K, V>}s and
- * returns a {@code PCollection<K>} of the keys.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<KV<String, Long>> wordCounts = ...;
- * PCollection<String> words = wordCounts.apply(Keys.<String>create());
- * } </pre>
- *
- * <p>Each output element has the same timestamp and is in the same windows
- * as its corresponding input element, and the output {@code PCollection}
- * has the same
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * associated with it as the input.
- *
- * <p>See also {@link Values}.
- *
- * @param <K> the type of the keys in the input {@code PCollection},
- * and the type of the elements in the output {@code PCollection}
- */
-public class Keys<K> extends PTransform<PCollection<? extends KV<K, ?>>,
- PCollection<K>> {
- /**
- * Returns a {@code Keys<K>} {@code PTransform}.
- *
- * @param <K> the type of the keys in the input {@code PCollection},
- * and the type of the elements in the output {@code PCollection}
- */
- public static <K> Keys<K> create() {
- return new Keys<>();
- }
-
- private Keys() { }
-
- @Override
- public PCollection<K> apply(PCollection<? extends KV<K, ?>> in) {
- return
- in.apply(ParDo.named("Keys")
- .of(new DoFn<KV<K, ?>, K>() {
- @Override
- public void processElement(ProcessContext c) {
- c.output(c.element().getKey());
- }
- }));
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/KvSwap.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/KvSwap.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/KvSwap.java
deleted file mode 100644
index 5a9cc87..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/KvSwap.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-/**
- * {@code KvSwap<K, V>} takes a {@code PCollection<KV<K, V>>} and
- * returns a {@code PCollection<KV<V, K>>}, where all the keys and
- * values have been swapped.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<KV<String, Long>> wordsToCounts = ...;
- * PCollection<KV<Long, String>> countsToWords =
- * wordsToCounts.apply(KvSwap.<String, Long>create());
- * } </pre>
- *
- * <p>Each output element has the same timestamp and is in the same windows
- * as its corresponding input element, and the output {@code PCollection}
- * has the same
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * associated with it as the input.
- *
- * @param <K> the type of the keys in the input {@code PCollection}
- * and the values in the output {@code PCollection}
- * @param <V> the type of the values in the input {@code PCollection}
- * and the keys in the output {@code PCollection}
- */
-public class KvSwap<K, V> extends PTransform<PCollection<KV<K, V>>,
- PCollection<KV<V, K>>> {
- /**
- * Returns a {@code KvSwap<K, V>} {@code PTransform}.
- *
- * @param <K> the type of the keys in the input {@code PCollection}
- * and the values in the output {@code PCollection}
- * @param <V> the type of the values in the input {@code PCollection}
- * and the keys in the output {@code PCollection}
- */
- public static <K, V> KvSwap<K, V> create() {
- return new KvSwap<>();
- }
-
- private KvSwap() { }
-
- @Override
- public PCollection<KV<V, K>> apply(PCollection<KV<K, V>> in) {
- return
- in.apply(ParDo.named("KvSwap")
- .of(new DoFn<KV<K, V>, KV<V, K>>() {
- @Override
- public void processElement(ProcessContext c) {
- KV<K, V> e = c.element();
- c.output(KV.of(e.getValue(), e.getKey()));
- }
- }));
- }
-}
[26/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/ApproximateQuantiles.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/ApproximateQuantiles.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/ApproximateQuantiles.java
deleted file mode 100644
index 57dd510..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/ApproximateQuantiles.java
+++ /dev/null
@@ -1,766 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.coders.BigEndianIntegerCoder;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.coders.CustomCoder;
-import com.google.cloud.dataflow.sdk.coders.ListCoder;
-import com.google.cloud.dataflow.sdk.transforms.Combine.AccumulatingCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.Combine.AccumulatingCombineFn.Accumulator;
-import com.google.cloud.dataflow.sdk.util.WeightedValue;
-import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObserver;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Iterators;
-import com.google.common.collect.Lists;
-import com.google.common.collect.UnmodifiableIterator;
-
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-import java.util.PriorityQueue;
-
-import javax.annotation.Nullable;
-
-/**
- * {@code PTransform}s for getting an idea of a {@code PCollection}'s
- * data distribution using approximate {@code N}-tiles (e.g. quartiles,
- * percentiles, etc.), either globally or per-key.
- */
-public class ApproximateQuantiles {
- private ApproximateQuantiles() {
- // do not instantiate
- }
-
- /**
- * Returns a {@code PTransform} that takes a {@code PCollection<T>}
- * and returns a {@code PCollection<List<T>>} whose single value is a
- * {@code List} of the approximate {@code N}-tiles of the elements
- * of the input {@code PCollection}. This gives an idea of the
- * distribution of the input elements.
- *
- * <p>The computed {@code List} is of size {@code numQuantiles},
- * and contains the input elements' minimum value,
- * {@code numQuantiles-2} intermediate values, and maximum value, in
- * sorted order, using the given {@code Comparator} to order values.
- * To compute traditional {@code N}-tiles, one should use
- * {@code ApproximateQuantiles.globally(N+1, compareFn)}.
- *
- * <p>If there are fewer input elements than {@code numQuantiles},
- * then the result {@code List} will contain all the input elements,
- * in sorted order.
- *
- * <p>The argument {@code Comparator} must be {@code Serializable}.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<String> pc = ...;
- * PCollection<List<String>> quantiles =
- * pc.apply(ApproximateQuantiles.globally(11, stringCompareFn));
- * } </pre>
- *
- * @param <T> the type of the elements in the input {@code PCollection}
- * @param numQuantiles the number of elements in the resulting
- * quantile values {@code List}
- * @param compareFn the function to use to order the elements
- */
- public static <T, ComparatorT extends Comparator<T> & Serializable>
- PTransform<PCollection<T>, PCollection<List<T>>> globally(
- int numQuantiles, ComparatorT compareFn) {
- return Combine.globally(
- ApproximateQuantilesCombineFn.create(numQuantiles, compareFn));
- }
-
- /**
- * Like {@link #globally(int, Comparator)}, but sorts using the
- * elements' natural ordering.
- *
- * @param <T> the type of the elements in the input {@code PCollection}
- * @param numQuantiles the number of elements in the resulting
- * quantile values {@code List}
- */
- public static <T extends Comparable<T>>
- PTransform<PCollection<T>, PCollection<List<T>>> globally(int numQuantiles) {
- return Combine.globally(
- ApproximateQuantilesCombineFn.<T>create(numQuantiles));
- }
-
- /**
- * Returns a {@code PTransform} that takes a
- * {@code PCollection<KV<K, V>>} and returns a
- * {@code PCollection<KV<K, List<V>>>} that contains an output
- * element mapping each distinct key in the input
- * {@code PCollection} to a {@code List} of the approximate
- * {@code N}-tiles of the values associated with that key in the
- * input {@code PCollection}. This gives an idea of the
- * distribution of the input values for each key.
- *
- * <p>Each of the computed {@code List}s is of size {@code numQuantiles},
- * and contains the input values' minimum value,
- * {@code numQuantiles-2} intermediate values, and maximum value, in
- * sorted order, using the given {@code Comparator} to order values.
- * To compute traditional {@code N}-tiles, one should use
- * {@code ApproximateQuantiles.perKey(N+1, compareFn)}.
- *
- * <p>If a key has fewer than {@code numQuantiles} values
- * associated with it, then that key's output {@code List} will
- * contain all the key's input values, in sorted order.
- *
- * <p>The argument {@code Comparator} must be {@code Serializable}.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<KV<Integer, String>> pc = ...;
- * PCollection<KV<Integer, List<String>>> quantilesPerKey =
- * pc.apply(ApproximateQuantiles.<Integer, String>perKey(11, stringCompareFn));
- * } </pre>
- *
- * <p>See {@link Combine.PerKey} for how this affects timestamps and windowing.
- *
- * @param <K> the type of the keys in the input and output
- * {@code PCollection}s
- * @param <V> the type of the values in the input {@code PCollection}
- * @param numQuantiles the number of elements in the resulting
- * quantile values {@code List}
- * @param compareFn the function to use to order the elements
- */
- public static <K, V, ComparatorT extends Comparator<V> & Serializable>
- PTransform<PCollection<KV<K, V>>, PCollection<KV<K, List<V>>>>
- perKey(int numQuantiles, ComparatorT compareFn) {
- return Combine.perKey(
- ApproximateQuantilesCombineFn.create(numQuantiles, compareFn)
- .<K>asKeyedFn());
- }
-
- /**
- * Like {@link #perKey(int, Comparator)}, but sorts
- * values using their natural ordering.
- *
- * @param <K> the type of the keys in the input and output
- * {@code PCollection}s
- * @param <V> the type of the values in the input {@code PCollection}
- * @param numQuantiles the number of elements in the resulting
- * quantile values {@code List}
- */
- public static <K, V extends Comparable<V>>
- PTransform<PCollection<KV<K, V>>, PCollection<KV<K, List<V>>>>
- perKey(int numQuantiles) {
- return Combine.perKey(
- ApproximateQuantilesCombineFn.<V>create(numQuantiles)
- .<K>asKeyedFn());
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * The {@code ApproximateQuantilesCombineFn} combiner gives an idea
- * of the distribution of a collection of values using approximate
- * {@code N}-tiles. The output of this combiner is a {@code List}
- * of size {@code numQuantiles}, containing the input values'
- * minimum value, {@code numQuantiles-2} intermediate values, and
- * maximum value, in sorted order, so for traditional
- * {@code N}-tiles, one should use
- * {@code ApproximateQuantilesCombineFn#create(N+1)}.
- *
- * <p>If there are fewer values to combine than
- * {@code numQuantiles}, then the result {@code List} will contain all the
- * values being combined, in sorted order.
- *
- * <p>Values are ordered using either a specified
- * {@code Comparator} or the values' natural ordering.
- *
- * <p>To evaluate the quantiles we use the "New Algorithm" described here:
- * <pre>
- * [MRL98] Manku, Rajagopalan & Lindsay, "Approximate Medians and other
- * Quantiles in One Pass and with Limited Memory", Proc. 1998 ACM
- * SIGMOD, Vol 27, No 2, p 426-435, June 1998.
- * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.6.6513&rep=rep1&type=pdf
- * </pre>
- *
- * <p>The default error bound is {@code 1 / N}, though in practice
- * the accuracy tends to be much better. See
- * {@link #create(int, Comparator, long, double)} for
- * more information about the meaning of {@code epsilon}, and
- * {@link #withEpsilon} for a convenient way to adjust it.
- *
- * @param <T> the type of the values being combined
- */
- public static class ApproximateQuantilesCombineFn
- <T, ComparatorT extends Comparator<T> & Serializable>
- extends AccumulatingCombineFn<T, QuantileState<T, ComparatorT>, List<T>> {
-
- /**
- * The cost (in time and space) to compute quantiles to a given
- * accuracy is a function of the total number of elements in the
- * data set. If an estimate is not known or specified, we use
- * this as an upper bound. If this is too low, errors may exceed
- * the requested tolerance; if too high, efficiency may be
- * non-optimal. The impact is logarithmic with respect to this
- * value, so this default should be fine for most uses.
- */
- public static final long DEFAULT_MAX_NUM_ELEMENTS = (long) 1e9;
-
- /** The comparison function to use. */
- private final ComparatorT compareFn;
-
- /**
- * Number of quantiles to produce. The size of the final output
- * list, including the minimum and maximum, is numQuantiles.
- */
- private final int numQuantiles;
-
- /** The size of the buffers, corresponding to k in the referenced paper. */
- private final int bufferSize;
-
- /** The number of buffers, corresponding to b in the referenced paper. */
- private final int numBuffers;
-
- private final long maxNumElements;
-
- /**
- * Creates a combiner from already-computed algorithm parameters. The constructor is
- * private; the {@code create} factories derive {@code bufferSize} and {@code numBuffers}.
- *
- * @param numQuantiles size of the output list, including minimum and maximum
- * @param compareFn the comparison function used to order values
- * @param bufferSize number of elements per buffer (k in the referenced paper)
- * @param numBuffers number of buffers (b in the referenced paper)
- * @param maxNumElements the element count the parameters were sized for
- * @throws IllegalArgumentException if {@code numQuantiles}, {@code bufferSize} or
- * {@code numBuffers} is less than 2
- */
- private ApproximateQuantilesCombineFn(
- int numQuantiles,
- ComparatorT compareFn,
- int bufferSize,
- int numBuffers,
- long maxNumElements) {
- // Name the offending parameter so a failure is diagnosable from the message alone.
- Preconditions.checkArgument(
- numQuantiles >= 2, "numQuantiles must be at least 2, but was %s", numQuantiles);
- Preconditions.checkArgument(
- bufferSize >= 2, "bufferSize must be at least 2, but was %s", bufferSize);
- Preconditions.checkArgument(
- numBuffers >= 2, "numBuffers must be at least 2, but was %s", numBuffers);
- this.numQuantiles = numQuantiles;
- this.compareFn = compareFn;
- this.bufferSize = bufferSize;
- this.numBuffers = numBuffers;
- this.maxNumElements = maxNumElements;
- }
-
- /**
- * Returns an approximate quantiles combiner with the given
- * {@code compareFn} and desired number of quantiles. A total of
- * {@code numQuantiles} elements will appear in the output list,
- * including the minimum and maximum.
- *
- * <p>The {@code Comparator} must be {@code Serializable}.
- *
- * <p>The default error bound is {@code 1 / numQuantiles}, which
- * holds as long as the number of elements is less than
- * {@link #DEFAULT_MAX_NUM_ELEMENTS}.
- */
- public static <T, ComparatorT extends Comparator<T> & Serializable>
- ApproximateQuantilesCombineFn<T, ComparatorT> create(
- int numQuantiles, ComparatorT compareFn) {
- // The default error bound is 1 / numQuantiles.
- double defaultEpsilon = 1.0 / numQuantiles;
- return create(numQuantiles, compareFn, DEFAULT_MAX_NUM_ELEMENTS, defaultEpsilon);
- }
-
- /**
- * Like {@link #create(int, Comparator)}, but sorts values using their natural ordering.
- */
- public static <T extends Comparable<T>>
- ApproximateQuantilesCombineFn<T, Top.Largest<T>> create(int numQuantiles) {
- // Top.Largest supplies the comparator used for naturally ordered values.
- Top.Largest<T> naturalOrder = new Top.Largest<T>();
- return create(numQuantiles, naturalOrder);
- }
-
- /**
- * Returns an {@code ApproximateQuantilesCombineFn} that's like
- * this one except that it uses the specified {@code epsilon}
- * value. Does not modify this combiner.
- *
- * <p>See {@link #create(int, Comparator, long,
- * double)} for more information about the meaning of
- * {@code epsilon}.
- */
- public ApproximateQuantilesCombineFn<T, ComparatorT> withEpsilon(double epsilon) {
- return create(numQuantiles, compareFn, maxNumElements, epsilon);
- }
-
- /**
- * Returns an {@code ApproximateQuantilesCombineFn} that's like
- * this one except that it uses the specified {@code maxNumElements}
- * value. Does not modify this combiner.
- *
- * <p>This combiner does not retain the {@code epsilon} it was created with, so the
- * returned combiner uses the default error bound of {@code 1 / numQuantiles}. Call
- * {@link #withEpsilon} on the result to choose a different bound.
- *
- * <p>See {@link #create(int, Comparator, long, double)} for more
- * information about the meaning of {@code maxNumElements}.
- *
- * @param maxNumElements the expected upper bound on the number of input elements
- * @return a new combiner sized for {@code maxNumElements}
- */
- public ApproximateQuantilesCombineFn<T, ComparatorT> withMaxInputSize(
- long maxNumElements) {
- // The last argument is epsilon (an error fraction), not an element count.
- return create(numQuantiles, compareFn, maxNumElements, 1.0 / numQuantiles);
- }
-
- /**
- * Creates an approximate quantiles combiner with the given
- * {@code compareFn} and desired number of quantiles. A total of
- * {@code numQuantiles} elements will appear in the output list,
- * including the minimum and maximum.
- *
- * <p>The {@code Comparator} must be {@code Serializable}.
- *
- * <p>The default error bound is {@code epsilon}, which holds as long
- * as the number of elements is less than {@code maxNumElements}.
- * Specifically, if one considers the input as a sorted list x_1, ..., x_N,
- * then the distance between each exact quantile x_c and its
- * approximation x_c' is bounded by {@code |c - c'| < epsilon * N}.
- * Note that these errors are worst-case scenarios; in practice the accuracy
- * tends to be much better.
- *
- * @param numQuantiles size of the output list, including minimum and maximum
- * @param compareFn the comparison function used to order values
- * @param maxNumElements upper bound on the number of input elements; must be positive
- * @param epsilon the requested error bound as a fraction of the input size; must be positive
- * @throws IllegalArgumentException if {@code maxNumElements} or {@code epsilon} is not
- * positive, or if {@code numQuantiles} is less than 2
- */
- public static <T, ComparatorT extends Comparator<T> & Serializable>
- ApproximateQuantilesCombineFn<T, ComparatorT> create(
- int numQuantiles,
- ComparatorT compareFn,
- long maxNumElements,
- double epsilon) {
- Preconditions.checkArgument(
- maxNumElements > 0, "maxNumElements must be positive, but was %s", maxNumElements);
- Preconditions.checkArgument(epsilon > 0, "epsilon must be positive, but was %s", epsilon);
- // Compute optimal b (number of buffers) and k (buffer size).
- double target = epsilon * maxNumElements;
- int b = 2;
- // Evaluate (b - 2) * 2^(b - 2) in double arithmetic: the int form wraps around once the
- // shift reaches 31 bits, after which the loop condition could stay true forever.
- while (Math.scalb((double) (b - 2), b - 2) < target) {
- b++;
- }
- b--;
- // Divide in floating point so that Math.ceil really rounds up; an integer division
- // would already have truncated the quotient.
- int k = Math.max(2, (int) Math.ceil(maxNumElements / Math.scalb(1.0, b - 1)));
- return new ApproximateQuantilesCombineFn<T, ComparatorT>(
- numQuantiles, compareFn, k, b, maxNumElements);
- }
-
- @Override
- public QuantileState<T, ComparatorT> createAccumulator() {
- return QuantileState.empty(compareFn, numQuantiles, numBuffers, bufferSize);
- }
-
- @Override
- public Coder<QuantileState<T, ComparatorT>> getAccumulatorCoder(
- CoderRegistry registry, Coder<T> elementCoder) {
- // The registry is not consulted; the state coder is built from the element coder.
- Coder<QuantileState<T, ComparatorT>> stateCoder =
- new QuantileStateCoder<>(compareFn, elementCoder);
- return stateCoder;
- }
- }
-
- /**
- * Compact summarization of a collection on which quantiles can be estimated.
- */
- static class QuantileState<T, ComparatorT extends Comparator<T> & Serializable>
- implements Accumulator<T, QuantileState<T, ComparatorT>, List<T>> {
-
- private ComparatorT compareFn;
- private int numQuantiles;
- private int numBuffers;
- private int bufferSize;
-
- @Nullable
- private T min;
-
- @Nullable
- private T max;
-
- /**
- * The set of buffers, ordered by level from smallest to largest.
- */
- private PriorityQueue<QuantileBuffer<T>> buffers;
-
- /**
- * The algorithm requires that the manipulated buffers always be filled
- * to capacity to perform the collapse operation. This operation can
- * be extended to buffers of varying sizes by introducing the notion of
- * fractional weights, but it's easier to simply combine the remainders
- * from all shards into new, full buffers and then take them into account
- * when computing the final output.
- */
- private List<T> unbufferedElements = Lists.newArrayList();
-
- private QuantileState(
- ComparatorT compareFn,
- int numQuantiles,
- @Nullable T min,
- @Nullable T max,
- int numBuffers,
- int bufferSize,
- Collection<T> unbufferedElements,
- Collection<QuantileBuffer<T>> buffers) {
- // Configuration.
- this.compareFn = compareFn;
- this.numQuantiles = numQuantiles;
- this.numBuffers = numBuffers;
- this.bufferSize = bufferSize;
- // Extremes observed so far.
- this.min = min;
- this.max = max;
- // Copy the contents of both collections into collections owned by this state.
- this.unbufferedElements.addAll(unbufferedElements);
- PriorityQueue<QuantileBuffer<T>> queue = new PriorityQueue<>(numBuffers + 1);
- queue.addAll(buffers);
- this.buffers = queue;
- }
-
- public static <T, ComparatorT extends Comparator<T> & Serializable>
- QuantileState<T, ComparatorT> empty(
- ComparatorT compareFn, int numQuantiles, int numBuffers, int bufferSize) {
- // Nothing seen yet: no extremes, no pending elements, no buffers.
- List<T> noElements = Collections.<T>emptyList();
- List<QuantileBuffer<T>> noBuffers = Collections.<QuantileBuffer<T>>emptyList();
- return new QuantileState<T, ComparatorT>(
- compareFn, numQuantiles, null, null, numBuffers, bufferSize, noElements, noBuffers);
- }
-
- public static <T, ComparatorT extends Comparator<T> & Serializable>
- QuantileState<T, ComparatorT> singleton(
- ComparatorT compareFn, int numQuantiles, T elem, int numBuffers, int bufferSize) {
- // The single element is the minimum, the maximum, and the only pending element.
- List<T> onlyElement = Collections.singletonList(elem);
- List<QuantileBuffer<T>> noBuffers = Collections.<QuantileBuffer<T>>emptyList();
- return new QuantileState<T, ComparatorT>(
- compareFn, numQuantiles, elem, elem, numBuffers, bufferSize, onlyElement, noBuffers);
- }
-
- /**
- * Add a new element to the collection being summarized by this state.
- */
- @Override
- public void addInput(T elem) {
- if (isEmpty()) {
- // The first element is both extremes.
- min = elem;
- max = elem;
- } else {
- // An element can push out at most one of the two extremes.
- if (compareFn.compare(elem, min) < 0) {
- min = elem;
- } else if (compareFn.compare(elem, max) > 0) {
- max = elem;
- }
- }
- addUnbuffered(elem);
- }
-
- /**
- * Adds a new element to the unbuffered list. Once the list holds {@code bufferSize}
- * elements it is sorted and wrapped in a new level-0, weight-1 buffer, a fresh unbuffered
- * list is started, and buffers are collapsed if needed.
- */
- private void addUnbuffered(T elem) {
- unbufferedElements.add(elem);
- if (unbufferedElements.size() == bufferSize) {
- Collections.sort(unbufferedElements, compareFn);
- buffers.add(new QuantileBuffer<T>(unbufferedElements));
- unbufferedElements = Lists.newArrayListWithCapacity(bufferSize);
- collapseIfNeeded();
- }
- }
-
- /**
- * Updates this as if adding all elements seen by other.
- *
- * <p>Note that this ignores the {@code Comparator} of the other {@link QuantileState}. In
- * practice, they should generally be equal, but this method tolerates a mismatch.
- */
- @Override
- public void mergeAccumulator(QuantileState<T, ComparatorT> other) {
- if (other.isEmpty()) {
- return;
- }
- if (min == null || compareFn.compare(other.min, min) < 0) {
- min = other.min;
- }
- if (max == null || compareFn.compare(other.max, max) > 0) {
- max = other.max;
- }
- for (T elem : other.unbufferedElements) {
- addUnbuffered(elem);
- }
- buffers.addAll(other.buffers);
- collapseIfNeeded();
- }
-
- public boolean isEmpty() {
- // Empty means no pending elements and no buffers.
- return unbufferedElements.isEmpty() && buffers.isEmpty();
- }
-
- /**
- * Collapses the lowest-level buffers into a single buffer, repeatedly, until at most
- * {@code numBuffers} buffers remain.
- *
- * <p>NOTE(review): each pass polls two buffers unconditionally, so it relies on
- * {@code numBuffers >= 1}. A state rebuilt by the coder receives the decoded buffer count
- * as its {@code numBuffers}, which can be 0 -- confirm such a state cannot reach this
- * method holding a single buffer.
- */
- private void collapseIfNeeded() {
- while (buffers.size() > numBuffers) {
- List<QuantileBuffer<T>> toCollapse = Lists.newArrayList();
- // The queue orders buffers by level, so these are the two lowest-level buffers.
- toCollapse.add(buffers.poll());
- toCollapse.add(buffers.poll());
- // Also take every remaining buffer at the same level as the second one polled.
- int minLevel = toCollapse.get(1).level;
- while (!buffers.isEmpty() && buffers.peek().level == minLevel) {
- toCollapse.add(buffers.poll());
- }
- buffers.add(collapse(toCollapse));
- }
- }
-
- private QuantileBuffer<T> collapse(
- Iterable<QuantileBuffer<T>> buffers) {
- int mergedLevel = 0;
- long mergedWeight = 0;
- for (QuantileBuffer<T> input : buffers) {
- mergedWeight += input.weight;
- // The paper always collapses buffers of one shared (minimal) level, but buffers merged
- // from independently computed shards may differ, so the result sits one level above
- // the highest input.
- int candidateLevel = input.level + 1;
- if (candidateLevel > mergedLevel) {
- mergedLevel = candidateLevel;
- }
- }
- long firstPick = offset(mergedWeight);
- List<T> sampled = interpolate(buffers, bufferSize, mergedWeight, firstPick);
- return new QuantileBuffer<>(mergedLevel, mergedWeight, sampled);
- }
-
- /**
- * If the weight is even, we must round up or down. Alternate between these two options to
- * avoid a bias.
- */
- private long offset(long newWeight) {
- boolean oddWeight = newWeight % 2 == 1;
- if (oddWeight) {
- // An odd weight has an exact middle position.
- return (newWeight + 1) / 2;
- }
- // Even weight: flip the jitter between 0 and 2 so successive calls alternate between
- // rounding down and rounding up.
- offsetJitter = 2 - offsetJitter;
- return (newWeight + offsetJitter) / 2;
- }
-
- /** For alternating between biasing up and down in the above even weight collapse operation. */
- private int offsetJitter = 0;
-
-
- /**
- * Emulates taking the ordered union of all elements in buffers, repeated
- * according to their weight, and picking out the (k * step + offset)-th
- * elements of this list for {@code 0 <= k < count}.
- */
- private List<T> interpolate(Iterable<QuantileBuffer<T>> buffers,
- int count, double step, double offset) {
- // One weighted iterator per buffer; every element carries its buffer's weight.
- List<Iterator<WeightedValue<T>>> iterators = Lists.newArrayList();
- for (QuantileBuffer<T> buffer : buffers) {
- iterators.add(buffer.sizedIterator());
- }
- // Each of the buffers is already sorted by element.
- Iterator<WeightedValue<T>> sorted = Iterators.mergeSorted(
- iterators,
- new Comparator<WeightedValue<T>>() {
- @Override
- public int compare(WeightedValue<T> a, WeightedValue<T> b) {
- return compareFn.compare(a.getValue(), b.getValue());
- }
- });
-
- List<T> newElements = Lists.newArrayListWithCapacity(count);
- // NOTE(review): next() is called without a hasNext() check, so this presumably relies on
- // the buffers holding at least one element in total -- confirm against callers.
- WeightedValue<T> weightedElement = sorted.next();
- // Running total of the weights consumed so far, including weightedElement's own.
- double current = weightedElement.getWeight();
- for (int j = 0; j < count; j++) {
- double target = j * step + offset;
- // Advance until the running weight passes the target position or input runs out; the
- // cursor is never rewound, so one merged pass serves all count picks.
- while (current <= target && sorted.hasNext()) {
- weightedElement = sorted.next();
- current += weightedElement.getWeight();
- }
- newElements.add(weightedElement.getValue());
- }
- return newElements;
- }
-
- /**
- * Outputs numQuantiles elements consisting of the minimum, maximum, and
- * numQuantiles - 2 evenly spaced intermediate elements.
- *
- * <p>Returns the empty list if no elements have been added.
- */
- @Override
- public List<T> extractOutput() {
- if (isEmpty()) {
- return Lists.newArrayList();
- }
- // Each buffered element stands for `weight` inputs; pending elements count once each.
- long totalCount = unbufferedElements.size();
- for (QuantileBuffer<T> buffer : buffers) {
- totalCount += bufferSize * buffer.weight;
- }
- // Treat the sorted pending elements as one more weight-1 buffer.
- List<QuantileBuffer<T>> summaries = Lists.newArrayList(buffers);
- if (!unbufferedElements.isEmpty()) {
- Collections.sort(unbufferedElements, compareFn);
- summaries.add(new QuantileBuffer<>(unbufferedElements));
- }
- int intervals = numQuantiles - 1;
- double step = 1.0 * totalCount / intervals;
- double offset = (1.0 * totalCount - 1) / intervals;
- // Only the interior quantiles are interpolated; the exact extremes frame them.
- List<T> quantiles = interpolate(summaries, numQuantiles - 2, step, offset);
- quantiles.add(0, min);
- quantiles.add(max);
- return quantiles;
- }
- }
-
- /**
- * A single buffer in the sense of the referenced algorithm.
- */
- private static class QuantileBuffer<T> implements Comparable<QuantileBuffer<T>> {
- /** Collapse level of this buffer; buffers built directly from input are at level 0. */
- private int level;
- /** Number of input elements that each stored element stands for. */
- private long weight;
- /** The stored elements; callers pass them in already sorted. */
- private List<T> elements;
-
- /** Creates a level-0, weight-1 buffer over the given elements. */
- public QuantileBuffer(List<T> elements) {
- this(0, 1, elements);
- }
-
- /** Creates a buffer with an explicit level and weight; the list is stored, not copied. */
- public QuantileBuffer(int level, long weight, List<T> elements) {
- this.level = level;
- this.weight = weight;
- this.elements = elements;
- }
-
- /** Orders buffers by level only, lowest level first. */
- @Override
- public int compareTo(QuantileBuffer<T> other) {
- // Integer.compare cannot overflow, unlike subtracting the two levels.
- return Integer.compare(this.level, other.level);
- }
-
- @Override
- public String toString() {
- return "QuantileBuffer["
- + "level=" + level
- + ", weight="
- + weight + ", elements=" + elements + "]";
- }
-
- /** Returns an iterator that pairs every element with this buffer's weight. */
- public Iterator<WeightedValue<T>> sizedIterator() {
- return new UnmodifiableIterator<WeightedValue<T>>() {
- Iterator<T> iter = elements.iterator();
- @Override
- public boolean hasNext() {
- return iter.hasNext();
- }
- @Override public WeightedValue<T> next() {
- return WeightedValue.of(iter.next(), weight);
- }
- };
- }
- }
-
- /**
- * Coder for QuantileState.
- */
- private static class QuantileStateCoder<T, ComparatorT extends Comparator<T> & Serializable>
- extends CustomCoder<QuantileState<T, ComparatorT>> {
- private final ComparatorT compareFn;
- private final Coder<T> elementCoder;
- private final Coder<List<T>> elementListCoder;
- private final Coder<Integer> intCoder = BigEndianIntegerCoder.of();
-
- public QuantileStateCoder(ComparatorT compareFn, Coder<T> elementCoder) {
- // The list coder is derived once from the element coder and kept alongside it.
- this.elementListCoder = ListCoder.of(elementCoder);
- this.elementCoder = elementCoder;
- this.compareFn = compareFn;
- }
-
- @Override
- public void encode(
- QuantileState<T, ComparatorT> state, OutputStream outStream, Coder.Context context)
- throws CoderException, IOException {
- Coder.Context nested = context.nested();
- // Header: the two configuration ints, then the extremes.
- intCoder.encode(state.numQuantiles, outStream, nested);
- intCoder.encode(state.bufferSize, outStream, nested);
- elementCoder.encode(state.min, outStream, nested);
- elementCoder.encode(state.max, outStream, nested);
- // Pending elements, then a count-prefixed sequence of buffers.
- elementListCoder.encode(state.unbufferedElements, outStream, nested);
- int bufferCount = state.buffers.size();
- intCoder.encode(bufferCount, outStream, nested);
- for (QuantileBuffer<T> buffer : state.buffers) {
- encodeBuffer(buffer, outStream, nested);
- }
- }
-
- @Override
- public QuantileState<T, ComparatorT> decode(InputStream inStream, Coder.Context context)
- throws CoderException, IOException {
- // Fields are read in exactly the order in which encode() writes them.
- Coder.Context nestedContext = context.nested();
- int numQuantiles = intCoder.decode(inStream, nestedContext);
- int bufferSize = intCoder.decode(inStream, nestedContext);
- T min = elementCoder.decode(inStream, nestedContext);
- T max = elementCoder.decode(inStream, nestedContext);
- List<T> unbufferedElements =
- elementListCoder.decode(inStream, nestedContext);
- // NOTE(review): this is the number of buffers that were encoded, not the combiner's
- // configured buffer limit, yet it is passed below as the state's numBuffers. The
- // configured limit is not part of the encoding -- confirm this is intended.
- int numBuffers =
- BigEndianIntegerCoder.of().decode(inStream, nestedContext);
- List<QuantileBuffer<T>> buffers = new ArrayList<>(numBuffers);
- for (int i = 0; i < numBuffers; i++) {
- buffers.add(decodeBuffer(inStream, nestedContext));
- }
- // The comparator is not serialized with the state; this coder's own compareFn is used.
- return new QuantileState<T, ComparatorT>(
- compareFn, numQuantiles, min, max, numBuffers, bufferSize, unbufferedElements, buffers);
- }
-
- private void encodeBuffer(
- QuantileBuffer<T> buffer, OutputStream outStream, Coder.Context context)
- throws CoderException, IOException {
- // Fixed-size header (int level, long weight) followed by the element list.
- DataOutputStream header = new DataOutputStream(outStream);
- header.writeInt(buffer.level);
- header.writeLong(buffer.weight);
- elementListCoder.encode(buffer.elements, outStream, context);
- }
-
- private QuantileBuffer<T> decodeBuffer(
- InputStream inStream, Coder.Context context)
- throws IOException, CoderException {
- // Read back in write order: int level, long weight, then the element list.
- DataInputStream header = new DataInputStream(inStream);
- int level = header.readInt();
- long weight = header.readLong();
- List<T> elements = elementListCoder.decode(inStream, context);
- return new QuantileBuffer<>(level, weight, elements);
- }
-
- /**
- * Notifies ElementByteSizeObserver about the byte size of the
- * encoded value using this coder.
- *
- * <p>Every field that {@code encode} writes is reported, in the same order: the two
- * configuration ints, the extremes, the unbuffered elements, the buffer count and
- * finally each buffer.
- */
- @Override
- public void registerByteSizeObserver(
- QuantileState<T, ComparatorT> state,
- ElementByteSizeObserver observer,
- Coder.Context context)
- throws Exception {
- Coder.Context nestedContext = context.nested();
- // numQuantiles and bufferSize are part of the encoding and must be counted as well.
- intCoder.registerByteSizeObserver(state.numQuantiles, observer, nestedContext);
- intCoder.registerByteSizeObserver(state.bufferSize, observer, nestedContext);
- elementCoder.registerByteSizeObserver(
- state.min, observer, nestedContext);
- elementCoder.registerByteSizeObserver(
- state.max, observer, nestedContext);
- elementListCoder.registerByteSizeObserver(
- state.unbufferedElements, observer, nestedContext);
-
- intCoder.registerByteSizeObserver(
- state.buffers.size(), observer, nestedContext);
- for (QuantileBuffer<T> buffer : state.buffers) {
- // Per-buffer header: a 4-byte int level plus an 8-byte long weight.
- observer.update(4L + 8);
-
- elementListCoder.registerByteSizeObserver(
- buffer.elements, observer, nestedContext);
- }
- }
-
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- verifyDeterministic(
- "QuantileState.ElementCoder must be deterministic",
- elementCoder);
- verifyDeterministic(
- "QuantileState.ElementListCoder must be deterministic",
- elementListCoder);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/ApproximateUnique.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/ApproximateUnique.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/ApproximateUnique.java
deleted file mode 100644
index 3c936a2..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/ApproximateUnique.java
+++ /dev/null
@@ -1,419 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.Coder.Context;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.coders.KvCoder;
-import com.google.cloud.dataflow.sdk.coders.SerializableCoder;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.common.hash.Hashing;
-import com.google.common.hash.HashingOutputStream;
-import com.google.common.io.ByteStreams;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
-import java.util.PriorityQueue;
-
-/**
- * {@code PTransform}s for estimating the number of distinct elements
- * in a {@code PCollection}, or the number of distinct values
- * associated with each key in a {@code PCollection} of {@code KV}s.
- */
-public class ApproximateUnique {
-
- /**
- * Returns a {@code PTransform} that takes a {@code PCollection<T>}
- * and returns a {@code PCollection<Long>} containing a single value
- * that is an estimate of the number of distinct elements in the
- * input {@code PCollection}.
- *
- * <p>The {@code sampleSize} parameter controls the estimation
- * error. The error is about {@code 2 / sqrt(sampleSize)}, so for
- * {@code ApproximateUnique.globally(10000)} the estimation error is
- * about 2%. Similarly, for {@code ApproximateUnique.globally(16)} the
- * estimation error is about 50%. If there are fewer than
- * {@code sampleSize} distinct elements then the returned result
- * will be exact with extremely high probability (the chance of a
- * hash collision is about {@code sampleSize^2 / 2^65}).
- *
- * <p>This transform approximates the number of elements in a set
- * by computing the top {@code sampleSize} hash values, and using
- * that to extrapolate the size of the entire set of hash values by
- * assuming the rest of the hash values are as densely distributed
- * as the top {@code sampleSize}.
- *
- * <p>See also {@link #globally(double)}.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<String> pc = ...;
- * PCollection<Long> approxNumDistinct =
- * pc.apply(ApproximateUnique.<String>globally(1000));
- * } </pre>
- *
- * @param <T> the type of the elements in the input {@code PCollection}
- * @param sampleSize the number of entries in the statistical
- * sample; the higher this number, the more accurate the
- * estimate will be; should be {@code >= 16}
- * @throws IllegalArgumentException if the {@code sampleSize}
- * argument is too small
- */
- public static <T> Globally<T> globally(int sampleSize) {
- // The Globally constructor validates the sample size.
- Globally<T> transform = new Globally<>(sampleSize);
- return transform;
- }
-
- /**
- * Like {@link #globally(int)}, but specifies the desired maximum
- * estimation error instead of the sample size.
- *
- * @param <T> the type of the elements in the input {@code PCollection}
- * @param maximumEstimationError the maximum estimation error, which
- * should be in the range {@code [0.01, 0.5]}
- * @throws IllegalArgumentException if the
- * {@code maximumEstimationError} argument is out of range
- */
- public static <T> Globally<T> globally(double maximumEstimationError) {
- // The Globally constructor validates the error and derives the sample size from it.
- Globally<T> transform = new Globally<>(maximumEstimationError);
- return transform;
- }
-
- /**
- * Returns a {@code PTransform} that takes a
- * {@code PCollection<KV<K, V>>} and returns a
- * {@code PCollection<KV<K, Long>>} that contains an output element
- * mapping each distinct key in the input {@code PCollection} to an
- * estimate of the number of distinct values associated with that
- * key in the input {@code PCollection}.
- *
- * <p>See {@link #globally(int)} for an explanation of the
- * {@code sampleSize} parameter. A separate sampling is computed
- * for each distinct key of the input.
- *
- * <p>See also {@link #perKey(double)}.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<KV<Integer, String>> pc = ...;
- * PCollection<KV<Integer, Long>> approxNumDistinctPerKey =
- * pc.apply(ApproximateUnique.<Integer, String>perKey(1000));
- * } </pre>
- *
- * @param <K> the type of the keys in the input and output
- * {@code PCollection}s
- * @param <V> the type of the values in the input {@code PCollection}
- * @param sampleSize the number of entries in the statistical
- * sample; the higher this number, the more accurate the
- * estimate will be; should be {@code >= 16}
- * @throws IllegalArgumentException if the {@code sampleSize}
- * argument is too small
- */
- public static <K, V> PerKey<K, V> perKey(int sampleSize) {
- return new PerKey<>(sampleSize);
- }
-
- /**
- * Like {@link #perKey(int)}, but specifies the desired maximum
- * estimation error instead of the sample size.
- *
- * @param <K> the type of the keys in the input and output
- * {@code PCollection}s
- * @param <V> the type of the values in the input {@code PCollection}
- * @param maximumEstimationError the maximum estimation error, which
- * should be in the range {@code [0.01, 0.5]}
- * @throws IllegalArgumentException if the
- * {@code maximumEstimationError} argument is out of range
- */
- public static <K, V> PerKey<K, V> perKey(double maximumEstimationError) {
- return new PerKey<>(maximumEstimationError);
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * {@code PTransform} for estimating the number of distinct elements
- * in a {@code PCollection}.
- *
- * @param <T> the type of the elements in the input {@code PCollection}
- */
- static class Globally<T> extends PTransform<PCollection<T>, PCollection<Long>> {
-
- /**
- * The number of entries in the statistical sample; the higher this number,
- * the more accurate the estimate will be.
- */
- private final long sampleSize;
-
- /**
- * @see ApproximateUnique#globally(int)
- */
- public Globally(int sampleSize) {
- if (sampleSize < 16) {
- throw new IllegalArgumentException(
- "ApproximateUnique needs a sampleSize "
- + ">= 16 for an estimation error <= 50%. "
- + "In general, the estimation "
- + "error is about 2 / sqrt(sampleSize).");
- }
- this.sampleSize = sampleSize;
- }
-
- /**
- * @see ApproximateUnique#globally(double)
- */
- public Globally(double maximumEstimationError) {
- if (maximumEstimationError < 0.01 || maximumEstimationError > 0.5) {
- throw new IllegalArgumentException(
- "ApproximateUnique needs an "
- + "estimation error between 1% (0.01) and 50% (0.5).");
- }
- this.sampleSize = sampleSizeFromEstimationError(maximumEstimationError);
- }
-
- @Override
- public PCollection<Long> apply(PCollection<T> input) {
- Coder<T> coder = input.getCoder();
- return input.apply(
- Combine.globally(
- new ApproximateUniqueCombineFn<>(sampleSize, coder)));
- }
- }
-
- /**
- * {@code PTransform} for estimating the number of distinct values
- * associated with each key in a {@code PCollection} of {@code KV}s.
- *
- * @param <K> the type of the keys in the input and output
- * {@code PCollection}s
- * @param <V> the type of the values in the input {@code PCollection}
- */
- static class PerKey<K, V>
- extends PTransform<PCollection<KV<K, V>>, PCollection<KV<K, Long>>> {
-
- /** Number of entries kept in the statistical sample of each key. */
- private final long sampleSize;
-
- /**
- * @see ApproximateUnique#perKey(int)
- */
- public PerKey(int sampleSize) {
- boolean largeEnough = sampleSize >= 16;
- if (!largeEnough) {
- throw new IllegalArgumentException(
- "ApproximateUnique needs a sampleSize >= 16 for an estimation error <= 50%. "
- + "In general, the estimation error is about 2 / sqrt(sampleSize).");
- }
- this.sampleSize = sampleSize;
- }
-
- /**
- * @see ApproximateUnique#perKey(double)
- */
- public PerKey(double estimationError) {
- boolean tooSmall = estimationError < 0.01;
- boolean tooLarge = estimationError > 0.5;
- if (tooSmall || tooLarge) {
- throw new IllegalArgumentException(
- "ApproximateUnique.PerKey needs an estimation error between 1% (0.01) and 50% (0.5).");
- }
- this.sampleSize = sampleSizeFromEstimationError(estimationError);
- }
-
- /** Combines the values of every key, hashing them with the input's value coder. */
- @Override
- public PCollection<KV<K, Long>> apply(PCollection<KV<K, V>> input) {
- Coder<KV<K, V>> inputCoder = input.getCoder();
- if (inputCoder instanceof KvCoder) {
- @SuppressWarnings("unchecked")
- final Coder<V> valueCoder = ((KvCoder<K, V>) inputCoder).getValueCoder();
- ApproximateUniqueCombineFn<V> valueFn =
- new ApproximateUniqueCombineFn<>(sampleSize, valueCoder);
- return input.apply(Combine.perKey(valueFn.<K>asKeyedFn()));
- }
- throw new IllegalStateException(
- "ApproximateUnique.PerKey requires its input to use KvCoder");
- }
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * {@code CombineFn} that computes an estimate of the number of
- * distinct values that were combined.
- *
- * <p>Hashes input elements, computes the top {@code sampleSize}
- * hash values, and uses those to extrapolate the size of the entire
- * set of hash values by assuming the rest of the hash values are as
- * densely distributed as the top {@code sampleSize}.
- *
- * <p>Used to implement
- * {@link #globally(int) ApproximateUnique.globally(...)} and
- * {@link #perKey(int) ApproximateUnique.perKey(...)}.
- *
- * @param <T> the type of the values being combined
- */
- public static class ApproximateUniqueCombineFn<T> extends
- CombineFn<T, ApproximateUniqueCombineFn.LargestUnique, Long> {
-
- /**
- * The size of the space of hashes returned by the hash function.
- */
- static final double HASH_SPACE_SIZE =
- Long.MAX_VALUE - (double) Long.MIN_VALUE;
-
- /**
- * A heap utility class to efficiently track the largest added elements.
- */
- public static class LargestUnique implements Serializable {
- // Min-heap of the distinct values retained so far; its head is the smallest of them.
- private PriorityQueue<Long> heap = new PriorityQueue<>();
- // Maximum number of values the heap may hold.
- private final long sampleSize;
-
- /**
- * Creates a heap to track the largest {@code sampleSize} elements.
- *
- * @param sampleSize the size of the heap
- */
- public LargestUnique(long sampleSize) {
- this.sampleSize = sampleSize;
- }
-
- /**
- * Adds a value to the heap, returning whether the value is (large enough
- * to be) in the heap.
- *
- * <p>A value that is already present is not added again but still yields {@code true}.
- * When the heap is full, a larger value replaces the current smallest one.
- */
- public boolean add(Long value) {
- if (heap.contains(value)) {
- return true;
- } else if (heap.size() < sampleSize) {
- heap.add(value);
- return true;
- } else if (value > heap.element()) {
- // Full heap: evict the smallest retained value to make room for the larger one.
- heap.remove();
- heap.add(value);
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Returns the values in the heap, ordered largest to smallest.
- *
- * <p>This drains the heap: every value is removed, so the heap is empty afterwards.
- */
- public List<Long> extractOrderedList() {
- // The only way to extract the order from the heap is element-by-element
- // from smallest to largest.
- Long[] array = new Long[heap.size()];
- // Fill the array from the back so that the smallest value lands in the last slot.
- for (int i = heap.size() - 1; i >= 0; i--) {
- array[i] = heap.remove();
- }
- return Arrays.asList(array);
- }
- }
-
- private final long sampleSize;
- private final Coder<T> coder;
-
- public ApproximateUniqueCombineFn(long sampleSize, Coder<T> coder) {
- // The coder is kept for hashing the encoded form of each input element.
- this.coder = coder;
- this.sampleSize = sampleSize;
- }
-
- @Override
- public LargestUnique createAccumulator() {
- return new LargestUnique(sampleSize);
- }
-
- /**
- * Hashes {@code input} using the coder and offers the hash to {@code heap}.
- *
- * @return the same {@code heap} instance, updated in place
- * @throws RuntimeException wrapping any exception raised while encoding or hashing
- */
- @Override
- public LargestUnique addInput(LargestUnique heap, T input) {
- try {
- heap.add(hash(input, coder));
- return heap;
- } catch (Exception e) {
- // Wrap exceptions only; JVM Errors (e.g. OutOfMemoryError) propagate unchanged rather
- // than being disguised as a RuntimeException.
- throw new RuntimeException(e);
- }
- }
-
- /**
- * Merges all heaps into the first one and returns it.
- *
- * <p>The first heap is updated in place and the remaining heaps are drained while their
- * values are read. An empty {@code heaps} yields a fresh, empty accumulator.
- */
- @Override
- public LargestUnique mergeAccumulators(Iterable<LargestUnique> heaps) {
- Iterator<LargestUnique> iterator = heaps.iterator();
- if (!iterator.hasNext()) {
- // Nothing to merge: return an empty accumulator rather than failing in next().
- return createAccumulator();
- }
- LargestUnique heap = iterator.next();
- while (iterator.hasNext()) {
- List<Long> largestHashes = iterator.next().extractOrderedList();
- for (long hash : largestHashes) {
- if (!heap.add(hash)) {
- break; // The remainder of this list is all smaller.
- }
- }
- }
- return heap;
- }
-
- /**
- * Returns the estimated number of distinct values. While the heap holds fewer than
- * {@code sampleSize} hashes, the number of retained hashes is returned directly; otherwise
- * the estimate is extrapolated from the smallest retained hash. Drains {@code heap}.
- */
- @Override
- public Long extractOutput(LargestUnique heap) {
- List<Long> largestHashes = heap.extractOrderedList();
- if (largestHashes.size() < sampleSize) {
- return (long) largestHashes.size();
- } else {
- // The list is ordered largest to smallest, so the last entry is the smallest hash.
- long smallestSampleHash = largestHashes.get(largestHashes.size() - 1);
- // Width of the hash range that the retained sample occupies.
- // NOTE(review): this is 0 when the smallest hash equals Long.MAX_VALUE, which makes
- // the divisions below degenerate -- confirm that case is acceptable.
- double sampleSpaceSize = Long.MAX_VALUE - (double) smallestSampleHash;
- // This formula takes into account the possibility of hash collisions,
- // which become more likely than not for 2^32 distinct elements.
- // Note that log(1+x) ~ x for small x, so for sampleSize << maxHash
- // log(1 - sampleSize/sampleSpace) / log(1 - 1/sampleSpace) ~ sampleSize
- // and hence estimate ~ sampleSize * HASH_SPACE_SIZE / sampleSpace
- // as one would expect.
- double estimate = Math.log1p(-sampleSize / sampleSpaceSize)
- / Math.log1p(-1 / sampleSpaceSize)
- * HASH_SPACE_SIZE / sampleSpaceSize;
- return Math.round(estimate);
- }
- }
-
- @Override
- public Coder<LargestUnique> getAccumulatorCoder(CoderRegistry registry,
- Coder<T> inputCoder) {
- // Neither argument is consulted; the accumulator is encoded via Java serialization.
- Coder<LargestUnique> heapCoder = SerializableCoder.of(LargestUnique.class);
- return heapCoder;
- }
-
- /**
- * Encodes the given element using the given coder and hashes the encoding.
- */
- static <T> long hash(T element, Coder<T> coder) throws CoderException, IOException {
- // The encoded bytes are only fed to the hasher; the underlying sink discards them.
- try (HashingOutputStream hashingSink =
- new HashingOutputStream(Hashing.murmur3_128(), ByteStreams.nullOutputStream())) {
- coder.encode(element, hashingSink, Context.OUTER);
- long digest = hashingSink.hash().asLong();
- return digest;
- }
- }
- }
-
- /**
- * Computes the sampleSize based on the desired estimation error.
- *
- * @param estimationError should be bounded by [0.01, 0.5]
- * @return the sample size needed for the desired estimation error
- */
- static long sampleSizeFromEstimationError(double estimationError) {
- // Invert error = 2 / sqrt(sampleSize), rounding the result up to a whole sample count.
- double squaredError = Math.pow(estimationError, 2.0);
- double exactSize = 4.0 / squaredError;
- return Math.round(Math.ceil(exactSize));
- }
-}
[21/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/MapElements.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/MapElements.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/MapElements.java
deleted file mode 100644
index 8997050..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/MapElements.java
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-/**
- * {@code PTransform}s for mapping a simple function over the elements of a {@link PCollection}.
- */
-public class MapElements<InputT, OutputT>
-extends PTransform<PCollection<InputT>, PCollection<OutputT>> {
-
- /**
- * For a {@code SerializableFunction<InputT, OutputT>} {@code fn} and output type descriptor,
- * returns a {@code PTransform} that takes an input {@code PCollection<InputT>} and returns
- * a {@code PCollection<OutputT>} containing {@code fn.apply(v)} for every element {@code v} in
- * the input.
- *
- * <p>Example of use in Java 8:
- * <pre>{@code
- * PCollection<Integer> wordLengths = words.apply(
- * MapElements.via((String word) -> word.length())
- * .withOutputType(new TypeDescriptor<Integer>() {});
- * }</pre>
- *
- * <p>In Java 7, the overload {@link #via(SimpleFunction)} is more concise as the output type
- * descriptor need not be provided.
- */
- public static <InputT, OutputT> MissingOutputTypeDescriptor<InputT, OutputT>
- via(SerializableFunction<InputT, OutputT> fn) {
- return new MissingOutputTypeDescriptor<>(fn);
- }
-
- /**
- * For a {@code SimpleFunction<InputT, OutputT>} {@code fn}, returns a {@code PTransform} that
- * takes an input {@code PCollection<InputT>} and returns a {@code PCollection<OutputT>}
- * containing {@code fn.apply(v)} for every element {@code v} in the input.
- *
- * <p>This overload is intended primarily for use in Java 7. In Java 8, the overload
- * {@link #via(SerializableFunction)} supports use of lambda for greater concision.
- *
- * <p>Example of use in Java 7:
- * <pre>{@code
- * PCollection<String> words = ...;
- * PCollection<Integer> wordsPerLine = words.apply(MapElements.via(
- * new SimpleFunction<String, Integer>() {
- * public Integer apply(String word) {
- * return word.length();
- * }
- * }));
- * }</pre>
- */
- public static <InputT, OutputT> MapElements<InputT, OutputT>
- via(final SimpleFunction<InputT, OutputT> fn) {
- return new MapElements<>(fn, fn.getOutputTypeDescriptor());
- }
-
- /**
- * An intermediate builder for a {@link MapElements} transform. To complete the transform, provide
- * an output type descriptor to {@link MissingOutputTypeDescriptor#withOutputType}. See
- * {@link #via(SerializableFunction)} for a full example of use.
- */
- public static final class MissingOutputTypeDescriptor<InputT, OutputT> {
-
- private final SerializableFunction<InputT, OutputT> fn;
-
- private MissingOutputTypeDescriptor(SerializableFunction<InputT, OutputT> fn) {
- this.fn = fn;
- }
-
- public MapElements<InputT, OutputT> withOutputType(TypeDescriptor<OutputT> outputType) {
- return new MapElements<>(fn, outputType);
- }
- }
-
- ///////////////////////////////////////////////////////////////////
-
- private final SerializableFunction<InputT, OutputT> fn;
- private final transient TypeDescriptor<OutputT> outputType;
-
- private MapElements(
- SerializableFunction<InputT, OutputT> fn,
- TypeDescriptor<OutputT> outputType) {
- this.fn = fn;
- this.outputType = outputType;
- }
-
- @Override
- public PCollection<OutputT> apply(PCollection<InputT> input) {
- return input.apply(ParDo.named("Map").of(new DoFn<InputT, OutputT>() {
- @Override
- public void processElement(ProcessContext c) {
- c.output(fn.apply(c.element()));
- }
- })).setTypeDescriptorInternal(outputType);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Max.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Max.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Max.java
deleted file mode 100644
index 8678e4f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Max.java
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.transforms.Combine.BinaryCombineFn;
-import com.google.cloud.dataflow.sdk.util.common.Counter;
-import com.google.cloud.dataflow.sdk.util.common.Counter.AggregationKind;
-import com.google.cloud.dataflow.sdk.util.common.CounterProvider;
-
-import java.io.Serializable;
-import java.util.Comparator;
-
-/**
- * {@code PTransform}s for computing the maximum of the elements in a {@code PCollection}, or the
- * maximum of the values associated with each key in a {@code PCollection} of {@code KV}s.
- *
- * <p>Example 1: get the maximum of a {@code PCollection} of {@code Double}s.
- * <pre> {@code
- * PCollection<Double> input = ...;
- * PCollection<Double> max = input.apply(Max.doublesGlobally());
- * } </pre>
- *
- * <p>Example 2: calculate the maximum of the {@code Integer}s
- * associated with each unique key (which is of type {@code String}).
- * <pre> {@code
- * PCollection<KV<String, Integer>> input = ...;
- * PCollection<KV<String, Integer>> maxPerKey = input
- * .apply(Max.<String>integersPerKey());
- * } </pre>
- */
-public class Max {
-
- private Max() {
- // do not instantiate
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<Integer>} and returns a
- * {@code PCollection<Integer>} whose contents is the maximum of the input {@code PCollection}'s
- * elements, or {@code Integer.MIN_VALUE} if there are no elements.
- */
- public static Combine.Globally<Integer, Integer> integersGlobally() {
- return Combine.globally(new MaxIntegerFn()).named("Max.Globally");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<KV<K, Integer>>} and
- * returns a {@code PCollection<KV<K, Integer>>} that contains an output element mapping each
- * distinct key in the input {@code PCollection} to the maximum of the values associated with that
- * key in the input {@code PCollection}.
- *
- * <p>See {@link Combine.PerKey} for how this affects timestamps and windowing.
- */
- public static <K> Combine.PerKey<K, Integer, Integer> integersPerKey() {
- return Combine.<K, Integer, Integer>perKey(new MaxIntegerFn()).named("Max.PerKey");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<Long>} and returns a {@code
- * PCollection<Long>} whose contents is the maximum of the input {@code PCollection}'s elements,
- * or {@code Long.MIN_VALUE} if there are no elements.
- */
- public static Combine.Globally<Long, Long> longsGlobally() {
- return Combine.globally(new MaxLongFn()).named("Max.Globally");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<KV<K, Long>>} and returns a
- * {@code PCollection<KV<K, Long>>} that contains an output element mapping each distinct key in
- * the input {@code PCollection} to the maximum of the values associated with that key in the
- * input {@code PCollection}.
- *
- * <p>See {@link Combine.PerKey} for how this affects timestamps and windowing.
- */
- public static <K> Combine.PerKey<K, Long, Long> longsPerKey() {
- return Combine.<K, Long, Long>perKey(new MaxLongFn()).named("Max.PerKey");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<Double>} and returns a
- * {@code PCollection<Double>} whose contents is the maximum of the input {@code PCollection}'s
- * elements, or {@code Double.NEGATIVE_INFINITY} if there are no elements.
- */
- public static Combine.Globally<Double, Double> doublesGlobally() {
- return Combine.globally(new MaxDoubleFn()).named("Max.Globally");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<KV<K, Double>>} and returns
- * a {@code PCollection<KV<K, Double>>} that contains an output element mapping each distinct key
- * in the input {@code PCollection} to the maximum of the values associated with that key in the
- * input {@code PCollection}.
- *
- * <p>See {@link Combine.PerKey} for how this affects timestamps and windowing.
- */
- public static <K> Combine.PerKey<K, Double, Double> doublesPerKey() {
- return Combine.<K, Double, Double>perKey(new MaxDoubleFn()).named("Max.PerKey");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<T>} and returns a {@code
- * PCollection<T>} whose contents is the maximum according to the natural ordering of {@code T}
- * of the input {@code PCollection}'s elements, or {@code null} if there are no elements.
- */
- public static <T extends Comparable<? super T>>
- Combine.Globally<T, T> globally() {
- return Combine.<T, T>globally(MaxFn.<T>naturalOrder()).named("Max.Globally");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<KV<K, T>>} and returns a
- * {@code PCollection<KV<K, T>>} that contains an output element mapping each distinct key in the
- * input {@code PCollection} to the maximum according to the natural ordering of {@code T} of the
- * values associated with that key in the input {@code PCollection}.
- *
- * <p>See {@link Combine.PerKey} for how this affects timestamps and windowing.
- */
- public static <K, T extends Comparable<? super T>>
- Combine.PerKey<K, T, T> perKey() {
- return Combine.<K, T, T>perKey(MaxFn.<T>naturalOrder()).named("Max.PerKey");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<T>} and returns a {@code
- * PCollection<T>} whose contents is the maximum of the input {@code PCollection}'s elements, or
- * {@code null} if there are no elements.
- */
- public static <T, ComparatorT extends Comparator<? super T> & Serializable>
- Combine.Globally<T, T> globally(ComparatorT comparator) {
- return Combine.<T, T>globally(MaxFn.of(comparator)).named("Max.Globally");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<KV<K, T>>} and returns a
- * {@code PCollection<KV<K, T>>} that contains one output element per key mapping each
- * to the maximum of the values associated with that key in the input {@code PCollection}.
- *
- * <p>See {@link Combine.PerKey} for how this affects timestamps and windowing.
- */
- public static <K, T, ComparatorT extends Comparator<? super T> & Serializable>
- Combine.PerKey<K, T, T> perKey(ComparatorT comparator) {
- return Combine.<K, T, T>perKey(MaxFn.of(comparator)).named("Max.PerKey");
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A {@code CombineFn} that computes the maximum of a collection of elements of type {@code T}
- * using an arbitrary {@link Comparator}, useful as an argument to {@link Combine#globally} or
- * {@link Combine#perKey}.
- *
- * @param <T> the type of the values being compared
- */
- public static class MaxFn<T> extends BinaryCombineFn<T> {
-
- private final T identity;
- private final Comparator<? super T> comparator;
-
- private <ComparatorT extends Comparator<? super T> & Serializable> MaxFn(
- T identity, ComparatorT comparator) {
- this.identity = identity;
- this.comparator = comparator;
- }
-
- public static <T, ComparatorT extends Comparator<? super T> & Serializable>
- MaxFn<T> of(T identity, ComparatorT comparator) {
- return new MaxFn<T>(identity, comparator);
- }
-
- public static <T, ComparatorT extends Comparator<? super T> & Serializable>
- MaxFn<T> of(ComparatorT comparator) {
- return new MaxFn<T>(null, comparator);
- }
-
- public static <T extends Comparable<? super T>> MaxFn<T> naturalOrder(T identity) {
- return new MaxFn<T>(identity, new Top.Largest<T>());
- }
-
- public static <T extends Comparable<? super T>> MaxFn<T> naturalOrder() {
- return new MaxFn<T>(null, new Top.Largest<T>());
- }
-
- @Override
- public T identity() {
- return identity;
- }
-
- @Override
- public T apply(T left, T right) {
- return comparator.compare(left, right) >= 0 ? left : right;
- }
- }
-
- /**
- * A {@code CombineFn} that computes the maximum of a collection of {@code Integer}s, useful as an
- * argument to {@link Combine#globally} or {@link Combine#perKey}.
- */
- public static class MaxIntegerFn extends MaxFn<Integer> implements
- CounterProvider<Integer> {
- public MaxIntegerFn() {
- super(Integer.MIN_VALUE, new Top.Largest<Integer>());
- }
-
- @Override
- public Counter<Integer> getCounter(String name) {
- return Counter.ints(name, AggregationKind.MAX);
- }
- }
-
- /**
- * A {@code CombineFn} that computes the maximum of a collection of {@code Long}s, useful as an
- * argument to {@link Combine#globally} or {@link Combine#perKey}.
- */
- public static class MaxLongFn extends MaxFn<Long> implements
- CounterProvider<Long> {
- public MaxLongFn() {
- super(Long.MIN_VALUE, new Top.Largest<Long>());
- }
-
- @Override
- public Counter<Long> getCounter(String name) {
- return Counter.longs(name, AggregationKind.MAX);
- }
- }
-
- /**
- * A {@code CombineFn} that computes the maximum of a collection of {@code Double}s, useful as an
- * argument to {@link Combine#globally} or {@link Combine#perKey}.
- */
- public static class MaxDoubleFn extends MaxFn<Double> implements
- CounterProvider<Double> {
- public MaxDoubleFn() {
- super(Double.NEGATIVE_INFINITY, new Top.Largest<Double>());
- }
-
- @Override
- public Counter<Double> getCounter(String name) {
- return Counter.doubles(name, AggregationKind.MAX);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Mean.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Mean.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Mean.java
deleted file mode 100644
index 7dccfb6..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Mean.java
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.coders.AtomicCoder;
-import com.google.cloud.dataflow.sdk.coders.BigEndianLongCoder;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.coders.DoubleCoder;
-import com.google.cloud.dataflow.sdk.transforms.Combine.AccumulatingCombineFn.Accumulator;
-import com.google.common.base.MoreObjects;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Objects;
-
-/**
- * {@code PTransform}s for computing the arithmetic mean
- * (a.k.a. average) of the elements in a {@code PCollection}, or the
- * mean of the values associated with each key in a
- * {@code PCollection} of {@code KV}s.
- *
- * <p>Example 1: get the mean of a {@code PCollection} of {@code Long}s.
- * <pre> {@code
- * PCollection<Long> input = ...;
- * PCollection<Double> mean = input.apply(Mean.<Long>globally());
- * } </pre>
- *
- * <p>Example 2: calculate the mean of the {@code Integer}s
- * associated with each unique key (which is of type {@code String}).
- * <pre> {@code
- * PCollection<KV<String, Integer>> input = ...;
- * PCollection<KV<String, Double>> meanPerKey =
- * input.apply(Mean.<String, Integer>perKey());
- * } </pre>
- */
-public class Mean {
-
- private Mean() { } // Namespace only
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<NumT>} and returns a
- * {@code PCollection<Double>} whose contents is the mean of the
- * input {@code PCollection}'s elements, or
- * {@code 0} if there are no elements.
- *
- * @param <NumT> the type of the {@code Number}s being combined
- */
- public static <NumT extends Number> Combine.Globally<NumT, Double> globally() {
- return Combine.<NumT, Double>globally(new MeanFn<>()).named("Mean.Globally");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input
- * {@code PCollection<KV<K, N>>} and returns a
- * {@code PCollection<KV<K, Double>>} that contains an output
- * element mapping each distinct key in the input
- * {@code PCollection} to the mean of the values associated with
- * that key in the input {@code PCollection}.
- *
- * <p>See {@link Combine.PerKey} for how this affects timestamps and bucketing.
- *
- * @param <K> the type of the keys
- * @param <NumT> the type of the {@code Number}s being combined
- */
- public static <K, NumT extends Number> Combine.PerKey<K, NumT, Double> perKey() {
- return Combine.<K, NumT, Double>perKey(new MeanFn<>()).named("Mean.PerKey");
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A {@code Combine.CombineFn} that computes the arithmetic mean
- * (a.k.a. average) of an {@code Iterable} of numbers of type
- * {@code N}, useful as an argument to {@link Combine#globally} or
- * {@link Combine#perKey}.
- *
- * <p>Returns {@code Double.NaN} if combining zero elements.
- *
- * @param <NumT> the type of the {@code Number}s being combined
- */
- static class MeanFn<NumT extends Number>
- extends Combine.AccumulatingCombineFn<NumT, CountSum<NumT>, Double> {
- /**
- * Constructs a combining function that computes the mean over
- * a collection of values of type {@code N}.
- */
- public MeanFn() {}
-
- @Override
- public CountSum<NumT> createAccumulator() {
- return new CountSum<>();
- }
-
- @Override
- public Coder<CountSum<NumT>> getAccumulatorCoder(
- CoderRegistry registry, Coder<NumT> inputCoder) {
- return new CountSumCoder<>();
- }
- }
-
- /**
- * Accumulator class for {@link MeanFn}.
- */
- static class CountSum<NumT extends Number>
- implements Accumulator<NumT, CountSum<NumT>, Double> {
-
- long count = 0;
- double sum = 0.0;
-
- public CountSum() {
- this(0, 0);
- }
-
- public CountSum(long count, double sum) {
- this.count = count;
- this.sum = sum;
- }
-
- @Override
- public void addInput(NumT element) {
- count++;
- sum += element.doubleValue();
- }
-
- @Override
- public void mergeAccumulator(CountSum<NumT> accumulator) {
- count += accumulator.count;
- sum += accumulator.sum;
- }
-
- @Override
- public Double extractOutput() {
- return count == 0 ? Double.NaN : sum / count;
- }
-
- @Override
- public boolean equals(Object other) {
- if (!(other instanceof CountSum)) {
- return false;
- }
- @SuppressWarnings("unchecked")
- CountSum<?> otherCountSum = (CountSum<?>) other;
- return (count == otherCountSum.count)
- && (sum == otherCountSum.sum);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(count, sum);
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(this)
- .add("count", count)
- .add("sum", sum)
- .toString();
- }
- }
-
- static class CountSumCoder<NumT extends Number>
- extends AtomicCoder<CountSum<NumT>> {
- private static final Coder<Long> LONG_CODER = BigEndianLongCoder.of();
- private static final Coder<Double> DOUBLE_CODER = DoubleCoder.of();
-
- @Override
- public void encode(CountSum<NumT> value, OutputStream outStream, Coder.Context context)
- throws CoderException, IOException {
- Coder.Context nestedContext = context.nested();
- LONG_CODER.encode(value.count, outStream, nestedContext);
- DOUBLE_CODER.encode(value.sum, outStream, nestedContext);
- }
-
- @Override
- public CountSum<NumT> decode(InputStream inStream, Coder.Context context)
- throws CoderException, IOException {
- Coder.Context nestedContext = context.nested();
- return new CountSum<>(
- LONG_CODER.decode(inStream, nestedContext),
- DOUBLE_CODER.decode(inStream, nestedContext));
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Min.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Min.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Min.java
deleted file mode 100644
index 47ab3a0..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Min.java
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.transforms.Combine.BinaryCombineFn;
-import com.google.cloud.dataflow.sdk.util.common.Counter;
-import com.google.cloud.dataflow.sdk.util.common.Counter.AggregationKind;
-import com.google.cloud.dataflow.sdk.util.common.CounterProvider;
-
-import java.io.Serializable;
-import java.util.Comparator;
-
-/**
- * {@code PTransform}s for computing the minimum of the elements in a {@code PCollection}, or the
- * minimum of the values associated with each key in a {@code PCollection} of {@code KV}s.
- *
- * <p>Example 1: get the minimum of a {@code PCollection} of {@code Double}s.
- * <pre> {@code
- * PCollection<Double> input = ...;
- * PCollection<Double> min = input.apply(Min.doublesGlobally());
- * } </pre>
- *
- * <p>Example 2: calculate the minimum of the {@code Integer}s
- * associated with each unique key (which is of type {@code String}).
- * <pre> {@code
- * PCollection<KV<String, Integer>> input = ...;
- * PCollection<KV<String, Integer>> minPerKey = input
- * .apply(Min.<String>integersPerKey());
- * } </pre>
- */
-public class Min {
-
- private Min() {
- // do not instantiate
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<Integer>} and returns a
- * {@code PCollection<Integer>} whose contents is a single value that is the minimum of the input
- * {@code PCollection}'s elements, or {@code Integer.MAX_VALUE} if there are no elements.
- */
- public static Combine.Globally<Integer, Integer> integersGlobally() {
- return Combine.globally(new MinIntegerFn()).named("Min.Globally");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<KV<K, Integer>>} and
- * returns a {@code PCollection<KV<K, Integer>>} that contains an output element mapping each
- * distinct key in the input {@code PCollection} to the minimum of the values associated with that
- * key in the input {@code PCollection}.
- *
- * <p>See {@link Combine.PerKey} for how this affects timestamps and windowing.
- */
- public static <K> Combine.PerKey<K, Integer, Integer> integersPerKey() {
- return Combine.<K, Integer, Integer>perKey(new MinIntegerFn()).named("Min.PerKey");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<Long>} and returns a {@code
- * PCollection<Long>} whose contents is the minimum of the input {@code PCollection}'s elements,
- * or {@code Long.MAX_VALUE} if there are no elements.
- */
- public static Combine.Globally<Long, Long> longsGlobally() {
- return Combine.globally(new MinLongFn()).named("Min.Globally");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<KV<K, Long>>} and returns a
- * {@code PCollection<KV<K, Long>>} that contains an output element mapping each distinct key in
- * the input {@code PCollection} to the minimum of the values associated with that key in the
- * input {@code PCollection}.
- *
- * <p>See {@link Combine.PerKey} for how this affects timestamps and windowing.
- */
- public static <K> Combine.PerKey<K, Long, Long> longsPerKey() {
- return Combine.<K, Long, Long>perKey(new MinLongFn()).named("Min.PerKey");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<Double>} and returns a
- * {@code PCollection<Double>} whose contents is the minimum of the input {@code PCollection}'s
- * elements, or {@code Double.POSITIVE_INFINITY} if there are no elements.
- */
- public static Combine.Globally<Double, Double> doublesGlobally() {
- return Combine.globally(new MinDoubleFn()).named("Min.Globally");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<KV<K, Double>>} and returns
- * a {@code PCollection<KV<K, Double>>} that contains an output element mapping each distinct key
- * in the input {@code PCollection} to the minimum of the values associated with that key in the
- * input {@code PCollection}.
- *
- * <p>See {@link Combine.PerKey} for how this affects timestamps and windowing.
- */
- public static <K> Combine.PerKey<K, Double, Double> doublesPerKey() {
- return Combine.<K, Double, Double>perKey(new MinDoubleFn()).named("Min.PerKey");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<T>} and returns a {@code
- * PCollection<T>} whose contents is the minimum according to the natural ordering of {@code T}
- * of the input {@code PCollection}'s elements, or {@code null} if there are no elements.
- */
- public static <T extends Comparable<? super T>>
- Combine.Globally<T, T> globally() {
- return Combine.<T, T>globally(MinFn.<T>naturalOrder()).named("Min.Globally");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<KV<K, T>>} and returns a
- * {@code PCollection<KV<K, T>>} that contains an output element mapping each distinct key in the
- * input {@code PCollection} to the minimum according to the natural ordering of {@code T} of the
- * values associated with that key in the input {@code PCollection}.
- *
- * <p>See {@link Combine.PerKey} for how this affects timestamps and windowing.
- */
- public static <K, T extends Comparable<? super T>>
- Combine.PerKey<K, T, T> perKey() {
- return Combine.<K, T, T>perKey(MinFn.<T>naturalOrder()).named("Min.PerKey");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<T>} and returns a {@code
- * PCollection<T>} whose contents is the minimum of the input {@code PCollection}'s elements, or
- * {@code null} if there are no elements.
- */
- public static <T, ComparatorT extends Comparator<? super T> & Serializable>
- Combine.Globally<T, T> globally(ComparatorT comparator) {
- return Combine.<T, T>globally(MinFn.of(comparator)).named("Min.Globally");
- }
-
- /**
- * Returns a {@code PTransform} that takes an input {@code PCollection<KV<K, T>>} and returns a
- * {@code PCollection<KV<K, T>>} that contains one output element per key mapping each
- * to the minimum of the values associated with that key in the input {@code PCollection}.
- *
- * <p>See {@link Combine.PerKey} for how this affects timestamps and windowing.
- */
- public static <K, T, ComparatorT extends Comparator<? super T> & Serializable>
- Combine.PerKey<K, T, T> perKey(ComparatorT comparator) {
- return Combine.<K, T, T>perKey(MinFn.of(comparator)).named("Min.PerKey");
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A {@code CombineFn} that computes the maximum of a collection of elements of type {@code T}
- * using an arbitrary {@link Comparator}, useful as an argument to {@link Combine#globally} or
- * {@link Combine#perKey}.
- *
- * @param <T> the type of the values being compared
- */
- public static class MinFn<T> extends BinaryCombineFn<T> {
-
- private final T identity;
- private final Comparator<? super T> comparator;
-
- private <ComparatorT extends Comparator<? super T> & Serializable> MinFn(
- T identity, ComparatorT comparator) {
- this.identity = identity;
- this.comparator = comparator;
- }
-
- public static <T, ComparatorT extends Comparator<? super T> & Serializable>
- MinFn<T> of(T identity, ComparatorT comparator) {
- return new MinFn<T>(identity, comparator);
- }
-
- public static <T, ComparatorT extends Comparator<? super T> & Serializable>
- MinFn<T> of(ComparatorT comparator) {
- return new MinFn<T>(null, comparator);
- }
-
- public static <T extends Comparable<? super T>> MinFn<T> naturalOrder(T identity) {
- return new MinFn<T>(identity, new Top.Largest<T>());
- }
-
- public static <T extends Comparable<? super T>> MinFn<T> naturalOrder() {
- return new MinFn<T>(null, new Top.Largest<T>());
- }
-
- @Override
- public T identity() {
- return identity;
- }
-
- @Override
- public T apply(T left, T right) {
- return comparator.compare(left, right) <= 0 ? left : right;
- }
- }
-
- /**
- * A {@code CombineFn} that computes the minimum of a collection of {@code Integer}s, useful as an
- * argument to {@link Combine#globally} or {@link Combine#perKey}.
- */
- public static class MinIntegerFn extends MinFn<Integer> implements
- CounterProvider<Integer> {
- public MinIntegerFn() {
- super(Integer.MAX_VALUE, new Top.Largest<Integer>());
- }
-
- @Override
- public Counter<Integer> getCounter(String name) {
- return Counter.ints(name, AggregationKind.MIN);
- }
- }
-
- /**
- * A {@code CombineFn} that computes the minimum of a collection of {@code Long}s, useful as an
- * argument to {@link Combine#globally} or {@link Combine#perKey}.
- */
- public static class MinLongFn extends MinFn<Long> implements
- CounterProvider<Long> {
- public MinLongFn() {
- super(Long.MAX_VALUE, new Top.Largest<Long>());
- }
-
- @Override
- public Counter<Long> getCounter(String name) {
- return Counter.longs(name, AggregationKind.MIN);
- }
- }
-
- /**
- * A {@code CombineFn} that computes the minimum of a collection of {@code Double}s, useful as an
- * argument to {@link Combine#globally} or {@link Combine#perKey}.
- */
- public static class MinDoubleFn extends MinFn<Double> implements
- CounterProvider<Double> {
- public MinDoubleFn() {
- super(Double.POSITIVE_INFINITY, new Top.Largest<Double>());
- }
-
- @Override
- public Counter<Double> getCounter(String name) {
- return Counter.doubles(name, AggregationKind.MIN);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/PTransform.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/PTransform.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/PTransform.java
deleted file mode 100644
index d4496b8..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/PTransform.java
+++ /dev/null
@@ -1,324 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.transforms.display.DisplayData.Builder;
-import com.google.cloud.dataflow.sdk.transforms.display.HasDisplayData;
-import com.google.cloud.dataflow.sdk.util.StringUtils;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.POutput;
-import com.google.cloud.dataflow.sdk.values.TypedPValue;
-
-import java.io.ObjectInputStream;
-import java.io.ObjectOutputStream;
-import java.io.Serializable;
-
-/**
- * A {@code PTransform<InputT, OutputT>} is an operation that takes an
- * {@code InputT} (some subtype of {@link PInput}) and produces an
- * {@code OutputT} (some subtype of {@link POutput}).
- *
- * <p>Common PTransforms include root PTransforms like
- * {@link com.google.cloud.dataflow.sdk.io.TextIO.Read},
- * {@link Create}, processing and
- * conversion operations like {@link ParDo},
- * {@link GroupByKey},
- * {@link com.google.cloud.dataflow.sdk.transforms.join.CoGroupByKey},
- * {@link Combine}, and {@link Count}, and outputting
- * PTransforms like
- * {@link com.google.cloud.dataflow.sdk.io.TextIO.Write}. Users also
- * define their own application-specific composite PTransforms.
- *
- * <p>Each {@code PTransform<InputT, OutputT>} has a single
- * {@code InputT} type and a single {@code OutputT} type. Many
- * PTransforms conceptually transform one input value to one output
- * value, and in this case {@code InputT} and {@code OutputT} are
- * typically instances of
- * {@link com.google.cloud.dataflow.sdk.values.PCollection}.
- * A root
- * PTransform conceptually has no input; in this case, conventionally
- * a {@link com.google.cloud.dataflow.sdk.values.PBegin} object
- * produced by calling {@link Pipeline#begin} is used as the input.
- * An outputting PTransform conceptually has no output; in this case,
- * conventionally {@link com.google.cloud.dataflow.sdk.values.PDone}
- * is used as its output type. Some PTransforms conceptually have
- * multiple inputs and/or outputs; in these cases special "bundling"
- * classes like
- * {@link com.google.cloud.dataflow.sdk.values.PCollectionList},
- * {@link com.google.cloud.dataflow.sdk.values.PCollectionTuple}
- * are used
- * to combine multiple values into a single bundle for passing into or
- * returning from the PTransform.
- *
- * <p>A {@code PTransform<InputT, OutputT>} is invoked by calling
- * {@code apply()} on its {@code InputT}, returning its {@code OutputT}.
- * Calls can be chained to concisely create linear pipeline segments.
- * For example:
- *
- * <pre> {@code
- * PCollection<T1> pc1 = ...;
- * PCollection<T2> pc2 =
- * pc1.apply(ParDo.of(new MyDoFn<T1,KV<K,V>>()))
- * .apply(GroupByKey.<K, V>create())
- * .apply(Combine.perKey(new MyKeyedCombineFn<K,V>()))
- * .apply(ParDo.of(new MyDoFn2<KV<K,V>,T2>()));
- * } </pre>
- *
- * <p>PTransform operations have unique names, which are used by the
- * system when explaining what's going on during optimization and
- * execution. Each PTransform gets a system-provided default name,
- * but it's a good practice to specify an explicit name, where
- * possible, using the {@code named()} method offered by some
- * PTransforms such as {@link ParDo}. For example:
- *
- * <pre> {@code
- * ...
- * .apply(ParDo.named("Step1").of(new MyDoFn3()))
- * ...
- * } </pre>
- *
- * <p>Each PCollection output produced by a PTransform,
- * either directly or within a "bundling" class, automatically gets
- * its own name derived from the name of its producing PTransform.
- *
- * <p>Each PCollection output produced by a PTransform
- * also records a {@link com.google.cloud.dataflow.sdk.coders.Coder}
- * that specifies how the elements of that PCollection
- * are to be encoded as a byte string, if necessary. The
- * PTransform may provide a default Coder for any of its outputs, for
- * instance by deriving it from the PTransform input's Coder. If the
- * PTransform does not specify the Coder for an output PCollection,
- * the system will attempt to infer a Coder for it, based on
- * what's known at run-time about the Java type of the output's
- * elements. The enclosing {@link Pipeline}'s
- * {@link com.google.cloud.dataflow.sdk.coders.CoderRegistry}
- * (accessible via {@link Pipeline#getCoderRegistry}) defines the
- * mapping from Java types to the default Coder to use, for a standard
- * set of Java types; users can extend this mapping for additional
- * types, via
- * {@link com.google.cloud.dataflow.sdk.coders.CoderRegistry#registerCoder}.
- * If this inference process fails, either because the Java type was
- * not known at run-time (e.g., due to Java's "erasure" of generic
- * types) or there was no default Coder registered, then the Coder
- * should be specified manually by calling
- * {@link com.google.cloud.dataflow.sdk.values.TypedPValue#setCoder}
- * on the output PCollection. The Coder of every output
- * PCollection must be determined one way or another
- * before that output is used as an input to another PTransform, or
- * before the enclosing Pipeline is run.
- *
- * <p>A small number of PTransforms are implemented natively by the
- * Google Cloud Dataflow SDK; such PTransforms simply return an
- * output value as their apply implementation.
- * The majority of PTransforms are
- * implemented as composites of other PTransforms. Such a PTransform
- * subclass typically just implements {@link #apply}, computing its
- * {@code OutputT} value from its {@code InputT} value. User programs are encouraged to
- * use this mechanism to modularize their own code. Such composite
- * abstractions get their own name, and navigating through the
- * composition hierarchy of PTransforms is supported by the monitoring
- * interface. Examples of composite PTransforms can be found in this
- * directory and in examples. From the caller's point of view, there
- * is no distinction between a PTransform implemented natively and one
- * implemented in terms of other PTransforms; both kinds of PTransform
- * are invoked in the same way, using {@code apply()}.
- *
- * <h3>Note on Serialization</h3>
- *
- * <p>{@code PTransform} doesn't actually support serialization, despite
- * implementing {@code Serializable}.
- *
- * <p>{@code PTransform} is marked {@code Serializable} solely
- * because it is common for an anonymous {@code DoFn}
- * instance to be created within an
- * {@code apply()} method of a composite {@code PTransform}.
- *
- * <p>Each of those {@code *Fn}s is {@code Serializable}, but
- * unfortunately its instance state will contain a reference to the
- * enclosing {@code PTransform} instance, and so attempt to serialize
- * the {@code PTransform} instance, even though the {@code *Fn}
- * instance never references anything about the enclosing
- * {@code PTransform}.
- *
- * <p>To allow such anonymous {@code *Fn}s to be written
- * conveniently, {@code PTransform} is marked as {@code Serializable},
- * and includes dummy {@code writeObject()} and {@code readObject()}
- * operations that do not save or restore any state.
- *
- * @see <a href=
- * "https://cloud.google.com/dataflow/java-sdk/applying-transforms"
- * >Applying Transformations</a>
- *
- * @param <InputT> the type of the input to this PTransform
- * @param <OutputT> the type of the output of this PTransform
- */
-public abstract class PTransform<InputT extends PInput, OutputT extends POutput>
- implements Serializable /* See the note above */, HasDisplayData {
- /**
- * Applies this {@code PTransform} on the given {@code InputT}, and returns its
- * {@code OutputT}.
- *
- * <p>Composite transforms, which are defined in terms of other transforms,
- * should return the output of one of the composed transforms. Non-composite
- * transforms, which do not apply any transforms internally, should return
- * a new unbound output and register evaluators (via backend-specific
- * registration methods).
- *
- * <p>The default implementation throws an exception. A derived class must
- * either implement apply, or else each runner must supply a custom
- * implementation via
- * {@link com.google.cloud.dataflow.sdk.runners.PipelineRunner#apply}.
- */
- public OutputT apply(InputT input) {
- throw new IllegalArgumentException(
- "Runner " + input.getPipeline().getRunner()
- + " has not registered an implementation for the required primitive operation "
- + this);
- }
-
- /**
- * Called before invoking apply (which may be intercepted by the runner) to
- * verify this transform is fully specified and applicable to the specified
- * input.
- *
- * <p>By default, does nothing.
- */
- public void validate(InputT input) { }
-
- /**
- * Returns the transform name.
- *
- * <p>This name is provided by the transform creator and is not required to be unique.
- */
- public String getName() {
- return name != null ? name : getKindString();
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- // See the note above about PTransform's fake Serializability, to
- // understand why all of its instance state is transient.
-
- /**
- * The base name of this {@code PTransform}, e.g., from
- * {@link ParDo#named(String)}, or from defaults, or {@code null} if not
- * yet assigned.
- */
- protected final transient String name;
-
- protected PTransform() {
- this.name = null;
- }
-
- protected PTransform(String name) {
- this.name = name;
- }
-
- @Override
- public String toString() {
- if (name == null) {
- return getKindString();
- } else {
- return getName() + " [" + getKindString() + "]";
- }
- }
-
- /**
- * Returns the name to use by default for this {@code PTransform}
- * (not including the names of any enclosing {@code PTransform}s).
- *
- * <p>By default, returns the base name of this {@code PTransform}'s class.
- *
- * <p>The caller is responsible for ensuring that names of applied
- * {@code PTransform}s are unique, e.g., by adding a uniquifying
- * suffix when needed.
- */
- protected String getKindString() {
- if (getClass().isAnonymousClass()) {
- return "AnonymousTransform";
- } else {
- return StringUtils.approximatePTransformName(getClass());
- }
- }
-
- private void writeObject(ObjectOutputStream oos) {
- // We don't really want to be serializing this object, but we
- // often have serializable anonymous DoFns nested within a
- // PTransform.
- }
-
- private void readObject(ObjectInputStream oos) {
- // We don't really want to be serializing this object, but we
- // often have serializable anonymous DoFns nested within a
- // PTransform.
- }
-
- /**
- * Returns the default {@code Coder} to use for the output of this
- * single-output {@code PTransform}.
- *
- * <p>By default, always throws {@link CannotProvideCoderException}.
- *
- * @throws CannotProvideCoderException if no coder can be inferred
- */
- protected Coder<?> getDefaultOutputCoder() throws CannotProvideCoderException {
- throw new CannotProvideCoderException(
- "PTransform.getDefaultOutputCoder called.");
- }
-
- /**
- * Returns the default {@code Coder} to use for the output of this
- * single-output {@code PTransform} when applied to the given input.
- *
- * @throws CannotProvideCoderException if none can be inferred.
- *
- * <p>By default, always throws.
- */
- protected Coder<?> getDefaultOutputCoder(@SuppressWarnings("unused") InputT input)
- throws CannotProvideCoderException {
- return getDefaultOutputCoder();
- }
-
- /**
- * Returns the default {@code Coder} to use for the given output of
- * this single-output {@code PTransform} when applied to the given input.
- *
- * @throws CannotProvideCoderException if none can be inferred.
- *
- * <p>By default, always throws.
- */
- public <T> Coder<T> getDefaultOutputCoder(
- InputT input, @SuppressWarnings("unused") TypedPValue<T> output)
- throws CannotProvideCoderException {
- @SuppressWarnings("unchecked")
- Coder<T> defaultOutputCoder = (Coder<T>) getDefaultOutputCoder(input);
- return defaultOutputCoder;
- }
-
- /**
- * {@inheritDoc}
- *
- * <p>By default, does not register any display data. Implementors may override this method
- * to provide their own display metadata.
- */
- @Override
- public void populateDisplayData(Builder builder) {
- }
-}
[63/67] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
Directory reorganization
Move Java 8 examples from "java8examples/" into "examples/java8/".
Project: http://git-wip-us.apache.org/repos/asf/incubator-beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-beam/commit/11bb9e0e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-beam/tree/11bb9e0e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-beam/diff/11bb9e0e
Branch: refs/heads/master
Commit: 11bb9e0e61f8b15ce81e5181baa5458bb715a059
Parents: 2eaa709
Author: Davor Bonaci <da...@google.com>
Authored: Wed Mar 23 17:16:47 2016 -0700
Committer: Davor Bonaci <da...@google.com>
Committed: Wed Mar 23 18:33:33 2016 -0700
----------------------------------------------------------------------
examples/java8/pom.xml | 279 +++++++++++++
.../examples/MinimalWordCountJava8.java | 68 +++
.../examples/complete/game/GameStats.java | 339 +++++++++++++++
.../examples/complete/game/HourlyTeamScore.java | 193 +++++++++
.../examples/complete/game/LeaderBoard.java | 237 +++++++++++
.../dataflow/examples/complete/game/README.md | 113 +++++
.../examples/complete/game/UserScore.java | 239 +++++++++++
.../complete/game/injector/Injector.java | 415 +++++++++++++++++++
.../complete/game/injector/InjectorUtils.java | 101 +++++
.../injector/RetryHttpInitializerWrapper.java | 126 ++++++
.../complete/game/utils/WriteToBigQuery.java | 134 ++++++
.../game/utils/WriteWindowedToBigQuery.java | 76 ++++
.../examples/MinimalWordCountJava8Test.java | 103 +++++
.../examples/complete/game/GameStatsTest.java | 76 ++++
.../complete/game/HourlyTeamScoreTest.java | 111 +++++
.../examples/complete/game/UserScoreTest.java | 154 +++++++
java8examples/pom.xml | 279 -------------
.../examples/MinimalWordCountJava8.java | 68 ---
.../examples/complete/game/GameStats.java | 339 ---------------
.../examples/complete/game/HourlyTeamScore.java | 193 ---------
.../examples/complete/game/LeaderBoard.java | 237 -----------
.../dataflow/examples/complete/game/README.md | 113 -----
.../examples/complete/game/UserScore.java | 239 -----------
.../complete/game/injector/Injector.java | 415 -------------------
.../complete/game/injector/InjectorUtils.java | 101 -----
.../injector/RetryHttpInitializerWrapper.java | 126 ------
.../complete/game/utils/WriteToBigQuery.java | 134 ------
.../game/utils/WriteWindowedToBigQuery.java | 76 ----
.../examples/MinimalWordCountJava8Test.java | 103 -----
.../examples/complete/game/GameStatsTest.java | 76 ----
.../complete/game/HourlyTeamScoreTest.java | 111 -----
.../examples/complete/game/UserScoreTest.java | 154 -------
pom.xml | 2 +-
33 files changed, 2765 insertions(+), 2765 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/examples/java8/pom.xml
----------------------------------------------------------------------
diff --git a/examples/java8/pom.xml b/examples/java8/pom.xml
new file mode 100644
index 0000000..7d55c31
--- /dev/null
+++ b/examples/java8/pom.xml
@@ -0,0 +1,279 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.beam</groupId>
+ <artifactId>parent</artifactId>
+ <version>0.1.0-incubating-SNAPSHOT</version>
+ <relativePath>../../pom.xml</relativePath>
+ </parent>
+
+ <artifactId>java8examples-all</artifactId>
+ <name>Apache Beam :: Examples :: Java 8 All</name>
+ <description>Apache Beam Java SDK provides a simple, Java-based
+ interface for processing virtually any size data.
+ This artifact includes examples of the SDK from a Java 8
+ user.</description>
+
+ <packaging>jar</packaging>
+
+ <profiles>
+ <profile>
+ <id>DataflowPipelineTests</id>
+ <properties>
+ <runIntegrationTestOnService>true</runIntegrationTestOnService>
+ <testGroups>com.google.cloud.dataflow.sdk.testing.RunnableOnService</testGroups>
+ <testParallelValue>both</testParallelValue>
+ </properties>
+ </profile>
+ </profiles>
+
+ <build>
+ <plugins>
+ <plugin>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <configuration>
+ <source>1.8</source>
+ <target>1.8</target>
+ <testSource>1.8</testSource>
+ <testTarget>1.8</testTarget>
+ </configuration>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ <executions>
+ <execution>
+ <goals><goal>analyze-only</goal></goals>
+ <configuration>
+ <failOnWarning>true</failOnWarning>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-checkstyle-plugin</artifactId>
+ <version>2.12</version>
+ <dependencies>
+ <dependency>
+ <groupId>com.puppycrawl.tools</groupId>
+ <artifactId>checkstyle</artifactId>
+ <version>6.6</version>
+ </dependency>
+ </dependencies>
+ <configuration>
+ <configLocation>../../checkstyle.xml</configLocation>
+ <consoleOutput>true</consoleOutput>
+ <failOnViolation>true</failOnViolation>
+ <includeTestSourceDirectory>true</includeTestSourceDirectory>
+ <includeResources>false</includeResources>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>check</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+
+ <!-- Source plugin for generating source and test-source JARs. -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-source-plugin</artifactId>
+ <version>2.4</version>
+ <executions>
+ <execution>
+ <id>attach-sources</id>
+ <phase>compile</phase>
+ <goals>
+ <goal>jar</goal>
+ </goals>
+ </execution>
+ <execution>
+ <id>attach-test-sources</id>
+ <phase>test-compile</phase>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>default-jar</id>
+ <goals>
+ <goal>jar</goal>
+ </goals>
+ </execution>
+ <execution>
+ <id>default-test-jar</id>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+
+ <!-- Coverage analysis for unit tests. -->
+ <plugin>
+ <groupId>org.jacoco</groupId>
+ <artifactId>jacoco-maven-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.beam</groupId>
+ <artifactId>java-sdk-all</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.beam</groupId>
+ <artifactId>java-examples-all</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ <version>${guava.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.avro</groupId>
+ <artifactId>avro</artifactId>
+ <version>${avro.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>joda-time</groupId>
+ <artifactId>joda-time</artifactId>
+ <version>${joda.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.hamcrest</groupId>
+ <artifactId>hamcrest-all</artifactId>
+ <version>${hamcrest.version}</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.mockito</groupId>
+ <artifactId>mockito-all</artifactId>
+ <version>1.10.19</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>${junit.version}</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.apis</groupId>
+ <artifactId>google-api-services-bigquery</artifactId>
+ <version>${bigquery.version}</version>
+ <exclusions>
+ <!-- Exclude an old version of guava that is being pulled
+ in by a transitive dependency of google-api-client -->
+ <exclusion>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava-jdk5</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.http-client</groupId>
+ <artifactId>google-http-client</artifactId>
+ <version>${google-clients.version}</version>
+ <exclusions>
+ <!-- Exclude an old version of guava that is being pulled
+ in by a transitive dependency of google-api-client -->
+ <exclusion>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava-jdk5</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.oauth-client</groupId>
+ <artifactId>google-oauth-client</artifactId>
+ <version>${google-clients.version}</version>
+ <exclusions>
+ <!-- Exclude an old version of guava that is being pulled
+ in by a transitive dependency of google-api-client -->
+ <exclusion>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava-jdk5</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.apis</groupId>
+ <artifactId>google-api-services-pubsub</artifactId>
+ <version>${pubsub.version}</version>
+ <exclusions>
+ <!-- Exclude an old version of guava that is being pulled
+ in by a transitive dependency of google-api-client -->
+ <exclusion>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava-jdk5</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.api-client</groupId>
+ <artifactId>google-api-client</artifactId>
+ <version>${google-clients.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava-jdk5</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ </dependencies>
+</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/examples/java8/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCountJava8.java
----------------------------------------------------------------------
diff --git a/examples/java8/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCountJava8.java b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCountJava8.java
new file mode 100644
index 0000000..c115ea0
--- /dev/null
+++ b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/MinimalWordCountJava8.java
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples;
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.runners.BlockingDataflowPipelineRunner;
+import com.google.cloud.dataflow.sdk.transforms.Count;
+import com.google.cloud.dataflow.sdk.transforms.Filter;
+import com.google.cloud.dataflow.sdk.transforms.FlatMapElements;
+import com.google.cloud.dataflow.sdk.transforms.MapElements;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
+
+import java.util.Arrays;
+
+/**
+ * An example that counts words in Shakespeare, using Java 8 language features.
+ *
+ * <p>See {@link MinimalWordCount} for a comprehensive explanation.
+ */
+public class MinimalWordCountJava8 {
+
+ public static void main(String[] args) {
+ DataflowPipelineOptions options = PipelineOptionsFactory.create()
+ .as(DataflowPipelineOptions.class);
+
+ options.setRunner(BlockingDataflowPipelineRunner.class);
+
+ // CHANGE 1 of 3: Your project ID is required in order to run your pipeline on the Google Cloud.
+ options.setProject("SET_YOUR_PROJECT_ID_HERE");
+
+ // CHANGE 2 of 3: Your Google Cloud Storage path is required for staging local files.
+ options.setStagingLocation("gs://SET_YOUR_BUCKET_NAME_HERE/AND_STAGING_DIRECTORY");
+
+ Pipeline p = Pipeline.create(options);
+
+ p.apply(TextIO.Read.from("gs://dataflow-samples/shakespeare/*"))
+ .apply(FlatMapElements.via((String word) -> Arrays.asList(word.split("[^a-zA-Z']+")))
+ .withOutputType(new TypeDescriptor<String>() {}))
+ .apply(Filter.byPredicate((String word) -> !word.isEmpty()))
+ .apply(Count.<String>perElement())
+ .apply(MapElements
+ .via((KV<String, Long> wordCount) -> wordCount.getKey() + ": " + wordCount.getValue())
+ .withOutputType(new TypeDescriptor<String>() {}))
+
+ // CHANGE 3 of 3: The Google Cloud Storage path is required for outputting the results to.
+ .apply(TextIO.Write.to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX"));
+
+ p.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/GameStats.java
----------------------------------------------------------------------
diff --git a/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/GameStats.java b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/GameStats.java
new file mode 100644
index 0000000..7c67d10
--- /dev/null
+++ b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/GameStats.java
@@ -0,0 +1,339 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete.game;
+
+import com.google.cloud.dataflow.examples.common.DataflowExampleUtils;
+import com.google.cloud.dataflow.examples.complete.game.utils.WriteWindowedToBigQuery;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.PipelineResult;
+import com.google.cloud.dataflow.sdk.io.PubsubIO;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
+import com.google.cloud.dataflow.sdk.transforms.Aggregator;
+import com.google.cloud.dataflow.sdk.transforms.Combine;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.DoFn.RequiresWindowAccess;
+import com.google.cloud.dataflow.sdk.transforms.MapElements;
+import com.google.cloud.dataflow.sdk.transforms.Mean;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.Sum;
+import com.google.cloud.dataflow.sdk.transforms.Values;
+import com.google.cloud.dataflow.sdk.transforms.View;
+import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
+import com.google.cloud.dataflow.sdk.transforms.windowing.IntervalWindow;
+import com.google.cloud.dataflow.sdk.transforms.windowing.OutputTimeFns;
+import com.google.cloud.dataflow.sdk.transforms.windowing.Sessions;
+import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.PCollectionView;
+import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
+
+import org.joda.time.DateTimeZone;
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+import org.joda.time.format.DateTimeFormat;
+import org.joda.time.format.DateTimeFormatter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.TimeZone;
+
+/**
+ * This class is the fourth in a series of four pipelines that tell a story in a 'gaming'
+ * domain, following {@link UserScore}, {@link HourlyTeamScore}, and {@link LeaderBoard}.
+ * New concepts: session windows and finding session duration; use of both
+ * singleton and non-singleton side inputs.
+ *
+ * <p> This pipeline builds on the {@link LeaderBoard} functionality, and adds some "business
+ * intelligence" analysis: abuse detection and usage patterns. The pipeline derives the Mean user
+ * score sum for a window, and uses that information to identify likely spammers/robots. (The robots
+ * have a higher click rate than the human users). The 'robot' users are then filtered out when
+ * calculating the team scores.
+ *
+ * <p> Additionally, user sessions are tracked: that is, we find bursts of user activity using
+ * session windows. Then, the mean session duration information is recorded in the context of
+ * subsequent fixed windowing. (This could be used to tell us what games are giving us greater
+ * user retention).
+ *
+ * <p> Run {@code com.google.cloud.dataflow.examples.complete.game.injector.Injector} to generate
+ * pubsub data for this pipeline. The {@code Injector} documentation provides more detail.
+ *
+ * <p> To execute this pipeline using the Dataflow service, specify the pipeline configuration
+ * like this:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=BlockingDataflowPipelineRunner
+ * --dataset=YOUR-DATASET
+ * --topic=projects/YOUR-PROJECT/topics/YOUR-TOPIC
+ * }
+ * </pre>
+ * where the BigQuery dataset you specify must already exist. The PubSub topic you specify should
+ * be the same topic to which the Injector is publishing.
+ */
+public class GameStats extends LeaderBoard {
+
+ private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms";
+
+ private static final DateTimeFormatter fmt =
+ DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS")
+ .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST")));
+
+ /**
+ * Filter out all but those users with a high clickrate, which we will consider as 'spammy' users.
+ * We do this by finding the mean total score per user, then using that information as a side
+ * input to filter out all but those user scores that are > (mean * SCORE_WEIGHT)
+ */
+ // [START DocInclude_AbuseDetect]
+ public static class CalculateSpammyUsers
+ extends PTransform<PCollection<KV<String, Integer>>, PCollection<KV<String, Integer>>> {
+ private static final Logger LOG = LoggerFactory.getLogger(CalculateSpammyUsers.class);
+ private static final double SCORE_WEIGHT = 2.5;
+
+ @Override
+ public PCollection<KV<String, Integer>> apply(PCollection<KV<String, Integer>> userScores) {
+
+ // Get the sum of scores for each user.
+ PCollection<KV<String, Integer>> sumScores = userScores
+ .apply("UserSum", Sum.<String>integersPerKey());
+
+ // Extract the score from each element, and use it to find the global mean.
+ final PCollectionView<Double> globalMeanScore = sumScores.apply(Values.<Integer>create())
+ .apply(Mean.<Integer>globally().asSingletonView());
+
+ // Filter the user sums using the global mean.
+ PCollection<KV<String, Integer>> filtered = sumScores
+ .apply(ParDo
+ .named("ProcessAndFilter")
+ // use the derived mean total score as a side input
+ .withSideInputs(globalMeanScore)
+ .of(new DoFn<KV<String, Integer>, KV<String, Integer>>() {
+ private final Aggregator<Long, Long> numSpammerUsers =
+ createAggregator("SpammerUsers", new Sum.SumLongFn());
+ @Override
+ public void processElement(ProcessContext c) {
+ Integer score = c.element().getValue();
+ Double gmc = c.sideInput(globalMeanScore);
+ if (score > (gmc * SCORE_WEIGHT)) {
+ LOG.info("user " + c.element().getKey() + " spammer score " + score
+ + " with mean " + gmc);
+ numSpammerUsers.addValue(1L);
+ c.output(c.element());
+ }
+ }
+ }));
+ return filtered;
+ }
+ }
+ // [END DocInclude_AbuseDetect]
+
+ /**
+ * Calculate and output an element's session duration.
+ */
+ private static class UserSessionInfoFn extends DoFn<KV<String, Integer>, Integer>
+ implements RequiresWindowAccess {
+
+ @Override
+ public void processElement(ProcessContext c) {
+ IntervalWindow w = (IntervalWindow) c.window();
+ int duration = new Duration(
+ w.start(), w.end()).toPeriod().toStandardMinutes().getMinutes();
+ c.output(duration);
+ }
+ }
+
+
+ /**
+ * Options supported by {@link GameStats}.
+ */
+ static interface Options extends LeaderBoard.Options {
+ @Description("Numeric value of fixed window duration for user analysis, in minutes")
+ @Default.Integer(60)
+ Integer getFixedWindowDuration();
+ void setFixedWindowDuration(Integer value);
+
+ @Description("Numeric value of gap between user sessions, in minutes")
+ @Default.Integer(5)
+ Integer getSessionGap();
+ void setSessionGap(Integer value);
+
+ @Description("Numeric value of fixed window for finding mean of user session duration, "
+ + "in minutes")
+ @Default.Integer(30)
+ Integer getUserActivityWindowDuration();
+ void setUserActivityWindowDuration(Integer value);
+
+ @Description("Prefix used for the BigQuery table names")
+ @Default.String("game_stats")
+ String getTablePrefix();
+ void setTablePrefix(String value);
+ }
+
+
+ /**
+ * Create a map of information that describes how to write pipeline output to BigQuery. This map
+ * is used to write information about team score sums.
+ */
+ protected static Map<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>>
+ configureWindowedWrite() {
+ Map<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>> tableConfigure =
+ new HashMap<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>>();
+ tableConfigure.put("team",
+ new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("STRING",
+ c -> c.element().getKey()));
+ tableConfigure.put("total_score",
+ new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("INTEGER",
+ c -> c.element().getValue()));
+ tableConfigure.put("window_start",
+ new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("STRING",
+ c -> { IntervalWindow w = (IntervalWindow) c.window();
+ return fmt.print(w.start()); }));
+ tableConfigure.put("processing_time",
+ new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>(
+ "STRING", c -> fmt.print(Instant.now())));
+ return tableConfigure;
+ }
+
+ /**
+ * Create a map of information that describes how to write pipeline output to BigQuery. This map
+ * is used to write information about mean user session time.
+ */
+ protected static Map<String, WriteWindowedToBigQuery.FieldInfo<Double>>
+ configureSessionWindowWrite() {
+
+ Map<String, WriteWindowedToBigQuery.FieldInfo<Double>> tableConfigure =
+ new HashMap<String, WriteWindowedToBigQuery.FieldInfo<Double>>();
+ tableConfigure.put("window_start",
+ new WriteWindowedToBigQuery.FieldInfo<Double>("STRING",
+ c -> { IntervalWindow w = (IntervalWindow) c.window();
+ return fmt.print(w.start()); }));
+ tableConfigure.put("mean_duration",
+ new WriteWindowedToBigQuery.FieldInfo<Double>("FLOAT", c -> c.element()));
+ return tableConfigure;
+ }
+
+
+
+ public static void main(String[] args) throws Exception {
+
+ Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
+ // Enforce that this pipeline is always run in streaming mode.
+ options.setStreaming(true);
+ // Allow the pipeline to be cancelled automatically.
+ options.setRunner(DataflowPipelineRunner.class);
+ DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options);
+ Pipeline pipeline = Pipeline.create(options);
+
+ // Read Events from Pub/Sub using custom timestamps
+ PCollection<GameActionInfo> rawEvents = pipeline
+ .apply(PubsubIO.Read.timestampLabel(TIMESTAMP_ATTRIBUTE).topic(options.getTopic()))
+ .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()));
+
+ // Extract username/score pairs from the event stream
+ PCollection<KV<String, Integer>> userEvents =
+ rawEvents.apply("ExtractUserScore",
+ MapElements.via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore()))
+ .withOutputType(new TypeDescriptor<KV<String, Integer>>() {}));
+
+ // Calculate the total score per user over fixed windows, and
+ // cumulative updates for late data.
+ final PCollectionView<Map<String, Integer>> spammersView = userEvents
+ .apply(Window.named("FixedWindowsUser")
+ .<KV<String, Integer>>into(FixedWindows.of(
+ Duration.standardMinutes(options.getFixedWindowDuration())))
+ )
+
+ // Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate.
+ // These might be robots/spammers.
+ .apply("CalculateSpammyUsers", new CalculateSpammyUsers())
+ // Derive a view from the collection of spammer users. It will be used as a side input
+ // in calculating the team score sums, below.
+ .apply("CreateSpammersView", View.<String, Integer>asMap());
+
+ // [START DocInclude_FilterAndCalc]
+ // Calculate the total score per team over fixed windows,
+ // and emit cumulative updates for late data. Uses the side input derived above-- the set of
+ // suspected robots-- to filter out scores from those users from the sum.
+ // Write the results to BigQuery.
+ rawEvents
+ .apply(Window.named("WindowIntoFixedWindows")
+ .<GameActionInfo>into(FixedWindows.of(
+ Duration.standardMinutes(options.getFixedWindowDuration())))
+ )
+ // Filter out the detected spammer users, using the side input derived above.
+ .apply(ParDo.named("FilterOutSpammers")
+ .withSideInputs(spammersView)
+ .of(new DoFn<GameActionInfo, GameActionInfo>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ // If the user is not in the spammers Map, output the data element.
+ if (c.sideInput(spammersView).get(c.element().getUser().trim()) == null) {
+ c.output(c.element());
+ }
+ }
+ }))
+ // Extract and sum teamname/score pairs from the event data.
+ .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
+ // [END DocInclude_FilterAndCalc]
+ // Write the result to BigQuery
+ .apply("WriteTeamSums",
+ new WriteWindowedToBigQuery<KV<String, Integer>>(
+ options.getTablePrefix() + "_team", configureWindowedWrite()));
+
+
+ // [START DocInclude_SessionCalc]
+ // Detect user sessions-- that is, a burst of activity separated by a gap from further
+ // activity. Find and record the mean session lengths.
+ // This information could help the game designers track the changing user engagement
+ // as their set of games changes.
+ userEvents
+ .apply(Window.named("WindowIntoSessions")
+ .<KV<String, Integer>>into(
+ Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap())))
+ .withOutputTimeFn(OutputTimeFns.outputAtEndOfWindow()))
+ // For this use, we care only about the existence of the session, not any particular
+ // information aggregated over it, so the following is an efficient way to do that.
+ .apply(Combine.perKey(x -> 0))
+ // Get the duration per session.
+ .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn()))
+ // [END DocInclude_SessionCalc]
+ // [START DocInclude_Rewindow]
+ // Re-window to process groups of session sums according to when the sessions complete.
+ .apply(Window.named("WindowToExtractSessionMean")
+ .<Integer>into(
+ FixedWindows.of(Duration.standardMinutes(options.getUserActivityWindowDuration()))))
+ // Find the mean session duration in each window.
+ .apply(Mean.<Integer>globally().withoutDefaults())
+ // Write this info to a BigQuery table.
+ .apply("WriteAvgSessionLength",
+ new WriteWindowedToBigQuery<Double>(
+ options.getTablePrefix() + "_sessions", configureSessionWindowWrite()));
+ // [END DocInclude_Rewindow]
+
+
+ // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
+ // command line.
+ PipelineResult result = pipeline.run();
+ dataflowUtils.waitToFinish(result);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/HourlyTeamScore.java
----------------------------------------------------------------------
diff --git a/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/HourlyTeamScore.java b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/HourlyTeamScore.java
new file mode 100644
index 0000000..481b9df
--- /dev/null
+++ b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/HourlyTeamScore.java
@@ -0,0 +1,193 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete.game;
+
+import com.google.cloud.dataflow.examples.complete.game.utils.WriteWindowedToBigQuery;
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.transforms.Filter;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.WithTimestamps;
+import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
+import com.google.cloud.dataflow.sdk.transforms.windowing.IntervalWindow;
+import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
+import com.google.cloud.dataflow.sdk.values.KV;
+
+import org.joda.time.DateTimeZone;
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+import org.joda.time.format.DateTimeFormat;
+import org.joda.time.format.DateTimeFormatter;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.TimeZone;
+
+/**
+ * This class is the second in a series of four pipelines that tell a story in a 'gaming'
+ * domain, following {@link UserScore}. In addition to the concepts introduced in {@link UserScore},
+ * new concepts include: windowing and element timestamps; use of {@code Filter.byPredicate()}.
+ *
+ * <p> This pipeline processes data collected from gaming events in batch, building on {@link
+ * UserScore} but using fixed windows. It calculates the sum of scores per team, for each window,
+ * optionally allowing specification of two timestamps before and after which data is filtered out.
+ * This allows a model where late data collected after the intended analysis window can be included,
+ * and any late-arriving data prior to the beginning of the analysis window can be removed as well.
+ * By using windowing and adding element timestamps, we can do finer-grained analysis than with the
+ * {@link UserScore} pipeline. However, our batch processing is high-latency, in that we don't get
+ * results from plays at the beginning of the batch's time period until the batch is processed.
+ *
+ * <p> To execute this pipeline using the Dataflow service, specify the pipeline configuration
+ * like this:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=BlockingDataflowPipelineRunner
+ * --dataset=YOUR-DATASET
+ * }
+ * </pre>
+ * where the BigQuery dataset you specify must already exist.
+ *
+ * <p> Optionally include {@code --input} to specify the batch input file path.
+ * To indicate a time after which the data should be filtered out, include the
+ * {@code --stopMin} arg. E.g., {@code --stopMin=2015-10-18-23-59} indicates that any data
+ * timestamped after 23:59 PST on 2015-10-18 should not be included in the analysis.
+ * To indicate a time before which data should be filtered out, include the {@code --startMin} arg.
+ * If you're using the default input specified in {@link UserScore},
+ * "gs://dataflow-samples/game/gaming_data*.csv", then
+ * {@code --startMin=2015-11-16-16-10 --stopMin=2015-11-17-16-10} are good values.
+ */
+public class HourlyTeamScore extends UserScore {
+
+ private static final DateTimeFormatter fmt =
+ DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS")
+ .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST")));
+ private static final DateTimeFormatter minFmt =
+ DateTimeFormat.forPattern("yyyy-MM-dd-HH-mm")
+ .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST")));
+
+
+ /**
+ * Options supported by {@link HourlyTeamScore}.
+ */
+ static interface Options extends UserScore.Options {
+
+ @Description("Numeric value of fixed window duration, in minutes")
+ @Default.Integer(60)
+ Integer getWindowDuration();
+ void setWindowDuration(Integer value);
+
+ @Description("String representation of the first minute after which to generate results, "
+ + "in the format: yyyy-MM-dd-HH-mm . This time should be in PST. "
+ + "Any input data timestamped prior to that minute won't be included in the sums.")
+ @Default.String("1970-01-01-00-00")
+ String getStartMin();
+ void setStartMin(String value);
+
+ @Description("String representation of the first minute for which to not generate results, "
+ + "in the format: yyyy-MM-dd-HH-mm . This time should be in PST. "
+ + "Any input data timestamped after that minute won't be included in the sums.")
+ @Default.String("2100-01-01-00-00")
+ String getStopMin();
+ void setStopMin(String value);
+
+ @Description("The BigQuery table name. Should not already exist.")
+ @Default.String("hourly_team_score")
+ String getTableName();
+ void setTableName(String value);
+ }
+
+ /**
+ * Create a map of information that describes how to write pipeline output to BigQuery. This map
+ * is passed to the {@link WriteWindowedToBigQuery} constructor to write team score sums and
+ * includes information about window start time.
+ */
+ protected static Map<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>>
+ configureWindowedTableWrite() {
+ Map<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>> tableConfig =
+ new HashMap<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>>();
+ tableConfig.put("team",
+ new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("STRING",
+ c -> c.element().getKey()));
+ tableConfig.put("total_score",
+ new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("INTEGER",
+ c -> c.element().getValue()));
+ tableConfig.put("window_start",
+ new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("STRING",
+ c -> { IntervalWindow w = (IntervalWindow) c.window();
+ return fmt.print(w.start()); }));
+ return tableConfig;
+ }
+
+
+ /**
+ * Run a batch pipeline to do windowed analysis of the data.
+ */
+ // [START DocInclude_HTSMain]
+ public static void main(String[] args) throws Exception {
+ // Begin constructing a pipeline configured by commandline flags.
+ Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
+ Pipeline pipeline = Pipeline.create(options);
+
+ final Instant stopMinTimestamp = new Instant(minFmt.parseMillis(options.getStopMin()));
+ final Instant startMinTimestamp = new Instant(minFmt.parseMillis(options.getStartMin()));
+
+ // Read 'gaming' events from a text file.
+ pipeline.apply(TextIO.Read.from(options.getInput()))
+ // Parse the incoming data.
+ .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()))
+
+ // Filter out data before and after the given times so that it is not included
+ // in the calculations. As we collect data in batches (say, by day), the batch for the day
+ // that we want to analyze could potentially include some late-arriving data from the previous
+ // day. If so, we want to weed it out. Similarly, if we include data from the following day
+ // (to scoop up late-arriving events from the day we're analyzing), we need to weed out events
+ // that fall after the time period we want to analyze.
+ // [START DocInclude_HTSFilters]
+ .apply("FilterStartTime", Filter.byPredicate(
+ (GameActionInfo gInfo)
+ -> gInfo.getTimestamp() > startMinTimestamp.getMillis()))
+ .apply("FilterEndTime", Filter.byPredicate(
+ (GameActionInfo gInfo)
+ -> gInfo.getTimestamp() < stopMinTimestamp.getMillis()))
+ // [END DocInclude_HTSFilters]
+
+ // [START DocInclude_HTSAddTsAndWindow]
+ // Add an element timestamp based on the event log, and apply fixed windowing.
+ .apply("AddEventTimestamps",
+ WithTimestamps.of((GameActionInfo i) -> new Instant(i.getTimestamp())))
+ .apply(Window.named("FixedWindowsTeam")
+ .<GameActionInfo>into(FixedWindows.of(
+ Duration.standardMinutes(options.getWindowDuration()))))
+ // [END DocInclude_HTSAddTsAndWindow]
+
+ // Extract and sum teamname/score pairs from the event data.
+ .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
+ .apply("WriteTeamScoreSums",
+ new WriteWindowedToBigQuery<KV<String, Integer>>(options.getTableName(),
+ configureWindowedTableWrite()));
+
+
+ pipeline.run();
+ }
+ // [END DocInclude_HTSMain]
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/LeaderBoard.java
----------------------------------------------------------------------
diff --git a/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/LeaderBoard.java b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/LeaderBoard.java
new file mode 100644
index 0000000..4185376
--- /dev/null
+++ b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/LeaderBoard.java
@@ -0,0 +1,237 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete.game;
+
+import com.google.cloud.dataflow.examples.common.DataflowExampleOptions;
+import com.google.cloud.dataflow.examples.common.DataflowExampleUtils;
+import com.google.cloud.dataflow.examples.complete.game.utils.WriteToBigQuery;
+import com.google.cloud.dataflow.examples.complete.game.utils.WriteWindowedToBigQuery;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.PipelineResult;
+import com.google.cloud.dataflow.sdk.io.PubsubIO;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.options.Validation;
+import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.windowing.AfterProcessingTime;
+import com.google.cloud.dataflow.sdk.transforms.windowing.AfterWatermark;
+import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
+import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
+import com.google.cloud.dataflow.sdk.transforms.windowing.IntervalWindow;
+import com.google.cloud.dataflow.sdk.transforms.windowing.Repeatedly;
+import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import org.joda.time.DateTimeZone;
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+import org.joda.time.format.DateTimeFormat;
+import org.joda.time.format.DateTimeFormatter;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.TimeZone;
+
+/**
+ * This class is the third in a series of four pipelines that tell a story in a 'gaming' domain,
+ * following {@link UserScore} and {@link HourlyTeamScore}. Concepts include: processing unbounded
+ * data using fixed windows; use of custom timestamps and event-time processing; generation of
+ * early/speculative results; using .accumulatingFiredPanes() to do cumulative processing of late-
+ * arriving data.
+ *
+ * <p> This pipeline processes an unbounded stream of 'game events'. The calculation of the team
+ * scores uses fixed windowing based on event time (the time of the game play event), not
+ * processing time (the time that an event is processed by the pipeline). The pipeline calculates
+ * the sum of scores per team, for each window. By default, the team scores are calculated using
+ * one-hour windows.
+ *
+ * <p> In contrast-- to demo another windowing option-- the user scores are calculated using a
+ * global window, which periodically (every ten minutes) emits cumulative user score sums.
+ *
+ * <p> In contrast to the previous pipelines in the series, which used static, finite input data,
+ * here we're using an unbounded data source, which lets us provide speculative results, and allows
+ * handling of late data, at much lower latency. We can use the early/speculative results to keep a
+ * 'leaderboard' updated in near-realtime. Our handling of late data lets us generate correct
+ * results, e.g. for 'team prizes'. We're now outputting window results as they're
+ * calculated, giving us much lower latency than with the previous batch examples.
+ *
+ * <p> Run {@code com.google.cloud.dataflow.examples.complete.game.injector.Injector} to generate
+ * pubsub data for this pipeline. The Injector documentation explains how to do this.
+ *
+ * <p> To execute this pipeline using the Dataflow service, specify the pipeline configuration
+ * like this:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=BlockingDataflowPipelineRunner
+ * --dataset=YOUR-DATASET
+ * --topic=projects/YOUR-PROJECT/topics/YOUR-TOPIC
+ * }
+ * </pre>
+ * where the BigQuery dataset you specify must already exist.
+ * The PubSub topic you specify should be the same topic to which the Injector is publishing.
+ */
+public class LeaderBoard extends HourlyTeamScore {
+
+ private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms";
+
+ private static DateTimeFormatter fmt =
+ DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS")
+ .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST")));
+ static final Duration FIVE_MINUTES = Duration.standardMinutes(5);
+ static final Duration TEN_MINUTES = Duration.standardMinutes(10);
+
+
+ /**
+ * Options supported by {@link LeaderBoard}.
+ */
+ static interface Options extends HourlyTeamScore.Options, DataflowExampleOptions {
+
+ @Description("Pub/Sub topic to read from")
+ @Validation.Required
+ String getTopic();
+ void setTopic(String value);
+
+ @Description("Numeric value of fixed window duration for team analysis, in minutes")
+ @Default.Integer(60)
+ Integer getTeamWindowDuration();
+ void setTeamWindowDuration(Integer value);
+
+ @Description("Numeric value of allowed data lateness, in minutes")
+ @Default.Integer(120)
+ Integer getAllowedLateness();
+ void setAllowedLateness(Integer value);
+
+ @Description("Prefix used for the BigQuery table names")
+ @Default.String("leaderboard")
+ String getTableName();
+ void setTableName(String value);
+ }
+
+ /**
+ * Create a map of information that describes how to write pipeline output to BigQuery. This map
+ * is used to write team score sums and includes event timing information.
+ */
+ protected static Map<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>>
+ configureWindowedTableWrite() {
+
+ Map<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>> tableConfigure =
+ new HashMap<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>>();
+ tableConfigure.put("team",
+ new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("STRING",
+ c -> c.element().getKey()));
+ tableConfigure.put("total_score",
+ new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("INTEGER",
+ c -> c.element().getValue()));
+ tableConfigure.put("window_start",
+ new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("STRING",
+ c -> { IntervalWindow w = (IntervalWindow) c.window();
+ return fmt.print(w.start()); }));
+ tableConfigure.put("processing_time",
+ new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>(
+ "STRING", c -> fmt.print(Instant.now())));
+ tableConfigure.put("timing",
+ new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>(
+ "STRING", c -> c.pane().getTiming().toString()));
+ return tableConfigure;
+ }
+
+ /**
+ * Create a map of information that describes how to write pipeline output to BigQuery. This map
+ * is used to write user score sums.
+ */
+ protected static Map<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>>
+ configureGlobalWindowBigQueryWrite() {
+
+ Map<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>> tableConfigure =
+ configureBigQueryWrite();
+ tableConfigure.put("processing_time",
+ new WriteToBigQuery.FieldInfo<KV<String, Integer>>(
+ "STRING", c -> fmt.print(Instant.now())));
+ return tableConfigure;
+ }
+
+
+ public static void main(String[] args) throws Exception {
+
+ Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
+ // Enforce that this pipeline is always run in streaming mode.
+ options.setStreaming(true);
+ // For example purposes, allow the pipeline to be easily cancelled instead of running
+ // continuously.
+ options.setRunner(DataflowPipelineRunner.class);
+ DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options);
+ Pipeline pipeline = Pipeline.create(options);
+
+ // Read game events from Pub/Sub using custom timestamps, which are extracted from the pubsub
+ // data elements, and parse the data.
+ PCollection<GameActionInfo> gameEvents = pipeline
+ .apply(PubsubIO.Read.timestampLabel(TIMESTAMP_ATTRIBUTE).topic(options.getTopic()))
+ .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()));
+
+ // [START DocInclude_WindowAndTrigger]
+ // Extract team/score pairs from the event stream, using hour-long windows by default.
+ gameEvents
+ .apply(Window.named("LeaderboardTeamFixedWindows")
+ .<GameActionInfo>into(FixedWindows.of(
+ Duration.standardMinutes(options.getTeamWindowDuration())))
+ // We will get early (speculative) results as well as cumulative
+ // processing of late data.
+ .triggering(
+ AfterWatermark.pastEndOfWindow()
+ .withEarlyFirings(AfterProcessingTime.pastFirstElementInPane()
+ .plusDelayOf(FIVE_MINUTES))
+ .withLateFirings(AfterProcessingTime.pastFirstElementInPane()
+ .plusDelayOf(TEN_MINUTES)))
+ .withAllowedLateness(Duration.standardMinutes(options.getAllowedLateness()))
+ .accumulatingFiredPanes())
+ // Extract and sum teamname/score pairs from the event data.
+ .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
+ // Write the results to BigQuery.
+ .apply("WriteTeamScoreSums",
+ new WriteWindowedToBigQuery<KV<String, Integer>>(
+ options.getTableName() + "_team", configureWindowedTableWrite()));
+ // [END DocInclude_WindowAndTrigger]
+
+ // [START DocInclude_ProcTimeTrigger]
+ // Extract user/score pairs from the event stream using processing time, via global windowing.
+ // Get periodic updates on all users' running scores.
+ gameEvents
+ .apply(Window.named("LeaderboardUserGlobalWindow")
+ .<GameActionInfo>into(new GlobalWindows())
+ // Get periodic results every ten minutes.
+ .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()
+ .plusDelayOf(TEN_MINUTES)))
+ .accumulatingFiredPanes()
+ .withAllowedLateness(Duration.standardMinutes(options.getAllowedLateness())))
+ // Extract and sum username/score pairs from the event data.
+ .apply("ExtractUserScore", new ExtractAndSumScore("user"))
+ // Write the results to BigQuery.
+ .apply("WriteUserScoreSums",
+ new WriteToBigQuery<KV<String, Integer>>(
+ options.getTableName() + "_user", configureGlobalWindowBigQueryWrite()));
+ // [END DocInclude_ProcTimeTrigger]
+
+ // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
+ // command line.
+ PipelineResult result = pipeline.run();
+ dataflowUtils.waitToFinish(result);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/README.md
----------------------------------------------------------------------
diff --git a/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/README.md b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/README.md
new file mode 100644
index 0000000..79b55ce
--- /dev/null
+++ b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/README.md
@@ -0,0 +1,113 @@
+
+# 'Gaming' examples
+
+
+This directory holds a series of example Dataflow pipelines in a simple 'mobile
+gaming' domain. They all require Java 8. Each pipeline successively introduces
+new concepts, and gives some examples of using Java 8 syntax in constructing
+Dataflow pipelines. Other than usage of Java 8 lambda expressions, the concepts
+that are used apply equally well in Java 7.
+
+In the gaming scenario, many users play, as members of different teams, over
+the course of a day, and their actions are logged for processing. Some of the
+logged game events may be late-arriving, if users play on mobile devices and go
+transiently offline for a period.
+
+The scenario includes not only "regular" users, but "robot users", which have a
+higher click rate than the regular users, and may move from team to team.
+
+The first two pipelines in the series use pre-generated batch data samples. The
+last two pipelines read from a [PubSub](https://cloud.google.com/pubsub/)
+topic input. For these examples, you will also need to run the
+`injector.Injector` program, which generates and publishes the gaming data to
+PubSub. The javadocs for each pipeline have more detailed information on how to
+run that pipeline.
+
+All of these pipelines write their results to BigQuery table(s).
+
+
+## The pipelines in the 'gaming' series
+
+### UserScore
+
+The first pipeline in the series is `UserScore`. This pipeline does batch
+processing of data collected from gaming events. It calculates the sum of
+scores per user, over an entire batch of gaming data (collected, say, for each
+day). The batch processing will not include any late data that arrives after
+the day's cutoff point.
+
+### HourlyTeamScore
+
+The next pipeline in the series is `HourlyTeamScore`. This pipeline also
+processes data collected from gaming events in batch. It builds on `UserScore`,
+but uses [fixed windows](https://cloud.google.com/dataflow/model/windowing), by
+default an hour in duration. It calculates the sum of scores per team, for each
+window, optionally allowing specification of two timestamps before and after
+which data is filtered out. This allows a model where late data collected after
+the intended analysis window can be included in the analysis, and any late-
+arriving data prior to the beginning of the analysis window can be removed as
+well.
+
+By using windowing and adding element timestamps, we can do finer-grained
+analysis than with the `UserScore` pipeline — we're now tracking scores for
+each hour rather than over the course of a whole day. However, our batch
+processing is high-latency, in that we don't get results from plays at the
+beginning of the batch's time period until the complete batch is processed.
+
+### LeaderBoard
+
+The third pipeline in the series is `LeaderBoard`. This pipeline processes an
+unbounded stream of 'game events' from a PubSub topic. The calculation of the
+team scores uses fixed windowing based on event time (the time of the game play
+event), not processing time (the time that an event is processed by the
+pipeline). The pipeline calculates the sum of scores per team, for each window.
+By default, the team scores are calculated using one-hour windows.
+
+In contrast — to demo another windowing option — the user scores are calculated
+using a global window, which periodically (every ten minutes) emits cumulative
+user score sums.
+
+In contrast to the previous pipelines in the series, which used static, finite
+input data, here we're using an unbounded data source, which lets us provide
+_speculative_ results, and allows handling of late data, at much lower latency.
+E.g., we could use the early/speculative results to keep a 'leaderboard'
+updated in near-realtime. Our handling of late data lets us generate correct
+results, e.g. for 'team prizes'. We're now outputting window results as they're
+calculated, giving us much lower latency than with the previous batch examples.
+
+### GameStats
+
+The fourth pipeline in the series is `GameStats`. This pipeline builds
+on the `LeaderBoard` functionality — supporting output of speculative and late
+data — and adds some "business intelligence" analysis, such as abuse
+detection. The pipeline derives the mean user score sum for a window, and uses
+that information to identify likely spammers/robots. (The injector is designed
+so that the "robots" have a higher click rate than the "real" users). The robot
+users are then filtered out when calculating the team scores.
+
+Additionally, user sessions are tracked: that is, we find bursts of user
+activity using session windows. Then, the mean session duration information is
+recorded in the context of subsequent fixed windowing. (This could be used to
+tell us what games are giving us greater user retention).
+
+### Running the PubSub Injector
+
+The `LeaderBoard` and `GameStats` example pipelines read unbounded data
+from a PubSub topic.
+
+Use the `injector.Injector` program to generate this data and publish to a
+PubSub topic. See the `Injector` javadocs for more information on how to run the
+injector. Set up the injector before you start one of these pipelines. Then,
+when you start the pipeline, pass as an argument the name of that PubSub topic.
+See the pipeline javadocs for the details.
+
+## Viewing the results in BigQuery
+
+All of the pipelines write their results to BigQuery. `UserScore` and
+`HourlyTeamScore` each write one table, and `LeaderBoard` and
+`GameStats` each write two. The pipelines have default table names that
+you can override when you start up the pipeline if those tables already exist.
+
+Depending on the windowing intervals defined in a given pipeline, you may have
+to wait for a while (more than an hour) before you start to see results written
+to the BigQuery tables.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/UserScore.java
----------------------------------------------------------------------
diff --git a/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/UserScore.java b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/UserScore.java
new file mode 100644
index 0000000..de06ce3
--- /dev/null
+++ b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/UserScore.java
@@ -0,0 +1,239 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete.game;
+
+import com.google.cloud.dataflow.examples.complete.game.utils.WriteToBigQuery;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.coders.AvroCoder;
+import com.google.cloud.dataflow.sdk.coders.DefaultCoder;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.options.Validation;
+import com.google.cloud.dataflow.sdk.transforms.Aggregator;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.MapElements;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.Sum;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
+
+import org.apache.avro.reflect.Nullable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * This class is the first in a series of four pipelines that tell a story in a 'gaming' domain.
+ * Concepts: batch processing; reading input from Google Cloud Storage and writing output to
+ * BigQuery; using standalone DoFns; use of the sum by key transform; examples of
+ * Java 8 lambda syntax.
+ *
+ * <p> In this gaming scenario, many users play, as members of different teams, over the course of a
+ * day, and their actions are logged for processing. Some of the logged game events may be late-
+ * arriving, if users play on mobile devices and go transiently offline for a period.
+ *
+ * <p> This pipeline does batch processing of data collected from gaming events. It calculates the
+ * sum of scores per user, over an entire batch of gaming data (collected, say, for each day). The
+ * batch processing will not include any late data that arrives after the day's cutoff point.
+ *
+ * <p> To execute this pipeline using the Dataflow service and static example input data, specify
+ * the pipeline configuration like this:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=BlockingDataflowPipelineRunner
+ * --dataset=YOUR-DATASET
+ * }
+ * </pre>
+ * where the BigQuery dataset you specify must already exist.
+ *
+ * <p> Optionally include the --input argument to specify a batch input file.
+ * See the --input default value for example batch data file, or use {@link injector.Injector} to
+ * generate your own batch data.
+ */
+public class UserScore {
+
+ /**
+ * Class to hold info about a game event.
+ */
+ @DefaultCoder(AvroCoder.class)
+ static class GameActionInfo {
+ @Nullable String user;
+ @Nullable String team;
+ @Nullable Integer score;
+ @Nullable Long timestamp;
+
+ public GameActionInfo() {}
+
+ public GameActionInfo(String user, String team, Integer score, Long timestamp) {
+ this.user = user;
+ this.team = team;
+ this.score = score;
+ this.timestamp = timestamp;
+ }
+
+ public String getUser() {
+ return this.user;
+ }
+ public String getTeam() {
+ return this.team;
+ }
+ public Integer getScore() {
+ return this.score;
+ }
+ public String getKey(String keyname) {
+ if (keyname.equals("team")) {
+ return this.team;
+ } else { // return username as default
+ return this.user;
+ }
+ }
+ public Long getTimestamp() {
+ return this.timestamp;
+ }
+ }
+
+
+ /**
+ * Parses the raw game event info into GameActionInfo objects. Each event line has the following
+ * format: username,teamname,score,timestamp_in_ms,readable_time
+ * e.g.:
+ * user2_AsparagusPig,AsparagusPig,10,1445230923951,2015-11-02 09:09:28.224
+ * The human-readable time string is not used here.
+ */
+ static class ParseEventFn extends DoFn<String, GameActionInfo> {
+
+ // Log and count parse errors.
+ private static final Logger LOG = LoggerFactory.getLogger(ParseEventFn.class);
+ private final Aggregator<Long, Long> numParseErrors =
+ createAggregator("ParseErrors", new Sum.SumLongFn());
+
+ @Override
+ public void processElement(ProcessContext c) {
+ String[] components = c.element().split(",");
+ try {
+ String user = components[0].trim();
+ String team = components[1].trim();
+ Integer score = Integer.parseInt(components[2].trim());
+ Long timestamp = Long.parseLong(components[3].trim());
+ GameActionInfo gInfo = new GameActionInfo(user, team, score, timestamp);
+ c.output(gInfo);
+ } catch (ArrayIndexOutOfBoundsException | NumberFormatException e) {
+ numParseErrors.addValue(1L);
+ LOG.info("Parse error on " + c.element() + ", " + e.getMessage());
+ }
+ }
+ }
+
+ /**
+ * A transform to extract key/score information from GameActionInfo, and sum the scores. The
+ * constructor arg determines whether 'team' or 'user' info is extracted.
+ */
+ // [START DocInclude_USExtractXform]
+ public static class ExtractAndSumScore
+ extends PTransform<PCollection<GameActionInfo>, PCollection<KV<String, Integer>>> {
+
+ private final String field;
+
+ ExtractAndSumScore(String field) {
+ this.field = field;
+ }
+
+ @Override
+ public PCollection<KV<String, Integer>> apply(
+ PCollection<GameActionInfo> gameInfo) {
+
+ return gameInfo
+ .apply(MapElements
+ .via((GameActionInfo gInfo) -> KV.of(gInfo.getKey(field), gInfo.getScore()))
+ .withOutputType(new TypeDescriptor<KV<String, Integer>>() {}))
+ .apply(Sum.<String>integersPerKey());
+ }
+ }
+ // [END DocInclude_USExtractXform]
+
+
+ /**
+ * Options supported by {@link UserScore}.
+ */
+ public static interface Options extends PipelineOptions {
+
+ @Description("Path to the data file(s) containing game data.")
+ // The default maps to two large Google Cloud Storage files (each ~12GB) holding two subsequent
+ // days' worth (roughly) of data.
+ @Default.String("gs://dataflow-samples/game/gaming_data*.csv")
+ String getInput();
+ void setInput(String value);
+
+ @Description("BigQuery Dataset to write tables to. Must already exist.")
+ @Validation.Required
+ String getDataset();
+ void setDataset(String value);
+
+ @Description("The BigQuery table name. Should not already exist.")
+ @Default.String("user_score")
+ String getTableName();
+ void setTableName(String value);
+ }
+
+ /**
+ * Create a map of information that describes how to write pipeline output to BigQuery. This map
+ * is passed to the {@link WriteToBigQuery} constructor to write user score sums.
+ */
+ protected static Map<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>>
+ configureBigQueryWrite() {
+ Map<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>> tableConfigure =
+ new HashMap<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>>();
+ tableConfigure.put("user",
+ new WriteToBigQuery.FieldInfo<KV<String, Integer>>("STRING", c -> c.element().getKey()));
+ tableConfigure.put("total_score",
+ new WriteToBigQuery.FieldInfo<KV<String, Integer>>("INTEGER", c -> c.element().getValue()));
+ return tableConfigure;
+ }
+
+
+ /**
+ * Run a batch pipeline.
+ */
+ // [START DocInclude_USMain]
+ public static void main(String[] args) throws Exception {
+ // Begin constructing a pipeline configured by commandline flags.
+ Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
+ Pipeline pipeline = Pipeline.create(options);
+
+ // Read events from a text file and parse them.
+ pipeline.apply(TextIO.Read.from(options.getInput()))
+ .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()))
+ // Extract and sum username/score pairs from the event data.
+ .apply("ExtractUserScore", new ExtractAndSumScore("user"))
+ .apply("WriteUserScoreSums",
+ new WriteToBigQuery<KV<String, Integer>>(options.getTableName(),
+ configureBigQueryWrite()));
+
+ // Run the batch pipeline.
+ pipeline.run();
+ }
+ // [END DocInclude_USMain]
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/Injector.java
----------------------------------------------------------------------
diff --git a/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/Injector.java b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/Injector.java
new file mode 100644
index 0000000..1691c54
--- /dev/null
+++ b/examples/java8/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/Injector.java
@@ -0,0 +1,415 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete.game.injector;
+
+import com.google.api.services.pubsub.Pubsub;
+import com.google.api.services.pubsub.model.PublishRequest;
+import com.google.api.services.pubsub.model.PubsubMessage;
+import com.google.common.collect.ImmutableMap;
+
+import org.joda.time.DateTimeZone;
+import org.joda.time.format.DateTimeFormat;
+import org.joda.time.format.DateTimeFormatter;
+
+import java.io.BufferedOutputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+import java.util.TimeZone;
+
+
+/**
+ * This is a generator that simulates usage data from a mobile game, and either publishes the data
+ * to a pubsub topic or writes it to a file.
+ *
+ * <p> The general model used by the generator is the following. There is a set of teams with team
+ * members. Each member is scoring points for their team. After some period, a team will dissolve
+ * and a new one will be created in its place. There is also a set of 'Robots', or spammer users.
+ * They hop from team to team. The robots are set to have a higher 'click rate' (generate more
+ * events) than the regular team members.
+ *
+ * <p> Each generated line of data has the following form:
+ * username,teamname,score,timestamp_in_ms,readable_time
+ * e.g.:
+ * user2_AsparagusPig,AsparagusPig,10,1445230923951,2015-11-02 09:09:28.224
+ *
+ * <p> The Injector writes either to a PubSub topic, or a file. It will use the PubSub topic if
+ * specified. It takes the following arguments:
+ * {@code Injector project-name (topic-name|none) (filename|none)}.
+ *
+ * <p> To run the Injector in the mode where it publishes to PubSub, you will need to authenticate
+ * locally using project-based service account credentials to avoid running over PubSub
+ * quota.
+ * See https://developers.google.com/identity/protocols/application-default-credentials
+ * for more information on using service account credentials. Set the GOOGLE_APPLICATION_CREDENTIALS
+ * environment variable to point to your downloaded service account credentials before starting the
+ * program, e.g.:
+ * {@code export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your/credentials-key.json}.
+ * If you do not do this, then your injector will only run for a few minutes on your
+ * 'user account' credentials before you will start to see quota error messages like:
+ * "Request throttled due to user QPS limit being reached", and see this exception:
+ * "com.google.api.client.googleapis.json.GoogleJsonResponseException: 429 Too Many Requests".
+ * Once you've set up your credentials, run the Injector like this:
+ * <pre>{@code
+ * Injector <project-name> <topic-name> none
+ * }
+ * </pre>
+ * The pubsub topic will be created if it does not exist.
+ *
+ * <p> To run the injector in write-to-file-mode, set the topic name to "none" and specify the
+ * filename:
+ * <pre>{@code
+ * Injector <project-name> none <filename>
+ * }
+ * </pre>
+ */
+class Injector {
+ private static Pubsub pubsub;
+ private static Random random = new Random();
+ private static String topic;
+ private static String project;
+ private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms";
+
+ // QPS ranges from 800 to 1000.
+ private static final int MIN_QPS = 800;
+ private static final int QPS_RANGE = 200;
+ // How long to sleep, in ms, between creation of the threads that make API requests to PubSub.
+ private static final int THREAD_SLEEP_MS = 500;
+
+ // Lists used to generate random team names.
+ private static final ArrayList<String> COLORS =
+ new ArrayList<String>(Arrays.asList(
+ "Magenta", "AliceBlue", "Almond", "Amaranth", "Amber",
+ "Amethyst", "AndroidGreen", "AntiqueBrass", "Fuchsia", "Ruby", "AppleGreen",
+ "Apricot", "Aqua", "ArmyGreen", "Asparagus", "Auburn", "Azure", "Banana",
+ "Beige", "Bisque", "BarnRed", "BattleshipGrey"));
+
+ private static final ArrayList<String> ANIMALS =
+ new ArrayList<String>(Arrays.asList(
+ "Echidna", "Koala", "Wombat", "Marmot", "Quokka", "Kangaroo", "Dingo", "Numbat", "Emu",
+ "Wallaby", "CaneToad", "Bilby", "Possum", "Cassowary", "Kookaburra", "Platypus",
+ "Bandicoot", "Cockatoo", "Antechinus"));
+
+ // The list of live teams.
+ private static ArrayList<TeamInfo> liveTeams = new ArrayList<TeamInfo>();
+
+ private static DateTimeFormatter fmt =
+ DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS")
+ .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST")));
+
+
+ // The total number of robots in the system.
+ private static final int NUM_ROBOTS = 20;
+ // Determines the chance that a team will have a robot team member.
+ private static final int ROBOT_PROBABILITY = 3;
+ private static final int NUM_LIVE_TEAMS = 15;
+ private static final int BASE_MEMBERS_PER_TEAM = 5;
+ private static final int MEMBERS_PER_TEAM = 15;
+ private static final int MAX_SCORE = 20;
+ private static final int LATE_DATA_RATE = 5 * 60 * 2; // Every 600 loop iterations (~5 minutes)
+ private static final int BASE_DELAY_IN_MILLIS = 5 * 60 * 1000; // 5-10 minute delay
+ private static final int FUZZY_DELAY_IN_MILLIS = 5 * 60 * 1000;
+
+ // The minimum time a 'team' can live.
+ private static final int BASE_TEAM_EXPIRATION_TIME_IN_MINS = 20;
+ private static final int TEAM_EXPIRATION_TIME_IN_MINS = 20;
+
+
+ /**
+ * A class for holding team info: the name of the team, when it started,
+ * and the current team members. Teams may but need not include one robot team member.
+ */
+ private static class TeamInfo {
+ String teamName;
+ long startTimeInMillis;
+ int expirationPeriod;
+ // The team might but need not include 1 robot. Will be non-null if so.
+ String robot;
+ int numMembers;
+
+ private TeamInfo(String teamName, long startTimeInMillis, String robot) {
+ this.teamName = teamName;
+ this.startTimeInMillis = startTimeInMillis;
+ // How long until this team is dissolved.
+ this.expirationPeriod = random.nextInt(TEAM_EXPIRATION_TIME_IN_MINS) +
+ BASE_TEAM_EXPIRATION_TIME_IN_MINS;
+ this.robot = robot;
+ // Determine the number of team members.
+ numMembers = random.nextInt(MEMBERS_PER_TEAM) + BASE_MEMBERS_PER_TEAM;
+ }
+
+ String getTeamName() {
+ return teamName;
+ }
+ String getRobot() {
+ return robot;
+ }
+
+ long getStartTimeInMillis() {
+ return startTimeInMillis;
+ }
+ long getEndTimeInMillis() {
+ return startTimeInMillis + (expirationPeriod * 60 * 1000);
+ }
+ String getRandomUser() {
+ int userNum = random.nextInt(numMembers);
+ return "user" + userNum + "_" + teamName;
+ }
+
+ int numMembers() {
+ return numMembers;
+ }
+
+ @Override
+ public String toString() {
+ return "(" + teamName + ", num members: " + numMembers() + ", starting at: "
+ + startTimeInMillis + ", expires in: " + expirationPeriod + ", robot: " + robot + ")";
+ }
+ }
+
+ /** Utility to grab a random element from a list of Strings. */
+ private static String randomElement(ArrayList<String> list) {
+ int index = random.nextInt(list.size());
+ return list.get(index);
+ }
+
+ /**
+ * Get and return a random team. If the selected team is too old w.r.t its expiration, remove
+ * it, replacing it with a new team.
+ */
+ private static TeamInfo randomTeam(ArrayList<TeamInfo> list) {
+ int index = random.nextInt(list.size());
+ TeamInfo team = list.get(index);
+ // If the selected team is expired, remove it and return a new team.
+ long currTime = System.currentTimeMillis();
+ if ((team.getEndTimeInMillis() < currTime) || team.numMembers() == 0) {
+ System.out.println("\nteam " + team + " is too old; replacing.");
+ System.out.println("start time: " + team.getStartTimeInMillis() +
+ ", end time: " + team.getEndTimeInMillis() +
+ ", current time:" + currTime);
+ removeTeam(index);
+ // Add a new team in its stead.
+ return (addLiveTeam());
+ } else {
+ return team;
+ }
+ }
+
+ /**
+ * Create and add a team. Possibly add a robot to the team.
+ */
+ private static synchronized TeamInfo addLiveTeam() {
+ String teamName = randomElement(COLORS) + randomElement(ANIMALS);
+ String robot = null;
+ // Decide if we want to add a robot to the team.
+ if (random.nextInt(ROBOT_PROBABILITY) == 0) {
+ robot = "Robot-" + random.nextInt(NUM_ROBOTS);
+ }
+ // Create the new team.
+ TeamInfo newTeam = new TeamInfo(teamName, System.currentTimeMillis(), robot);
+ liveTeams.add(newTeam);
+ System.out.println("[+" + newTeam + "]");
+ return newTeam;
+ }
+
+ /**
+ * Remove a specific team.
+ */
+ private static synchronized void removeTeam(int teamIndex) {
+ TeamInfo removedTeam = liveTeams.remove(teamIndex);
+ System.out.println("[-" + removedTeam + "]");
+ }
+
+ /** Generate a user gaming event. */
+ private static String generateEvent(Long currTime, int delayInMillis) {
+ TeamInfo team = randomTeam(liveTeams);
+ String teamName = team.getTeamName();
+ String user;
+ final int parseErrorRate = 900000;
+
+ String robot = team.getRobot();
+ // If the team has an associated robot team member...
+ if (robot != null) {
+ // Then use that robot for the message with some probability.
+ // Set this probability to higher than that used to select any of the 'regular' team
+ // members, so that if there is a robot on the team, it has a higher click rate.
+ if (random.nextInt(team.numMembers() / 2) == 0) {
+ user = robot;
+ } else {
+ user = team.getRandomUser();
+ }
+ } else { // No robot.
+ user = team.getRandomUser();
+ }
+ String event = user + "," + teamName + "," + random.nextInt(MAX_SCORE);
+ // Randomly introduce occasional parse errors. You can see a custom counter tracking the number
+ // of such errors in the Dataflow Monitoring UI, as the example pipeline runs.
+ if (random.nextInt(parseErrorRate) == 0) {
+ System.out.println("Introducing a parse error.");
+ event = "THIS LINE REPRESENTS CORRUPT DATA AND WILL CAUSE A PARSE ERROR";
+ }
+ return addTimeInfoToEvent(event, currTime, delayInMillis);
+ }
+
+ /**
+ * Add time info to a generated gaming event.
+ */
+ private static String addTimeInfoToEvent(String message, Long currTime, int delayInMillis) {
+ String eventTimeString =
+ Long.toString((currTime - delayInMillis) / 1000 * 1000);
+ // Add a (redundant) 'human-readable' date string to make the data semantics more clear.
+ String dateString = fmt.print(currTime);
+ message = message + "," + eventTimeString + "," + dateString;
+ return message;
+ }
+
+ /**
+ * Publish 'numMessages' arbitrary events from live users with the provided delay, to a
+ * PubSub topic.
+ */
+ public static void publishData(int numMessages, int delayInMillis)
+ throws IOException {
+ List<PubsubMessage> pubsubMessages = new ArrayList<>();
+
+ for (int i = 0; i < Math.max(1, numMessages); i++) {
+ Long currTime = System.currentTimeMillis();
+ String message = generateEvent(currTime, delayInMillis);
+ PubsubMessage pubsubMessage = new PubsubMessage()
+ .encodeData(message.getBytes("UTF-8"));
+ pubsubMessage.setAttributes(
+ ImmutableMap.of(TIMESTAMP_ATTRIBUTE,
+ Long.toString((currTime - delayInMillis) / 1000 * 1000)));
+ if (delayInMillis != 0) {
+ System.out.println(pubsubMessage.getAttributes());
+ System.out.println("late data for: " + message);
+ }
+ pubsubMessages.add(pubsubMessage);
+ }
+
+ PublishRequest publishRequest = new PublishRequest();
+ publishRequest.setMessages(pubsubMessages);
+ pubsub.projects().topics().publish(topic, publishRequest).execute();
+ }
+
+ /**
+ * Publish generated events to a file.
+ */
+ public static void publishDataToFile(String fileName, int numMessages, int delayInMillis)
+ throws IOException {
+ PrintWriter out = new PrintWriter(new OutputStreamWriter(
+ new BufferedOutputStream(new FileOutputStream(fileName, true)), "UTF-8"));
+
+ try {
+ for (int i = 0; i < Math.max(1, numMessages); i++) {
+ Long currTime = System.currentTimeMillis();
+ String message = generateEvent(currTime, delayInMillis);
+ out.println(message);
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ } finally {
+ if (out != null) {
+ out.flush();
+ out.close();
+ }
+ }
+ }
+
+
+ public static void main(String[] args) throws IOException, InterruptedException {
+ if (args.length < 3) {
+ System.out.println("Usage: Injector project-name (topic-name|none) (filename|none)");
+ System.exit(1);
+ }
+ boolean writeToFile = false;
+ boolean writeToPubsub = true;
+ project = args[0];
+ String topicName = args[1];
+ String fileName = args[2];
+ // The Injector writes either to a PubSub topic, or a file. It will use the PubSub topic if
+ // specified; otherwise, it will try to write to a file.
+ if (topicName.equalsIgnoreCase("none")) {
+ writeToFile = true;
+ writeToPubsub = false;
+ }
+ if (writeToPubsub) {
+ // Create the PubSub client.
+ pubsub = InjectorUtils.getClient();
+ // Create the PubSub topic as necessary.
+ topic = InjectorUtils.getFullyQualifiedTopicName(project, topicName);
+ InjectorUtils.createTopic(pubsub, topic);
+ System.out.println("Injecting to topic: " + topic);
+ } else {
+ if (fileName.equalsIgnoreCase("none")) {
+ System.out.println("Filename not specified.");
+ System.exit(1);
+ }
+ System.out.println("Writing to file: " + fileName);
+ }
+ System.out.println("Starting Injector");
+
+ // Start off with some random live teams.
+ while (liveTeams.size() < NUM_LIVE_TEAMS) {
+ addLiveTeam();
+ }
+
+ // Publish messages at a rate determined by the QPS and Thread sleep settings.
+ for (int i = 0; true; i++) {
+ if (Thread.activeCount() > 10) {
+ System.err.println("I'm falling behind!");
+ }
+
+ // Decide if this should be a batch of late data.
+ final int numMessages;
+ final int delayInMillis;
+ if (i % LATE_DATA_RATE == 0) {
+ // Insert delayed data for one user (one message only)
+ delayInMillis = BASE_DELAY_IN_MILLIS + random.nextInt(FUZZY_DELAY_IN_MILLIS);
+ numMessages = 1;
+ System.out.println("DELAY(" + delayInMillis + ", " + numMessages + ")");
+ } else {
+ System.out.print(".");
+ delayInMillis = 0;
+ numMessages = MIN_QPS + random.nextInt(QPS_RANGE);
+ }
+
+ if (writeToFile) { // Won't use threading for the file write.
+ publishDataToFile(fileName, numMessages, delayInMillis);
+ } else { // Write to PubSub.
+ // Start a thread to inject some data.
+ new Thread(){
+ @Override
+ public void run() {
+ try {
+ publishData(numMessages, delayInMillis);
+ } catch (IOException e) {
+ System.err.println(e);
+ }
+ }
+ }.start();
+ }
+
+ // Wait before creating another injector thread.
+ Thread.sleep(THREAD_SLEEP_MS);
+ }
+ }
+}
[12/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/FinishedTriggersSet.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/FinishedTriggersSet.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/FinishedTriggersSet.java
deleted file mode 100644
index 6da673d..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/FinishedTriggersSet.java
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.common.collect.Sets;
-
-import java.util.Set;
-
-/**
- * An implementation of {@link FinishedTriggers} atop a user-provided mutable {@link Set}.
- */
-public class FinishedTriggersSet implements FinishedTriggers {
-
- private final Set<ExecutableTrigger<?>> finishedTriggers;
-
- private FinishedTriggersSet(Set<ExecutableTrigger<?>> finishedTriggers) {
- this.finishedTriggers = finishedTriggers;
- }
-
- public static FinishedTriggersSet fromSet(Set<ExecutableTrigger<?>> finishedTriggers) {
- return new FinishedTriggersSet(finishedTriggers);
- }
-
- /**
- * Returns a mutable {@link Set} of the underlying triggers that are finished.
- */
- public Set<ExecutableTrigger<?>> getFinishedTriggers() {
- return finishedTriggers;
- }
-
- @Override
- public boolean isFinished(ExecutableTrigger<?> trigger) {
- return finishedTriggers.contains(trigger);
- }
-
- @Override
- public void setFinished(ExecutableTrigger<?> trigger, boolean value) {
- if (value) {
- finishedTriggers.add(trigger);
- } else {
- finishedTriggers.remove(trigger);
- }
- }
-
- @Override
- public void clearRecursively(ExecutableTrigger<?> trigger) {
- finishedTriggers.remove(trigger);
- for (ExecutableTrigger<?> subTrigger : trigger.subTriggers()) {
- clearRecursively(subTrigger);
- }
- }
-
- @Override
- public FinishedTriggersSet copy() {
- return fromSet(Sets.newHashSet(finishedTriggers));
- }
-
-}
-
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GcpCredentialFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GcpCredentialFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GcpCredentialFactory.java
deleted file mode 100644
index 8b6f495..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GcpCredentialFactory.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.client.auth.oauth2.Credential;
-import com.google.cloud.dataflow.sdk.options.GcpOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-
-import java.io.IOException;
-import java.security.GeneralSecurityException;
-
-/**
- * Construct an oauth credential to be used by the SDK and the SDK workers.
- * Returns a GCP credential.
- */
-public class GcpCredentialFactory implements CredentialFactory {
- private GcpOptions options;
-
- private GcpCredentialFactory(GcpOptions options) {
- this.options = options;
- }
-
- public static GcpCredentialFactory fromOptions(PipelineOptions options) {
- return new GcpCredentialFactory(options.as(GcpOptions.class));
- }
-
- @Override
- public Credential getCredential() throws IOException, GeneralSecurityException {
- return Credentials.getCredential(options);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GcsIOChannelFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GcsIOChannelFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GcsIOChannelFactory.java
deleted file mode 100644
index ce933f5..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GcsIOChannelFactory.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.options.GcsOptions;
-import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
-
-import java.io.IOException;
-import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
-import java.util.Collection;
-import java.util.LinkedList;
-import java.util.List;
-
-/**
- * Implements IOChannelFactory for GCS.
- */
-public class GcsIOChannelFactory implements IOChannelFactory {
-
- private final GcsOptions options;
-
- public GcsIOChannelFactory(GcsOptions options) {
- this.options = options;
- }
-
- @Override
- public Collection<String> match(String spec) throws IOException {
- GcsPath path = GcsPath.fromUri(spec);
- GcsUtil util = options.getGcsUtil();
- List<GcsPath> matched = util.expand(path);
-
- List<String> specs = new LinkedList<>();
- for (GcsPath match : matched) {
- specs.add(match.toString());
- }
-
- return specs;
- }
-
- @Override
- public ReadableByteChannel open(String spec) throws IOException {
- GcsPath path = GcsPath.fromUri(spec);
- GcsUtil util = options.getGcsUtil();
- return util.open(path);
- }
-
- @Override
- public WritableByteChannel create(String spec, String mimeType)
- throws IOException {
- GcsPath path = GcsPath.fromUri(spec);
- GcsUtil util = options.getGcsUtil();
- return util.create(path, mimeType);
- }
-
- @Override
- public long getSizeBytes(String spec) throws IOException {
- GcsPath path = GcsPath.fromUri(spec);
- GcsUtil util = options.getGcsUtil();
- return util.fileSize(path);
- }
-
- @Override
- public boolean isReadSeekEfficient(String spec) throws IOException {
- // TODO It is incorrect to return true here for files with content encoding set to gzip.
- return true;
- }
-
- @Override
- public String resolve(String path, String other) throws IOException {
- return GcsPath.fromUri(path).resolve(other).toString();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GcsStager.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GcsStager.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GcsStager.java
deleted file mode 100644
index 4219bc4..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GcsStager.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.services.dataflow.model.DataflowPackage;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineDebugOptions;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.common.base.Preconditions;
-
-import java.util.List;
-
-/**
- * Utility class for staging files to GCS.
- */
-public class GcsStager implements Stager {
- private DataflowPipelineOptions options;
-
- private GcsStager(DataflowPipelineOptions options) {
- this.options = options;
- }
-
- public static GcsStager fromOptions(PipelineOptions options) {
- return new GcsStager(options.as(DataflowPipelineOptions.class));
- }
-
- @Override
- public List<DataflowPackage> stageFiles() {
- Preconditions.checkNotNull(options.getStagingLocation());
- List<String> filesToStage = options.getFilesToStage();
- String windmillBinary =
- options.as(DataflowPipelineDebugOptions.class).getOverrideWindmillBinary();
- if (windmillBinary != null) {
- filesToStage.add("windmill_main=" + windmillBinary);
- }
- return PackageUtil.stageClasspathElements(
- options.getFilesToStage(), options.getStagingLocation());
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GcsUtil.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GcsUtil.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GcsUtil.java
deleted file mode 100644
index 8fd258f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GcsUtil.java
+++ /dev/null
@@ -1,406 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.client.googleapis.json.GoogleJsonResponseException;
-import com.google.api.client.util.BackOff;
-import com.google.api.client.util.Sleeper;
-import com.google.api.services.storage.Storage;
-import com.google.api.services.storage.model.Objects;
-import com.google.api.services.storage.model.StorageObject;
-import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
-import com.google.cloud.dataflow.sdk.options.GcsOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
-import com.google.cloud.hadoop.gcsio.GoogleCloudStorageReadChannel;
-import com.google.cloud.hadoop.gcsio.GoogleCloudStorageWriteChannel;
-import com.google.cloud.hadoop.gcsio.ObjectWriteConditions;
-import com.google.cloud.hadoop.util.ApiErrorExtractor;
-import com.google.cloud.hadoop.util.AsyncWriteChannelOptions;
-import com.google.cloud.hadoop.util.ClientRequestHelper;
-import com.google.cloud.hadoop.util.ResilientOperation;
-import com.google.cloud.hadoop.util.RetryDeterminer;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.ImmutableList;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.nio.channels.SeekableByteChannel;
-import java.nio.channels.WritableByteChannel;
-import java.util.Collections;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.concurrent.ExecutorService;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import javax.annotation.Nullable;
-
-/**
- * Provides operations on GCS.
- */
-public class GcsUtil {
- /**
- * This is a {@link DefaultValueFactory} able to create a {@link GcsUtil} using
- * any transport flags specified on the {@link PipelineOptions}.
- */
- public static class GcsUtilFactory implements DefaultValueFactory<GcsUtil> {
- /**
- * Returns an instance of {@link GcsUtil} based on the
- * {@link PipelineOptions}.
- *
- * <p>If no instance has previously been created, one is created and the value
- * stored in {@code options}.
- */
- @Override
- public GcsUtil create(PipelineOptions options) {
- LOG.debug("Creating new GcsUtil");
- GcsOptions gcsOptions = options.as(GcsOptions.class);
-
- return new GcsUtil(Transport.newStorageClient(gcsOptions).build(),
- gcsOptions.getExecutorService(), gcsOptions.getGcsUploadBufferSizeBytes());
- }
- }
-
- private static final Logger LOG = LoggerFactory.getLogger(GcsUtil.class);
-
- /** Maximum number of items to retrieve per Objects.List request. */
- private static final long MAX_LIST_ITEMS_PER_CALL = 1024;
-
- /** Matches a glob containing a wildcard, capturing the portion before the first wildcard. */
- private static final Pattern GLOB_PREFIX = Pattern.compile("(?<PREFIX>[^\\[*?]*)[\\[*?].*");
-
- private static final String RECURSIVE_WILDCARD = "[*]{2}";
-
- /**
- * A {@link Pattern} for globs with a recursive wildcard.
- */
- private static final Pattern RECURSIVE_GCS_PATTERN =
- Pattern.compile(".*" + RECURSIVE_WILDCARD + ".*");
-
- /////////////////////////////////////////////////////////////////////////////
-
- /** Client for the GCS API. */
- private Storage storageClient;
- /** Buffer size for GCS uploads (in bytes). */
- @Nullable private final Integer uploadBufferSizeBytes;
-
- // Helper delegate for turning IOExceptions from API calls into higher-level semantics.
- private final ApiErrorExtractor errorExtractor = new ApiErrorExtractor();
-
- // Exposed for testing.
- final ExecutorService executorService;
-
- /**
- * Returns true if the given GCS pattern is supported otherwise fails with an
- * exception.
- */
- public boolean isGcsPatternSupported(String gcsPattern) {
- if (RECURSIVE_GCS_PATTERN.matcher(gcsPattern).matches()) {
- throw new IllegalArgumentException("Unsupported wildcard usage in \"" + gcsPattern + "\": "
- + " recursive wildcards are not supported.");
- }
-
- return true;
- }
-
- private GcsUtil(
- Storage storageClient, ExecutorService executorService,
- @Nullable Integer uploadBufferSizeBytes) {
- this.storageClient = storageClient;
- this.uploadBufferSizeBytes = uploadBufferSizeBytes;
- this.executorService = executorService;
- }
-
- // Use this only for testing purposes.
- protected void setStorageClient(Storage storageClient) {
- this.storageClient = storageClient;
- }
-
- /**
- * Expands a pattern into matched paths. The pattern path may contain globs, which are expanded
- * in the result. For patterns that only match a single object, we ensure that the object
- * exists.
- */
- public List<GcsPath> expand(GcsPath gcsPattern) throws IOException {
- Preconditions.checkArgument(isGcsPatternSupported(gcsPattern.getObject()));
- Matcher m = GLOB_PREFIX.matcher(gcsPattern.getObject());
- Pattern p = null;
- String prefix = null;
- if (!m.matches()) {
- // Not a glob.
- Storage.Objects.Get getObject = storageClient.objects().get(
- gcsPattern.getBucket(), gcsPattern.getObject());
- try {
- // Use a get request to fetch the metadata of the object,
- // the request has strong global consistency.
- ResilientOperation.retry(
- ResilientOperation.getGoogleRequestCallable(getObject),
- new AttemptBoundedExponentialBackOff(3, 200),
- RetryDeterminer.SOCKET_ERRORS,
- IOException.class);
- return ImmutableList.of(gcsPattern);
- } catch (IOException | InterruptedException e) {
- if (e instanceof IOException && errorExtractor.itemNotFound((IOException) e)) {
- // If the path was not found, return an empty list.
- return ImmutableList.of();
- }
- throw new IOException("Unable to match files for pattern " + gcsPattern, e);
- }
- } else {
- // Part before the first wildcard character.
- prefix = m.group("PREFIX");
- p = Pattern.compile(globToRegexp(gcsPattern.getObject()));
- }
-
- LOG.debug("matching files in bucket {}, prefix {} against pattern {}", gcsPattern.getBucket(),
- prefix, p.toString());
-
- // List all objects that start with the prefix (including objects in sub-directories).
- Storage.Objects.List listObject = storageClient.objects().list(gcsPattern.getBucket());
- listObject.setMaxResults(MAX_LIST_ITEMS_PER_CALL);
- listObject.setPrefix(prefix);
-
- String pageToken = null;
- List<GcsPath> results = new LinkedList<>();
- do {
- if (pageToken != null) {
- listObject.setPageToken(pageToken);
- }
-
- Objects objects;
- try {
- objects = ResilientOperation.retry(
- ResilientOperation.getGoogleRequestCallable(listObject),
- new AttemptBoundedExponentialBackOff(3, 200),
- RetryDeterminer.SOCKET_ERRORS,
- IOException.class);
- } catch (Exception e) {
- throw new IOException("Unable to match files in bucket " + gcsPattern.getBucket()
- + ", prefix " + prefix + " against pattern " + p.toString(), e);
- }
- //Objects objects = listObject.execute();
- Preconditions.checkNotNull(objects);
-
- if (objects.getItems() == null) {
- break;
- }
-
- // Filter objects based on the regex.
- for (StorageObject o : objects.getItems()) {
- String name = o.getName();
- // Skip directories, which end with a slash.
- if (p.matcher(name).matches() && !name.endsWith("/")) {
- LOG.debug("Matched object: {}", name);
- results.add(GcsPath.fromObject(o));
- }
- }
-
- pageToken = objects.getNextPageToken();
- } while (pageToken != null);
-
- return results;
- }
-
- @VisibleForTesting
- @Nullable
- Integer getUploadBufferSizeBytes() {
- return uploadBufferSizeBytes;
- }
-
- /**
- * Returns the file size from GCS or throws {@link FileNotFoundException}
- * if the resource does not exist.
- */
- public long fileSize(GcsPath path) throws IOException {
- return fileSize(path, new AttemptBoundedExponentialBackOff(4, 200), Sleeper.DEFAULT);
- }
-
- /**
- * Returns the file size from GCS or throws {@link FileNotFoundException}
- * if the resource does not exist.
- */
- @VisibleForTesting
- long fileSize(GcsPath path, BackOff backoff, Sleeper sleeper) throws IOException {
- Storage.Objects.Get getObject =
- storageClient.objects().get(path.getBucket(), path.getObject());
- try {
- StorageObject object = ResilientOperation.retry(
- ResilientOperation.getGoogleRequestCallable(getObject),
- backoff,
- RetryDeterminer.SOCKET_ERRORS,
- IOException.class,
- sleeper);
- return object.getSize().longValue();
- } catch (Exception e) {
- if (e instanceof IOException && errorExtractor.itemNotFound((IOException) e)) {
- throw new FileNotFoundException(path.toString());
- }
- throw new IOException("Unable to get file size", e);
- }
- }
-
- /**
- * Opens an object in GCS.
- *
- * <p>Returns a SeekableByteChannel that provides access to data in the bucket.
- *
- * @param path the GCS filename to read from
- * @return a SeekableByteChannel that can read the object data
- * @throws IOException
- */
- public SeekableByteChannel open(GcsPath path)
- throws IOException {
- return new GoogleCloudStorageReadChannel(storageClient, path.getBucket(),
- path.getObject(), errorExtractor,
- new ClientRequestHelper<StorageObject>());
- }
-
- /**
- * Creates an object in GCS.
- *
- * <p>Returns a WritableByteChannel that can be used to write data to the
- * object.
- *
- * @param path the GCS file to write to
- * @param type the type of object, eg "text/plain".
- * @return a Callable object that encloses the operation.
- * @throws IOException
- */
- public WritableByteChannel create(GcsPath path,
- String type) throws IOException {
- GoogleCloudStorageWriteChannel channel = new GoogleCloudStorageWriteChannel(
- executorService,
- storageClient,
- new ClientRequestHelper<StorageObject>(),
- path.getBucket(),
- path.getObject(),
- AsyncWriteChannelOptions.newBuilder().build(),
- new ObjectWriteConditions(),
- Collections.<String, String>emptyMap(),
- type);
- if (uploadBufferSizeBytes != null) {
- channel.setUploadBufferSize(uploadBufferSizeBytes);
- }
- channel.initialize();
- return channel;
- }
-
- /**
- * Returns whether the GCS bucket exists. If the bucket exists, it must
- * be accessible otherwise the permissions exception will be propagated.
- */
- public boolean bucketExists(GcsPath path) throws IOException {
- return bucketExists(path, new AttemptBoundedExponentialBackOff(4, 200), Sleeper.DEFAULT);
- }
-
- /**
- * Returns whether the GCS bucket exists. This will return false if the bucket
- * is inaccessible due to permissions.
- */
- @VisibleForTesting
- boolean bucketExists(GcsPath path, BackOff backoff, Sleeper sleeper) throws IOException {
- Storage.Buckets.Get getBucket =
- storageClient.buckets().get(path.getBucket());
-
- try {
- ResilientOperation.retry(
- ResilientOperation.getGoogleRequestCallable(getBucket),
- backoff,
- new RetryDeterminer<IOException>() {
- @Override
- public boolean shouldRetry(IOException e) {
- if (errorExtractor.itemNotFound(e) || errorExtractor.accessDenied(e)) {
- return false;
- }
- return RetryDeterminer.SOCKET_ERRORS.shouldRetry(e);
- }
- },
- IOException.class,
- sleeper);
- return true;
- } catch (GoogleJsonResponseException e) {
- if (errorExtractor.itemNotFound(e) || errorExtractor.accessDenied(e)) {
- return false;
- }
- throw e;
- } catch (InterruptedException e) {
- throw new IOException(
- String.format("Error while attempting to verify existence of bucket gs://%s",
- path.getBucket()), e);
- }
- }
-
- /**
- * Expands glob expressions to regular expressions.
- *
- * @param globExp the glob expression to expand
- * @return a string with the regular expression this glob expands to
- */
- static String globToRegexp(String globExp) {
- StringBuilder dst = new StringBuilder();
- char[] src = globExp.toCharArray();
- int i = 0;
- while (i < src.length) {
- char c = src[i++];
- switch (c) {
- case '*':
- dst.append("[^/]*");
- break;
- case '?':
- dst.append("[^/]");
- break;
- case '.':
- case '+':
- case '{':
- case '}':
- case '(':
- case ')':
- case '|':
- case '^':
- case '$':
- // These need to be escaped in regular expressions
- dst.append('\\').append(c);
- break;
- case '\\':
- i = doubleSlashes(dst, src, i);
- break;
- default:
- dst.append(c);
- break;
- }
- }
- return dst.toString();
- }
-
- private static int doubleSlashes(StringBuilder dst, char[] src, int i) {
- // Emit the next character without special interpretation
- dst.append('\\');
- if ((i - 1) != src.length) {
- dst.append(src[i]);
- i++;
- } else {
- // A backslash at the very end is treated like an escaped backslash
- dst.append('\\');
- }
- return i;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GroupAlsoByWindowViaWindowSetDoFn.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GroupAlsoByWindowViaWindowSetDoFn.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GroupAlsoByWindowViaWindowSetDoFn.java
deleted file mode 100644
index 89a4fcb..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GroupAlsoByWindowViaWindowSetDoFn.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.Sum;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.DoFnRunner.ReduceFnExecutor;
-import com.google.cloud.dataflow.sdk.util.TimerInternals.TimerData;
-import com.google.cloud.dataflow.sdk.util.state.StateInternals;
-import com.google.cloud.dataflow.sdk.values.KV;
-
-/**
- * A general {@link GroupAlsoByWindowsDoFn}. This delegates all of the logic to the
- * {@link ReduceFnRunner}.
- */
-@SystemDoFnInternal
-public class GroupAlsoByWindowViaWindowSetDoFn<
- K, InputT, OutputT, W extends BoundedWindow, RinT extends KeyedWorkItem<K, InputT>>
- extends DoFn<RinT, KV<K, OutputT>> implements ReduceFnExecutor<K, InputT, OutputT, W> {
-
- public static <K, InputT, OutputT, W extends BoundedWindow>
- DoFn<KeyedWorkItem<K, InputT>, KV<K, OutputT>> create(
- WindowingStrategy<?, W> strategy, SystemReduceFn<K, InputT, ?, OutputT, W> reduceFn) {
- return new GroupAlsoByWindowViaWindowSetDoFn<>(strategy, reduceFn);
- }
-
- protected final Aggregator<Long, Long> droppedDueToClosedWindow =
- createAggregator(
- GroupAlsoByWindowsDoFn.DROPPED_DUE_TO_CLOSED_WINDOW_COUNTER, new Sum.SumLongFn());
- protected final Aggregator<Long, Long> droppedDueToLateness =
- createAggregator(GroupAlsoByWindowsDoFn.DROPPED_DUE_TO_LATENESS_COUNTER, new Sum.SumLongFn());
-
- private final WindowingStrategy<Object, W> windowingStrategy;
- private SystemReduceFn<K, InputT, ?, OutputT, W> reduceFn;
-
- private GroupAlsoByWindowViaWindowSetDoFn(
- WindowingStrategy<?, W> windowingStrategy,
- SystemReduceFn<K, InputT, ?, OutputT, W> reduceFn) {
- @SuppressWarnings("unchecked")
- WindowingStrategy<Object, W> noWildcard = (WindowingStrategy<Object, W>) windowingStrategy;
- this.windowingStrategy = noWildcard;
- this.reduceFn = reduceFn;
- }
-
- @Override
- public void processElement(ProcessContext c) throws Exception {
- KeyedWorkItem<K, InputT> element = c.element();
-
- K key = c.element().key();
- TimerInternals timerInternals = c.windowingInternals().timerInternals();
-
- // It is the responsibility of the user of GroupAlsoByWindowsViaWindowSet to only
- // provide a WindowingInternals instance with the appropriate key type for StateInternals.
- @SuppressWarnings("unchecked")
- StateInternals<K> stateInternals = (StateInternals<K>) c.windowingInternals().stateInternals();
-
- ReduceFnRunner<K, InputT, OutputT, W> reduceFnRunner =
- new ReduceFnRunner<>(
- key,
- windowingStrategy,
- stateInternals,
- timerInternals,
- c.windowingInternals(),
- droppedDueToClosedWindow,
- reduceFn,
- c.getPipelineOptions());
-
- reduceFnRunner.processElements(element.elementsIterable());
- for (TimerData timer : element.timersIterable()) {
- reduceFnRunner.onTimer(timer);
- }
- reduceFnRunner.persist();
- }
-
- @Override
- public DoFn<KeyedWorkItem<K, InputT>, KV<K, OutputT>> asDoFn() {
- // Safe contravariant cast
- @SuppressWarnings("unchecked")
- DoFn<KeyedWorkItem<K, InputT>, KV<K, OutputT>> asFn =
- (DoFn<KeyedWorkItem<K, InputT>, KV<K, OutputT>>) this;
- return asFn;
- }
-
- @Override
- public Aggregator<Long, Long> getDroppedDueToLatenessAggregator() {
- return droppedDueToLateness;
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GroupAlsoByWindowsDoFn.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GroupAlsoByWindowsDoFn.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GroupAlsoByWindowsDoFn.java
deleted file mode 100644
index 175921d..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GroupAlsoByWindowsDoFn.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.Sum;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.values.KV;
-
-/**
- * DoFn that merges windows and groups elements in those windows, optionally
- * combining values.
- *
- * @param <K> key type
- * @param <InputT> input value element type
- * @param <OutputT> output value element type
- * @param <W> window type
- */
-@SystemDoFnInternal
-public abstract class GroupAlsoByWindowsDoFn<K, InputT, OutputT, W extends BoundedWindow>
- extends DoFn<KV<K, Iterable<WindowedValue<InputT>>>, KV<K, OutputT>> {
- public static final String DROPPED_DUE_TO_CLOSED_WINDOW_COUNTER = "DroppedDueToClosedWindow";
- public static final String DROPPED_DUE_TO_LATENESS_COUNTER = "DroppedDueToLateness";
-
- protected final Aggregator<Long, Long> droppedDueToClosedWindow =
- createAggregator(DROPPED_DUE_TO_CLOSED_WINDOW_COUNTER, new Sum.SumLongFn());
- protected final Aggregator<Long, Long> droppedDueToLateness =
- createAggregator(DROPPED_DUE_TO_LATENESS_COUNTER, new Sum.SumLongFn());
-
- /**
- * Create the default {@link GroupAlsoByWindowsDoFn}, which uses window sets to implement the
- * grouping.
- *
- * @param windowingStrategy The window function and trigger to use for grouping
- * @param inputCoder the input coder to use
- */
- public static <K, V, W extends BoundedWindow> GroupAlsoByWindowsDoFn<K, V, Iterable<V>, W>
- createDefault(WindowingStrategy<?, W> windowingStrategy, Coder<V> inputCoder) {
- return new GroupAlsoByWindowsViaOutputBufferDoFn<>(
- windowingStrategy, SystemReduceFn.<K, V, W>buffering(inputCoder));
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GroupAlsoByWindowsViaOutputBufferDoFn.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GroupAlsoByWindowsViaOutputBufferDoFn.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GroupAlsoByWindowsViaOutputBufferDoFn.java
deleted file mode 100644
index d394e81..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/GroupAlsoByWindowsViaOutputBufferDoFn.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.state.StateInternals;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.common.collect.Iterables;
-
-import org.joda.time.Instant;
-
-import java.util.List;
-
-/**
- * The default batch {@link GroupAlsoByWindowsDoFn} implementation, if no specialized "fast path"
- * implementation is applicable.
- */
-@SystemDoFnInternal
-public class GroupAlsoByWindowsViaOutputBufferDoFn<K, InputT, OutputT, W extends BoundedWindow>
- extends GroupAlsoByWindowsDoFn<K, InputT, OutputT, W> {
-
- private final WindowingStrategy<?, W> strategy;
- private SystemReduceFn<K, InputT, ?, OutputT, W> reduceFn;
-
- public GroupAlsoByWindowsViaOutputBufferDoFn(
- WindowingStrategy<?, W> windowingStrategy,
- SystemReduceFn<K, InputT, ?, OutputT, W> reduceFn) {
- this.strategy = windowingStrategy;
- this.reduceFn = reduceFn;
- }
-
- @Override
- public void processElement(
- DoFn<KV<K, Iterable<WindowedValue<InputT>>>, KV<K, OutputT>>.ProcessContext c)
- throws Exception {
- K key = c.element().getKey();
- // Used with Batch, we know that all the data is available for this key. We can't use the
- // timer manager from the context because it doesn't exist. So we create one and emulate the
- // watermark, knowing that we have all data and it is in timestamp order.
- BatchTimerInternals timerInternals = new BatchTimerInternals(Instant.now());
-
- // It is the responsibility of the user of GroupAlsoByWindowsViaOutputBufferDoFn to only
- // provide a WindowingInternals instance with the appropriate key type for StateInternals.
- @SuppressWarnings("unchecked")
- StateInternals<K> stateInternals = (StateInternals<K>) c.windowingInternals().stateInternals();
-
- ReduceFnRunner<K, InputT, OutputT, W> reduceFnRunner =
- new ReduceFnRunner<K, InputT, OutputT, W>(
- key,
- strategy,
- stateInternals,
- timerInternals,
- c.windowingInternals(),
- droppedDueToClosedWindow,
- reduceFn,
- c.getPipelineOptions());
-
- Iterable<List<WindowedValue<InputT>>> chunks =
- Iterables.partition(c.element().getValue(), 1000);
- for (Iterable<WindowedValue<InputT>> chunk : chunks) {
- // Process the chunk of elements.
- reduceFnRunner.processElements(chunk);
-
- // Then, since elements are sorted by their timestamp, advance the input watermark
- // to the first element, and fire any timers that may have been scheduled.
- timerInternals.advanceInputWatermark(reduceFnRunner, chunk.iterator().next().getTimestamp());
-
- // Fire any processing timers that need to fire
- timerInternals.advanceProcessingTime(reduceFnRunner, Instant.now());
-
- // Leave the output watermark undefined. Since there's no late data in batch mode
- // there's really no need to track it as we do for streaming.
- }
-
- // Finish any pending windows by advancing the input watermark to infinity.
- timerInternals.advanceInputWatermark(reduceFnRunner, BoundedWindow.TIMESTAMP_MAX_VALUE);
-
- // Finally, advance the processing time to infinity to fire any timers.
- timerInternals.advanceProcessingTime(reduceFnRunner, BoundedWindow.TIMESTAMP_MAX_VALUE);
-
- reduceFnRunner.persist();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/IOChannelFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/IOChannelFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/IOChannelFactory.java
deleted file mode 100644
index f7d0b9a..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/IOChannelFactory.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
-import java.util.Collection;
-
-/**
- * Defines a factory for working with read and write channels.
- *
- * <p>Channels provide an abstract API for IO operations.
- *
- * <p>See <a href="http://docs.oracle.com/javase/7/docs/api/java/nio/channels/package-summary.html"
- * >Java NIO Channels</a>
- */
-public interface IOChannelFactory {
-
- /**
- * Matches a specification, which may contain globs, against available
- * resources.
- *
- * <p>Glob handling is dependent on the implementation. Implementations should
- * all support globs in the final component of a path (e.g. /foo/bar/*.txt),
- * however they are not required to support globs in the directory paths.
- *
- * <p>The list of resources returned are required to exist and not represent abstract
- * resources such as symlinks and directories.
- */
- Collection<String> match(String spec) throws IOException;
-
- /**
- * Returns a read channel for the given specification.
- *
- * <p>The specification is not expanded; it is used verbatim.
- *
- * <p>If seeking is supported, then this returns a
- * {@link java.nio.channels.SeekableByteChannel}.
- */
- ReadableByteChannel open(String spec) throws IOException;
-
- /**
- * Returns a write channel for the given specification.
- *
- * <p>The specification is not expanded; it is used verbatim.
- */
- WritableByteChannel create(String spec, String mimeType) throws IOException;
-
- /**
- * Returns the size in bytes for the given specification.
- *
- * <p>The specification is not expanded; it is used verbatim.
- *
- * <p>{@link FileNotFoundException} will be thrown if the resource does not exist.
- */
- long getSizeBytes(String spec) throws IOException;
-
- /**
- * Returns {@code true} if the channel created when invoking method {@link #open} for the given
- * file specification is guaranteed to be of type {@link java.nio.channels.SeekableByteChannel
- * SeekableByteChannel} and if seeking into positions of the channel is recommended. Returns
- * {@code false} if the channel returned is not a {@code SeekableByteChannel}. May return
- * {@code false} even if the channel returned is a {@code SeekableByteChannel}, if seeking is not
- * efficient for the given file specification.
- *
- * <p>Only efficiently seekable files can be split into offset ranges.
- *
- * <p>The specification is not expanded; it is used verbatim.
- */
- boolean isReadSeekEfficient(String spec) throws IOException;
-
- /**
- * Resolve the given {@code other} against the {@code path}.
- *
- * <p>If the {@code other} parameter is an absolute path then this method trivially returns
- * other. If {@code other} is an empty path then this method trivially returns the given
- * {@code path}. Otherwise this method considers the given {@code path} to be a directory and
- * resolves the {@code other} path against this path. In the simplest case, the {@code other}
- * path does not have a root component, in which case this method joins the {@code other} path
- * to the given {@code path} and returns a resulting path that ends with the {@code other} path.
- * Where the {@code other} path has a root component then resolution is highly implementation
- * dependent and therefore unspecified.
- */
- public String resolve(String path, String other) throws IOException;
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/IOChannelUtils.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/IOChannelUtils.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/IOChannelUtils.java
deleted file mode 100644
index cbf420e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/IOChannelUtils.java
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.options.GcsOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.nio.channels.WritableByteChannel;
-import java.text.DecimalFormat;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * Provides utilities for creating read and write channels.
- */
-public class IOChannelUtils {
- // TODO: add registration mechanism for adding new schemes.
- private static final Map<String, IOChannelFactory> FACTORY_MAP =
- Collections.synchronizedMap(new HashMap<String, IOChannelFactory>());
-
- // Pattern that matches shard placeholders within a shard template.
- private static final Pattern SHARD_FORMAT_RE = Pattern.compile("(S+|N+)");
-
- /**
- * Associates a scheme with an {@link IOChannelFactory}.
- *
- * <p>The given factory is used to construct read and write channels when
- * a URI is provided with the given scheme.
- *
- * <p>For example, when reading from "gs://bucket/path", the scheme "gs" is
- * used to lookup the appropriate factory.
- */
- public static void setIOFactory(String scheme, IOChannelFactory factory) {
- FACTORY_MAP.put(scheme, factory);
- }
-
- /**
- * Registers standard factories globally. This requires {@link PipelineOptions}
- * to provide, e.g., credentials for GCS.
- */
- public static void registerStandardIOFactories(PipelineOptions options) {
- setIOFactory("gs", new GcsIOChannelFactory(options.as(GcsOptions.class)));
- }
-
- /**
- * Creates a write channel for the given filename.
- */
- public static WritableByteChannel create(String filename, String mimeType)
- throws IOException {
- return getFactory(filename).create(filename, mimeType);
- }
-
- /**
- * Creates a write channel for the given file components.
- *
- * <p>If numShards is specified, then a ShardingWritableByteChannel is
- * returned.
- *
- * <p>Shard numbers are 0 based, meaning they start with 0 and end at the
- * number of shards - 1.
- */
- public static WritableByteChannel create(String prefix, String shardTemplate,
- String suffix, int numShards, String mimeType) throws IOException {
- if (numShards == 1) {
- return create(constructName(prefix, shardTemplate, suffix, 0, 1),
- mimeType);
- }
-
- // It is the caller's responsibility to close this channel.
- @SuppressWarnings("resource")
- ShardingWritableByteChannel shardingChannel =
- new ShardingWritableByteChannel();
-
- Set<String> outputNames = new HashSet<>();
- for (int i = 0; i < numShards; i++) {
- String outputName =
- constructName(prefix, shardTemplate, suffix, i, numShards);
- if (!outputNames.add(outputName)) {
- throw new IllegalArgumentException(
- "Shard name collision detected for: " + outputName);
- }
- WritableByteChannel channel = create(outputName, mimeType);
- shardingChannel.addChannel(channel);
- }
-
- return shardingChannel;
- }
-
- /**
- * Returns the size in bytes for the given specification.
- *
- * <p>The specification is not expanded; it is used verbatim.
- *
- * <p>{@link FileNotFoundException} will be thrown if the resource does not exist.
- */
- public static long getSizeBytes(String spec) throws IOException {
- return getFactory(spec).getSizeBytes(spec);
- }
-
- /**
- * Constructs a fully qualified name from components.
- *
- * <p>The name is built from a prefix, shard template (with shard numbers
- * applied), and a suffix. All components are required, but may be empty
- * strings.
- *
- * <p>Within a shard template, repeating sequences of the letters "S" or "N"
- * are replaced with the shard number, or number of shards respectively. The
- * numbers are formatted with leading zeros to match the length of the
- * repeated sequence of letters.
- *
- * <p>For example, if prefix = "output", shardTemplate = "-SSS-of-NNN", and
- * suffix = ".txt", with shardNum = 1 and numShards = 100, the following is
- * produced: "output-001-of-100.txt".
- */
- public static String constructName(String prefix,
- String shardTemplate, String suffix, int shardNum, int numShards) {
- // Matcher API works with StringBuffer, rather than StringBuilder.
- StringBuffer sb = new StringBuffer();
- sb.append(prefix);
-
- Matcher m = SHARD_FORMAT_RE.matcher(shardTemplate);
- while (m.find()) {
- boolean isShardNum = (m.group(1).charAt(0) == 'S');
-
- char[] zeros = new char[m.end() - m.start()];
- Arrays.fill(zeros, '0');
- DecimalFormat df = new DecimalFormat(String.valueOf(zeros));
- String formatted = df.format(isShardNum
- ? shardNum
- : numShards);
- m.appendReplacement(sb, formatted);
- }
- m.appendTail(sb);
-
- sb.append(suffix);
- return sb.toString();
- }
-
- private static final Pattern URI_SCHEME_PATTERN = Pattern.compile(
- "(?<scheme>[a-zA-Z][-a-zA-Z0-9+.]*)://.*");
-
- /**
- * Returns the IOChannelFactory associated with an input specification.
- */
- public static IOChannelFactory getFactory(String spec) throws IOException {
- // The spec is almost, but not quite, a URI. In particular,
- // the reserved characters '[', ']', and '?' have meanings that differ
- // from their use in the URI spec. ('*' is not reserved).
- // Here, we just need the scheme, which is so circumscribed as to be
- // very easy to extract with a regex.
- Matcher matcher = URI_SCHEME_PATTERN.matcher(spec);
-
- if (!matcher.matches()) {
- return new FileIOChannelFactory();
- }
-
- String scheme = matcher.group("scheme");
- IOChannelFactory ioFactory = FACTORY_MAP.get(scheme);
- if (ioFactory != null) {
- return ioFactory;
- }
-
- throw new IOException("Unable to find handler for " + spec);
- }
-
- /**
- * Resolve the given {@code other} against the {@code path}.
- *
- * <p>If the {@code other} parameter is an absolute path then this method trivially returns
- * other. If {@code other} is an empty path then this method trivially returns the given
- * {@code path}. Otherwise this method considers the given {@code path} to be a directory and
- * resolves the {@code other} path against this path. In the simplest case, the {@code other}
- * path does not have a root component, in which case this method joins the {@code other} path
- * to the given {@code path} and returns a resulting path that ends with the {@code other} path.
- * Where the {@code other} path has a root component then resolution is highly implementation
- * dependent and therefore unspecified.
- */
- public static String resolve(String path, String other) throws IOException {
- return getFactory(path).resolve(path, other);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/IllegalMutationException.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/IllegalMutationException.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/IllegalMutationException.java
deleted file mode 100644
index dbe249e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/IllegalMutationException.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-/**
- * Thrown when a value appears to have been mutated, but that mutation is forbidden.
- */
-public class IllegalMutationException extends RuntimeException {
- private Object savedValue;
- private Object newValue;
-
- public IllegalMutationException(String message, Object savedValue, Object newValue) {
- super(message);
- this.savedValue = savedValue;
- this.newValue = newValue;
- }
-
- public IllegalMutationException(
- String message, Object savedValue, Object newValue, Throwable cause) {
- super(message, cause);
- this.savedValue = savedValue;
- this.newValue = newValue;
- }
-
- /**
- * The original value, before the illegal mutation.
- */
- public Object getSavedValue() {
- return savedValue;
- }
-
- /**
- * The value after the illegal mutation.
- */
- public Object getNewValue() {
- return newValue;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/InstanceBuilder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/InstanceBuilder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/InstanceBuilder.java
deleted file mode 100644
index 99442d0..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/InstanceBuilder.java
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-import com.google.common.base.Joiner;
-import com.google.common.base.Preconditions;
-
-import java.lang.reflect.Constructor;
-import java.lang.reflect.InvocationTargetException;
-import java.lang.reflect.Method;
-import java.lang.reflect.Modifier;
-import java.util.LinkedList;
-import java.util.List;
-
-import javax.annotation.Nullable;
-
-/**
- * Utility for creating objects dynamically.
- *
- * @param <T> the type of object returned by this instance builder
- */
-public class InstanceBuilder<T> {
-
- /**
- * Create an InstanceBuilder for the given type.
- *
- * <p>The specified type is the type returned by {@link #build}, which is
- * typically the common base type or interface of the instance being
- * constructed.
- */
- public static <T> InstanceBuilder<T> ofType(Class<T> type) {
- return new InstanceBuilder<>(type);
- }
-
- /**
- * Create an InstanceBuilder for the given type.
- *
- * <p>The specified type is the type returned by {@link #build}, which is
- * typically the common base type or interface for the instance to be
- * constructed.
- *
- * <p>The TypeDescriptor argument allows specification of generic types. For example,
- * a {@code List<String>} return type can be specified as
- * {@code ofType(new TypeDescriptor<List<String>>(){})}.
- */
- public static <T> InstanceBuilder<T> ofType(TypeDescriptor<T> token) {
- @SuppressWarnings("unchecked")
- Class<T> type = (Class<T>) token.getRawType();
- return new InstanceBuilder<>(type);
- }
-
- /**
- * Sets the class name to be constructed.
- *
- * <p>If the name is a simple name (i.e. {@link Class#getSimpleName()}), then
- * the package of the return type is added as a prefix.
- *
- * <p>The default class is the return type, specified in {@link #ofType}.
- *
- * <p>Modifies and returns the {@code InstanceBuilder} for chaining.
- *
- * @throws ClassNotFoundException if no class can be found by the given name
- */
- public InstanceBuilder<T> fromClassName(String name)
- throws ClassNotFoundException {
- Preconditions.checkArgument(factoryClass == null,
- "Class name may only be specified once");
- if (name.indexOf('.') == -1) {
- name = type.getPackage().getName() + "." + name;
- }
-
- try {
- factoryClass = Class.forName(name);
- } catch (ClassNotFoundException e) {
- throw new ClassNotFoundException(
- String.format("Could not find class: %s", name), e);
- }
- return this;
- }
-
- /**
- * Sets the factory class to use for instance construction.
- *
- * <p>Modifies and returns the {@code InstanceBuilder} for chaining.
- */
- public InstanceBuilder<T> fromClass(Class<?> factoryClass) {
- this.factoryClass = factoryClass;
- return this;
- }
-
- /**
- * Sets the name of the factory method used to construct the instance.
- *
- * <p>The default, if no factory method was specified, is to look for a class
- * constructor.
- *
- * <p>Modifies and returns the {@code InstanceBuilder} for chaining.
- */
- public InstanceBuilder<T> fromFactoryMethod(String methodName) {
- Preconditions.checkArgument(this.methodName == null,
- "Factory method name may only be specified once");
- this.methodName = methodName;
- return this;
- }
-
- /**
- * Adds an argument to be passed to the factory method.
- *
- * <p>The argument type is used to lookup the factory method. This type may be
- * a supertype of the argument value's class.
- *
- * <p>Modifies and returns the {@code InstanceBuilder} for chaining.
- *
- * @param <ArgT> the argument type
- */
- public <ArgT> InstanceBuilder<T> withArg(Class<? super ArgT> argType, ArgT value) {
- parameterTypes.add(argType);
- arguments.add(value);
- return this;
- }
-
- /**
- * Creates the instance by calling the factory method with the given
- * arguments.
- *
- * <h3>Defaults</h3>
- * <ul>
- * <li>factory class: defaults to the output type class, overridden
- * via {@link #fromClassName(String)} or {@link #fromClass(Class)}.
- * <li>factory method: defaults to using a constructor on the factory
- * class, overridden via {@link #fromFactoryMethod(String)}.
- * </ul>
- *
- * @throws RuntimeException if the method does not exist, on type mismatch,
- * or if the method cannot be made accessible.
- */
- public T build() {
- if (factoryClass == null) {
- factoryClass = type;
- }
-
- Class<?>[] types = parameterTypes
- .toArray(new Class<?>[parameterTypes.size()]);
-
- // TODO: cache results, to speed repeated type lookups?
- if (methodName != null) {
- return buildFromMethod(types);
- } else {
- return buildFromConstructor(types);
- }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * Type of object to construct.
- */
- private final Class<T> type;
-
- /**
- * Types of parameters for Method lookup.
- *
- * @see Class#getDeclaredMethod(String, Class[])
- */
- private final List<Class<?>> parameterTypes = new LinkedList<>();
-
- /**
- * Arguments to factory method {@link Method#invoke(Object, Object...)}.
- */
- private final List<Object> arguments = new LinkedList<>();
-
- /**
- * Name of factory method, or null to invoke the constructor.
- */
- @Nullable private String methodName;
-
- /**
- * Factory class, or null to instantiate {@code type}.
- */
- @Nullable private Class<?> factoryClass;
-
- private InstanceBuilder(Class<T> type) {
- this.type = type;
- }
-
- private T buildFromMethod(Class<?>[] types) {
- Preconditions.checkState(factoryClass != null);
- Preconditions.checkState(methodName != null);
-
- try {
- Method method = factoryClass.getDeclaredMethod(methodName, types);
-
- Preconditions.checkState(Modifier.isStatic(method.getModifiers()),
- "Factory method must be a static method for "
- + factoryClass.getName() + "#" + method.getName()
- );
-
- Preconditions.checkState(type.isAssignableFrom(method.getReturnType()),
- "Return type for " + factoryClass.getName() + "#" + method.getName()
- + " must be assignable to " + type.getSimpleName());
-
- if (!method.isAccessible()) {
- method.setAccessible(true);
- }
-
- Object[] args = arguments.toArray(new Object[arguments.size()]);
- return type.cast(method.invoke(null, args));
-
- } catch (NoSuchMethodException e) {
- throw new RuntimeException(
- String.format("Unable to find factory method %s#%s(%s)",
- factoryClass.getSimpleName(),
- methodName,
- Joiner.on(", ").join(types)));
-
- } catch (IllegalAccessException | InvocationTargetException e) {
- throw new RuntimeException(
- String.format("Failed to construct instance from factory method %s#%s(%s)",
- factoryClass.getSimpleName(),
- methodName,
- Joiner.on(", ").join(types)),
- e);
- }
- }
-
- private T buildFromConstructor(Class<?>[] types) {
- Preconditions.checkState(factoryClass != null);
-
- try {
- Constructor<?> constructor = factoryClass.getDeclaredConstructor(types);
-
- Preconditions.checkState(type.isAssignableFrom(factoryClass),
- "Instance type " + factoryClass.getName()
- + " must be assignable to " + type.getSimpleName());
-
- if (!constructor.isAccessible()) {
- constructor.setAccessible(true);
- }
-
- Object[] args = arguments.toArray(new Object[arguments.size()]);
- return type.cast(constructor.newInstance(args));
-
- } catch (NoSuchMethodException e) {
- throw new RuntimeException("Unable to find constructor for "
- + factoryClass.getName());
-
- } catch (InvocationTargetException |
- InstantiationException |
- IllegalAccessException e) {
- throw new RuntimeException("Failed to construct instance from "
- + "constructor " + factoryClass.getName(), e);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/IntervalBoundedExponentialBackOff.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/IntervalBoundedExponentialBackOff.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/IntervalBoundedExponentialBackOff.java
deleted file mode 100644
index 4406ee5..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/IntervalBoundedExponentialBackOff.java
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.client.util.BackOff;
-import com.google.common.base.Preconditions;
-
-/**
- * Implementation of {@link BackOff} that increases the back off period for each retry attempt
- * using a randomization function that grows exponentially.
- *
- * <p>Example: The initial interval is .5 seconds and the maximum interval is 60 secs.
- * For 14 tries the sequence will be (values in seconds):
- *
- * <pre>
- * retry# retry_interval randomized_interval
- * 1 0.5 [0.25, 0.75]
- * 2 0.75 [0.375, 1.125]
- * 3 1.125 [0.562, 1.687]
- * 4 1.687 [0.8435, 2.53]
- * 5 2.53 [1.265, 3.795]
- * 6 3.795 [1.897, 5.692]
- * 7 5.692 [2.846, 8.538]
- * 8 8.538 [4.269, 12.807]
- * 9 12.807 [6.403, 19.210]
- * 10 19.221 [9.610, 28.832]
- * 11 28.832 [14.416, 43.248]
- * 12 43.248 [21.624, 64.873]
- * 13 60.0 [30.0, 90.0]
- * 14 60.0 [30.0, 90.0]
- * </pre>
- *
- * <p>Implementation is not thread-safe.
- */
-public class IntervalBoundedExponentialBackOff implements BackOff {
- public static final double DEFAULT_MULTIPLIER = 1.5;
- public static final double DEFAULT_RANDOMIZATION_FACTOR = 0.5;
- private final long maximumIntervalMillis;
- private final long initialIntervalMillis;
- private int currentAttempt;
-
- public IntervalBoundedExponentialBackOff(int maximumIntervalMillis, long initialIntervalMillis) {
- Preconditions.checkArgument(
- maximumIntervalMillis > 0, "Maximum interval must be greater than zero.");
- Preconditions.checkArgument(
- initialIntervalMillis > 0, "Initial interval must be greater than zero.");
- this.maximumIntervalMillis = maximumIntervalMillis;
- this.initialIntervalMillis = initialIntervalMillis;
- reset();
- }
-
- @Override
- public void reset() {
- currentAttempt = 1;
- }
-
- @Override
- public long nextBackOffMillis() {
- double currentIntervalMillis =
- Math.min(
- initialIntervalMillis * Math.pow(DEFAULT_MULTIPLIER, currentAttempt - 1),
- maximumIntervalMillis);
- double randomOffset =
- (Math.random() * 2 - 1) * DEFAULT_RANDOMIZATION_FACTOR * currentIntervalMillis;
- currentAttempt += 1;
- return Math.round(currentIntervalMillis + randomOffset);
- }
-
- public boolean atMaxInterval() {
- return initialIntervalMillis * Math.pow(DEFAULT_MULTIPLIER, currentAttempt - 1)
- >= maximumIntervalMillis;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/KeyedWorkItem.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/KeyedWorkItem.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/KeyedWorkItem.java
deleted file mode 100644
index 355f0bb..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/KeyedWorkItem.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.util.TimerInternals.TimerData;
-
-/**
- * Interface that contains all the timers and elements associated with a specific work item.
- *
- * @param <K> the key type
- * @param <ElemT> the element type
- */
-public interface KeyedWorkItem<K, ElemT> {
- /**
- * Returns the key.
- */
- K key();
-
- /**
- * Returns an iterable containing the timers.
- */
- Iterable<TimerData> timersIterable();
-
- /**
- * Returns an iterable containing the elements.
- */
- Iterable<WindowedValue<ElemT>> elementsIterable();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/KeyedWorkItemCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/KeyedWorkItemCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/KeyedWorkItemCoder.java
deleted file mode 100644
index 398e82a..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/KeyedWorkItemCoder.java
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.IterableCoder;
-import com.google.cloud.dataflow.sdk.coders.StandardCoder;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.TimerInternals.TimerData;
-import com.google.cloud.dataflow.sdk.util.TimerInternals.TimerDataCoder;
-import com.google.cloud.dataflow.sdk.util.WindowedValue.FullWindowedValueCoder;
-import com.google.common.collect.ImmutableList;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.List;
-
-/**
- * A {@link Coder} for {@link KeyedWorkItem KeyedWorkItems}.
- */
-public class KeyedWorkItemCoder<K, ElemT> extends StandardCoder<KeyedWorkItem<K, ElemT>> {
- /**
- * Create a new {@link KeyedWorkItemCoder} with the provided key coder, element coder, and window
- * coder.
- */
- public static <K, ElemT> KeyedWorkItemCoder<K, ElemT> of(
- Coder<K> keyCoder, Coder<ElemT> elemCoder, Coder<? extends BoundedWindow> windowCoder) {
- return new KeyedWorkItemCoder<>(keyCoder, elemCoder, windowCoder);
- }
-
- @JsonCreator
- public static <K, ElemT> KeyedWorkItemCoder<K, ElemT> of(
- @JsonProperty(PropertyNames.COMPONENT_ENCODINGS) List<Coder<?>> components) {
- checkArgument(components.size() == 3, "Expecting 3 components, got %s", components.size());
- @SuppressWarnings("unchecked")
- Coder<K> keyCoder = (Coder<K>) components.get(0);
- @SuppressWarnings("unchecked")
- Coder<ElemT> elemCoder = (Coder<ElemT>) components.get(1);
- @SuppressWarnings("unchecked")
- Coder<? extends BoundedWindow> windowCoder = (Coder<? extends BoundedWindow>) components.get(2);
- return new KeyedWorkItemCoder<>(keyCoder, elemCoder, windowCoder);
- }
-
- private final Coder<K> keyCoder;
- private final Coder<ElemT> elemCoder;
- private final Coder<? extends BoundedWindow> windowCoder;
- private final Coder<Iterable<TimerData>> timersCoder;
- private final Coder<Iterable<WindowedValue<ElemT>>> elemsCoder;
-
- private KeyedWorkItemCoder(
- Coder<K> keyCoder, Coder<ElemT> elemCoder, Coder<? extends BoundedWindow> windowCoder) {
- this.keyCoder = keyCoder;
- this.elemCoder = elemCoder;
- this.windowCoder = windowCoder;
- this.timersCoder = IterableCoder.of(TimerDataCoder.of(windowCoder));
- this.elemsCoder = IterableCoder.of(FullWindowedValueCoder.of(elemCoder, windowCoder));
- }
-
- @Override
- public void encode(KeyedWorkItem<K, ElemT> value, OutputStream outStream, Coder.Context context)
- throws CoderException, IOException {
- Coder.Context nestedContext = context.nested();
- keyCoder.encode(value.key(), outStream, nestedContext);
- timersCoder.encode(value.timersIterable(), outStream, nestedContext);
- elemsCoder.encode(value.elementsIterable(), outStream, nestedContext);
- }
-
- @Override
- public KeyedWorkItem<K, ElemT> decode(InputStream inStream, Coder.Context context)
- throws CoderException, IOException {
- Coder.Context nestedContext = context.nested();
- K key = keyCoder.decode(inStream, nestedContext);
- Iterable<TimerData> timers = timersCoder.decode(inStream, nestedContext);
- Iterable<WindowedValue<ElemT>> elems = elemsCoder.decode(inStream, nestedContext);
- return KeyedWorkItems.workItem(key, timers, elems);
- }
-
- @Override
- public List<? extends Coder<?>> getCoderArguments() {
- return ImmutableList.of(keyCoder, elemCoder, windowCoder);
- }
-
- @Override
- public void verifyDeterministic() throws Coder.NonDeterministicException {
- keyCoder.verifyDeterministic();
- timersCoder.verifyDeterministic();
- elemsCoder.verifyDeterministic();
- }
-
- /**
- * {@inheritDoc}.
- *
- * {@link KeyedWorkItemCoder} is not consistent with equals as it can return a
- * {@link KeyedWorkItem} of a type different from the originally encoded type.
- */
- @Override
- public boolean consistentWithEquals() {
- return false;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/KeyedWorkItems.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/KeyedWorkItems.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/KeyedWorkItems.java
deleted file mode 100644
index 734bd2c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/KeyedWorkItems.java
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.util.TimerInternals.TimerData;
-import com.google.common.base.MoreObjects;
-import com.google.common.collect.Iterables;
-
-import java.util.Collections;
-import java.util.Objects;
-
-/**
- * Static utility methods that provide {@link KeyedWorkItem} implementations.
- */
-public class KeyedWorkItems {
- /**
- * Returns an implementation of {@link KeyedWorkItem} that wraps around an elements iterable.
- *
- * @param <K> the key type
- * @param <ElemT> the element type
- */
- public static <K, ElemT> KeyedWorkItem<K, ElemT> elementsWorkItem(
- K key, Iterable<WindowedValue<ElemT>> elementsIterable) {
- return new ComposedKeyedWorkItem<>(key, Collections.<TimerData>emptyList(), elementsIterable);
- }
-
- /**
- * Returns an implementation of {@link KeyedWorkItem} that wraps around a timers iterable.
- *
- * @param <K> the key type
- * @param <ElemT> the element type
- */
- public static <K, ElemT> KeyedWorkItem<K, ElemT> timersWorkItem(
- K key, Iterable<TimerData> timersIterable) {
- return new ComposedKeyedWorkItem<>(
- key, timersIterable, Collections.<WindowedValue<ElemT>>emptyList());
- }
-
- /**
- * Returns an implementation of {@link KeyedWorkItem} that wraps around
- * a timers iterable and an elements iterable.
- *
- * @param <K> the key type
- * @param <ElemT> the element type
- */
- public static <K, ElemT> KeyedWorkItem<K, ElemT> workItem(
- K key, Iterable<TimerData> timersIterable, Iterable<WindowedValue<ElemT>> elementsIterable) {
- return new ComposedKeyedWorkItem<>(key, timersIterable, elementsIterable);
- }
-
- /**
- * A {@link KeyedWorkItem} composed of an underlying key, {@link TimerData} iterable, and element
- * iterable.
- */
- public static class ComposedKeyedWorkItem<K, ElemT> implements KeyedWorkItem<K, ElemT> {
- private final K key;
- private final Iterable<TimerData> timers;
- private final Iterable<WindowedValue<ElemT>> elements;
-
- private ComposedKeyedWorkItem(
- K key, Iterable<TimerData> timers, Iterable<WindowedValue<ElemT>> elements) {
- this.key = key;
- this.timers = timers;
- this.elements = elements;
- }
-
- @Override
- public K key() {
- return key;
- }
-
- @Override
- public Iterable<TimerData> timersIterable() {
- return timers;
- }
-
- @Override
- public Iterable<WindowedValue<ElemT>> elementsIterable() {
- return elements;
- }
-
- @Override
- public boolean equals(Object other) {
- if (other == null || !(other instanceof ComposedKeyedWorkItem)) {
- return false;
- }
- KeyedWorkItem<?, ?> that = (KeyedWorkItem<?, ?>) other;
- return Objects.equals(this.key, that.key())
- && Iterables.elementsEqual(this.timersIterable(), that.timersIterable())
- && Iterables.elementsEqual(this.elementsIterable(), that.elementsIterable());
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(key, timers, elements);
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(ComposedKeyedWorkItem.class)
- .add("key", key)
- .add("elements", elements)
- .add("timers", timers)
- .toString();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/LateDataDroppingDoFnRunner.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/LateDataDroppingDoFnRunner.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/LateDataDroppingDoFnRunner.java
deleted file mode 100644
index 31927ab..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/LateDataDroppingDoFnRunner.java
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Function;
-import com.google.common.base.Predicate;
-import com.google.common.collect.Iterables;
-
-import org.joda.time.Instant;
-
-/**
- * A customized {@link DoFnRunner} that handles late data dropping for
- * a {@link KeyedWorkItem} input {@link DoFn}.
- *
- * <p>It expands windows before checking data lateness.
- *
- * <p>{@link KeyedWorkItem KeyedWorkItems} are always in empty windows.
- *
- * @param <K> key type
- * @param <InputT> input value element type
- * @param <OutputT> output value element type
- * @param <W> window type
- */
-public class LateDataDroppingDoFnRunner<K, InputT, OutputT, W extends BoundedWindow>
- implements DoFnRunner<KeyedWorkItem<K, InputT>, KV<K, OutputT>> {
- private final DoFnRunner<KeyedWorkItem<K, InputT>, KV<K, OutputT>> doFnRunner;
- private final LateDataFilter lateDataFilter;
-
- public LateDataDroppingDoFnRunner(
- DoFnRunner<KeyedWorkItem<K, InputT>, KV<K, OutputT>> doFnRunner,
- WindowingStrategy<?, ?> windowingStrategy,
- TimerInternals timerInternals,
- Aggregator<Long, Long> droppedDueToLateness) {
- this.doFnRunner = doFnRunner;
- lateDataFilter = new LateDataFilter(windowingStrategy, timerInternals, droppedDueToLateness);
- }
-
- @Override
- public void startBundle() {
- doFnRunner.startBundle();
- }
-
- @Override
- public void processElement(WindowedValue<KeyedWorkItem<K, InputT>> elem) {
- Iterable<WindowedValue<InputT>> nonLateElements = lateDataFilter.filter(
- elem.getValue().key(), elem.getValue().elementsIterable());
- KeyedWorkItem<K, InputT> keyedWorkItem = KeyedWorkItems.workItem(
- elem.getValue().key(), elem.getValue().timersIterable(), nonLateElements);
- doFnRunner.processElement(elem.withValue(keyedWorkItem));
- }
-
- @Override
- public void finishBundle() {
- doFnRunner.finishBundle();
- }
-
- /**
- * It filters late data in a {@link KeyedWorkItem}.
- */
- @VisibleForTesting
- static class LateDataFilter {
- private final WindowingStrategy<?, ?> windowingStrategy;
- private final TimerInternals timerInternals;
- private final Aggregator<Long, Long> droppedDueToLateness;
-
- public LateDataFilter(
- WindowingStrategy<?, ?> windowingStrategy,
- TimerInternals timerInternals,
- Aggregator<Long, Long> droppedDueToLateness) {
- this.windowingStrategy = windowingStrategy;
- this.timerInternals = timerInternals;
- this.droppedDueToLateness = droppedDueToLateness;
- }
-
- /**
- * Returns an {@code Iterable<WindowedValue<InputT>>} that only contains
- * non-late input elements.
- */
- public <K, InputT> Iterable<WindowedValue<InputT>> filter(
- final K key, Iterable<WindowedValue<InputT>> elements) {
- Iterable<Iterable<WindowedValue<InputT>>> windowsExpandedElements = Iterables.transform(
- elements,
- new Function<WindowedValue<InputT>, Iterable<WindowedValue<InputT>>>() {
- @Override
- public Iterable<WindowedValue<InputT>> apply(final WindowedValue<InputT> input) {
- return Iterables.transform(
- input.getWindows(),
- new Function<BoundedWindow, WindowedValue<InputT>>() {
- @Override
- public WindowedValue<InputT> apply(BoundedWindow window) {
- return WindowedValue.of(
- input.getValue(), input.getTimestamp(), window, input.getPane());
- }
- });
- }});
-
- Iterable<WindowedValue<InputT>> nonLateElements = Iterables.filter(
- Iterables.concat(windowsExpandedElements),
- new Predicate<WindowedValue<InputT>>() {
- @Override
- public boolean apply(WindowedValue<InputT> input) {
- BoundedWindow window = Iterables.getOnlyElement(input.getWindows());
- if (canDropDueToExpiredWindow(window)) {
- // The element is too late for this window.
- droppedDueToLateness.addValue(1L);
- WindowTracing.debug(
- "ReduceFnRunner.processElement: Dropping element at {} for key:{}; window:{} "
- + "since too far behind inputWatermark:{}; outputWatermark:{}",
- input.getTimestamp(), key, window, timerInternals.currentInputWatermarkTime(),
- timerInternals.currentOutputWatermarkTime());
- return false;
- } else {
- return true;
- }
- }
- });
- return nonLateElements;
- }
-
- /** Is {@code window} expired w.r.t. the garbage collection watermark? */
- private boolean canDropDueToExpiredWindow(BoundedWindow window) {
- Instant inputWM = timerInternals.currentInputWatermarkTime();
- return inputWM != null
- && window.maxTimestamp().plus(windowingStrategy.getAllowedLateness()).isBefore(inputWM);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MapAggregatorValues.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MapAggregatorValues.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MapAggregatorValues.java
deleted file mode 100644
index a4d8ffd..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/MapAggregatorValues.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.runners.AggregatorValues;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.common.base.MoreObjects;
-
-import java.util.Map;
-
-/**
- * An {@link AggregatorValues} implementation that is backed by an in-memory map.
- *
- * @param <T> the output type of the {@link Aggregator}
- */
-public class MapAggregatorValues<T> extends AggregatorValues<T> {
- private final Map<String, T> stepValues;
-
- public MapAggregatorValues(Map<String, T> stepValues) {
- this.stepValues = stepValues;
- }
-
- @Override
- public Map<String, T> getValuesAtSteps() {
- return stepValues;
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(MapAggregatorValues.class)
- .add("stepValues", stepValues)
- .toString();
- }
-}
[17/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterWatermark.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterWatermark.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterWatermark.java
deleted file mode 100644
index fac2c28..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterWatermark.java
+++ /dev/null
@@ -1,397 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Trigger.OnceTrigger;
-import com.google.cloud.dataflow.sdk.util.ExecutableTrigger;
-import com.google.cloud.dataflow.sdk.util.TimeDomain;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.ImmutableList;
-
-import org.joda.time.Instant;
-
-import java.util.List;
-import java.util.Objects;
-
-/**
- * <p>{@code AfterWatermark} triggers fire based on progress of the system watermark. This time is a
- * lower-bound, sometimes heuristically established, on event times that have been fully processed
- * by the pipeline.
- *
- * <p>For sources that provide non-heuristic watermarks (e.g.
- * {@link com.google.cloud.dataflow.sdk.io.PubsubIO} when using arrival times as event times), the
- * watermark is a strict guarantee that no data with an event time earlier than
- * that watermark will ever be observed in the pipeline. In this case, it's safe to assume that any
- * pane triggered by an {@code AfterWatermark} trigger with a reference point at or beyond the end
- * of the window will be the last pane ever for that window.
- *
- * <p>For sources that provide heuristic watermarks (e.g.
- * {@link com.google.cloud.dataflow.sdk.io.PubsubIO} when using user-supplied event times), the
- * watermark itself becomes an <i>estimate</i> that no data with an event time earlier than that
- * watermark (i.e. "late data") will ever be observed in the pipeline. These heuristics can
- * often be quite accurate, but the chance of seeing late data for any given window is non-zero.
- * Thus, if absolute correctness over time is important to your use case, you may want to consider
- * using a trigger that accounts for late data. The default trigger,
- * {@code Repeatedly.forever(AfterWatermark.pastEndOfWindow())}, which fires
- * once when the watermark passes the end of the window and then immediately thereafter when any
- * late data arrives, is one such example.
- *
- * <p>The watermark is the clock that defines {@link TimeDomain#EVENT_TIME}.
- *
- * Additional firings before or after the watermark can be requested by calling
- * {@code AfterWatermark.pastEndOfWindow().withEarlyFirings(OnceTrigger)} or
- * {@code AfterWatermark.pastEndOfWindow().withLateFirings(OnceTrigger)}.
- *
- * @param <W> {@link BoundedWindow} subclass used to represent the windows used.
- */
-@Experimental(Experimental.Kind.TRIGGER)
-public class AfterWatermark<W extends BoundedWindow> {
-
- // Static factory class.
- private AfterWatermark() {}
-
- /**
- * Creates a trigger that fires when the watermark passes the end of the window.
- */
- public static <W extends BoundedWindow> FromEndOfWindow<W> pastEndOfWindow() {
- return new FromEndOfWindow<W>();
- }
-
- /**
- * Interface for building an AfterWatermarkTrigger with early firings already filled in.
- */
- public interface AfterWatermarkEarly<W extends BoundedWindow> extends TriggerBuilder<W> {
- /**
- * Creates a new {@code Trigger} like this one, except that it fires repeatedly whenever
- * the given {@code Trigger} fires after the watermark has passed the end of the window.
- */
- TriggerBuilder<W> withLateFirings(OnceTrigger<W> lateTrigger);
- }
-
- /**
- * Interface for building an AfterWatermarkTrigger with late firings already filled in.
- */
- public interface AfterWatermarkLate<W extends BoundedWindow> extends TriggerBuilder<W> {
- /**
- * Creates a new {@code Trigger} like this one, except that it fires repeatedly whenever
- * the given {@code Trigger} fires before the watermark has passed the end of the window.
- */
- TriggerBuilder<W> withEarlyFirings(OnceTrigger<W> earlyTrigger);
- }
-
- /**
- * A trigger which never fires. Used for the "early" trigger when only a late trigger was
- * specified.
- */
- private static class NeverTrigger<W extends BoundedWindow> extends OnceTrigger<W> {
-
- protected NeverTrigger() {
- super(null);
- }
-
- @Override
- public void onElement(OnElementContext c) throws Exception { }
-
- @Override
- public void onMerge(OnMergeContext c) throws Exception { }
-
- @Override
- protected Trigger<W> getContinuationTrigger(List<Trigger<W>> continuationTriggers) {
- return this;
- }
-
- @Override
- public Instant getWatermarkThatGuaranteesFiring(W window) {
- return BoundedWindow.TIMESTAMP_MAX_VALUE;
- }
-
- @Override
- public boolean shouldFire(Trigger<W>.TriggerContext context) throws Exception {
- return false;
- }
-
- @Override
- protected void onOnlyFiring(Trigger<W>.TriggerContext context) throws Exception {
- throw new UnsupportedOperationException(
- String.format("%s should never fire", getClass().getSimpleName()));
- }
- }
-
- private static class AfterWatermarkEarlyAndLate<W extends BoundedWindow>
- extends Trigger<W>
- implements TriggerBuilder<W>, AfterWatermarkEarly<W>, AfterWatermarkLate<W> {
-
- private static final int EARLY_INDEX = 0;
- private static final int LATE_INDEX = 1;
-
- private final OnceTrigger<W> earlyTrigger;
- private final OnceTrigger<W> lateTrigger;
-
- @SuppressWarnings("unchecked")
- private AfterWatermarkEarlyAndLate(OnceTrigger<W> earlyTrigger, OnceTrigger<W> lateTrigger) {
- super(lateTrigger == null
- ? ImmutableList.<Trigger<W>>of(earlyTrigger)
- : ImmutableList.<Trigger<W>>of(earlyTrigger, lateTrigger));
- this.earlyTrigger = checkNotNull(earlyTrigger, "earlyTrigger should not be null");
- this.lateTrigger = lateTrigger;
- }
-
- @Override
- public TriggerBuilder<W> withEarlyFirings(OnceTrigger<W> earlyTrigger) {
- return new AfterWatermarkEarlyAndLate<W>(earlyTrigger, lateTrigger);
- }
-
- @Override
- public TriggerBuilder<W> withLateFirings(OnceTrigger<W> lateTrigger) {
- return new AfterWatermarkEarlyAndLate<W>(earlyTrigger, lateTrigger);
- }
-
- @Override
- public void onElement(OnElementContext c) throws Exception {
- if (!c.trigger().isMerging()) {
- // If merges can never happen, we just run the unfinished subtrigger
- c.trigger().firstUnfinishedSubTrigger().invokeOnElement(c);
- } else {
- // If merges can happen, we run for all subtriggers because they might be
- // de-activated or re-activated
- for (ExecutableTrigger<W> subTrigger : c.trigger().subTriggers()) {
- subTrigger.invokeOnElement(c);
- }
- }
- }
-
- @Override
- public void onMerge(OnMergeContext c) throws Exception {
- // NOTE that the ReduceFnRunner will delete all end-of-window timers for the
- // merged-away windows.
-
- ExecutableTrigger<W> earlySubtrigger = c.trigger().subTrigger(EARLY_INDEX);
- // We check the early trigger to determine if we are still processing it or
- // if the end of window has transitioned us to the late trigger
- OnMergeContext earlyContext = c.forTrigger(earlySubtrigger);
-
- // If the early trigger is still active in any merging window then it is still active in
- // the new merged window, because even if the merged window is "done" some pending elements
- // haven't had a chance to fire.
- if (!earlyContext.trigger().finishedInAllMergingWindows() || !endOfWindowReached(c)) {
- earlyContext.trigger().setFinished(false);
- if (lateTrigger != null) {
- ExecutableTrigger<W> lateSubtrigger = c.trigger().subTrigger(LATE_INDEX);
- OnMergeContext lateContext = c.forTrigger(lateSubtrigger);
- lateContext.trigger().setFinished(false);
- lateSubtrigger.invokeClear(lateContext);
- }
- } else {
- // Otherwise the early trigger and end-of-window bit is done for good.
- earlyContext.trigger().setFinished(true);
- if (lateTrigger != null) {
- c.trigger().subTrigger(LATE_INDEX).invokeOnMerge(c);
- }
- }
- }
-
- @Override
- public Trigger<W> getContinuationTrigger() {
- return new AfterWatermarkEarlyAndLate<W>(
- earlyTrigger.getContinuationTrigger(),
- lateTrigger == null ? null : lateTrigger.getContinuationTrigger());
- }
-
- @Override
- protected Trigger<W> getContinuationTrigger(List<Trigger<W>> continuationTriggers) {
- throw new UnsupportedOperationException(
- "Should not call getContinuationTrigger(List<Trigger<W>>)");
- }
-
- @Override
- public Instant getWatermarkThatGuaranteesFiring(W window) {
- // Even without an early or late trigger, we'll still produce a firing at the watermark.
- return window.maxTimestamp();
- }
-
- private boolean endOfWindowReached(Trigger<W>.TriggerContext context) {
- return context.currentEventTime() != null
- && context.currentEventTime().isAfter(context.window().maxTimestamp());
- }
-
- @Override
- public boolean shouldFire(Trigger<W>.TriggerContext context) throws Exception {
- if (!context.trigger().isFinished(EARLY_INDEX)) {
- // We have not yet transitioned to late firings.
- // We should fire if either the trigger is ready or we reach the end of the window.
- return context.trigger().subTrigger(EARLY_INDEX).invokeShouldFire(context)
- || endOfWindowReached(context);
- } else if (lateTrigger == null) {
- return false;
- } else {
- // We are running the late trigger
- return context.trigger().subTrigger(LATE_INDEX).invokeShouldFire(context);
- }
- }
-
- @Override
- public void onFire(Trigger<W>.TriggerContext context) throws Exception {
- if (!context.forTrigger(context.trigger().subTrigger(EARLY_INDEX)).trigger().isFinished()) {
- onNonLateFiring(context);
- } else if (lateTrigger != null) {
- onLateFiring(context);
- } else {
- // all done
- context.trigger().setFinished(true);
- }
- }
-
- private void onNonLateFiring(Trigger<W>.TriggerContext context) throws Exception {
- // We have not yet transitioned to late firings.
- ExecutableTrigger<W> earlySubtrigger = context.trigger().subTrigger(EARLY_INDEX);
- Trigger<W>.TriggerContext earlyContext = context.forTrigger(earlySubtrigger);
-
- if (!endOfWindowReached(context)) {
- // This is an early firing, since we have not arrived at the end of the window
- // Implicitly repeats
- earlySubtrigger.invokeOnFire(context);
- earlySubtrigger.invokeClear(context);
- earlyContext.trigger().setFinished(false);
- } else {
- // We have arrived at the end of the window; terminate the early trigger
- // and clear out the late trigger's state
- if (earlySubtrigger.invokeShouldFire(context)) {
- earlySubtrigger.invokeOnFire(context);
- }
- earlyContext.trigger().setFinished(true);
- earlySubtrigger.invokeClear(context);
-
- if (lateTrigger == null) {
- // Done if there is no late trigger.
- context.trigger().setFinished(true);
- } else {
- // If there is a late trigger, we transition to it, and need to clear its state
- // because it was run in parallel.
- context.trigger().subTrigger(LATE_INDEX).invokeClear(context);
- }
- }
-
- }
-
- private void onLateFiring(Trigger<W>.TriggerContext context) throws Exception {
- // We are firing the late trigger, with implicit repeat
- ExecutableTrigger<W> lateSubtrigger = context.trigger().subTrigger(LATE_INDEX);
- lateSubtrigger.invokeOnFire(context);
- // It is a OnceTrigger, so it must have finished; unfinished it and clear it
- lateSubtrigger.invokeClear(context);
- context.forTrigger(lateSubtrigger).trigger().setFinished(false);
- }
- }
-
- /**
- * A watermark trigger targeted relative to the end of the window.
- */
- public static class FromEndOfWindow<W extends BoundedWindow> extends OnceTrigger<W> {
-
- private FromEndOfWindow() {
- super(null);
- }
-
- /**
- * Creates a new {@code Trigger} like this one, except that it fires repeatedly whenever
- * the given {@code Trigger} fires before the watermark has passed the end of the window.
- */
- public AfterWatermarkEarly<W> withEarlyFirings(OnceTrigger<W> earlyFirings) {
- Preconditions.checkNotNull(earlyFirings,
- "Must specify the trigger to use for early firings");
- return new AfterWatermarkEarlyAndLate<W>(earlyFirings, null);
- }
-
- /**
- * Creates a new {@code Trigger} like this one, except that it fires repeatedly whenever
- * the given {@code Trigger} fires after the watermark has passed the end of the window.
- */
- public AfterWatermarkLate<W> withLateFirings(OnceTrigger<W> lateFirings) {
- Preconditions.checkNotNull(lateFirings,
- "Must specify the trigger to use for late firings");
- return new AfterWatermarkEarlyAndLate<W>(new NeverTrigger<W>(), lateFirings);
- }
-
- @Override
- public void onElement(OnElementContext c) throws Exception {
- // We're interested in knowing when the input watermark passes the end of the window.
- // (It is possible this has already happened, in which case the timer will be fired
- // almost immediately).
- c.setTimer(c.window().maxTimestamp(), TimeDomain.EVENT_TIME);
- }
-
- @Override
- public void onMerge(OnMergeContext c) throws Exception {
- // NOTE that the ReduceFnRunner will delete all end-of-window timers for the
- // merged-away windows.
-
- if (!c.trigger().finishedInAllMergingWindows()) {
- // If the trigger is still active in any merging window then it is still active in the new
- // merged window, because even if the merged window is "done" some pending elements haven't
- // had a chance to fire
- c.trigger().setFinished(false);
- } else if (!endOfWindowReached(c)) {
- // If the end of the new window has not been reached, then the trigger is active again.
- c.trigger().setFinished(false);
- } else {
- // Otherwise it is done for good
- c.trigger().setFinished(true);
- }
- }
-
- @Override
- public Instant getWatermarkThatGuaranteesFiring(W window) {
- return window.maxTimestamp();
- }
-
- @Override
- public FromEndOfWindow<W> getContinuationTrigger(List<Trigger<W>> continuationTriggers) {
- return this;
- }
-
- @Override
- public String toString() {
- return "AfterWatermark.pastEndOfWindow()";
- }
-
- @Override
- public boolean equals(Object obj) {
- return obj instanceof FromEndOfWindow;
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(getClass());
- }
-
- @Override
- public boolean shouldFire(Trigger<W>.TriggerContext context) throws Exception {
- return endOfWindowReached(context);
- }
-
- private boolean endOfWindowReached(Trigger<W>.TriggerContext context) {
- return context.currentEventTime() != null
- && context.currentEventTime().isAfter(context.window().maxTimestamp());
- }
-
- @Override
- protected void onOnlyFiring(Trigger<W>.TriggerContext context) throws Exception { }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/BoundedWindow.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/BoundedWindow.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/BoundedWindow.java
deleted file mode 100644
index 0afd8e3..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/BoundedWindow.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import org.joda.time.Instant;
-
-import java.util.concurrent.TimeUnit;
-
-/**
- * A {@code BoundedWindow} represents a finite grouping of elements, with an
- * upper bound (larger timestamps represent more recent data) on the timestamps
- * of elements that can be placed in the window. This finiteness means that for
- * every window, at some point in time, all data for that window will have
- * arrived and can be processed together.
- *
- * <p>Windows must also implement {@link Object#equals} and
- * {@link Object#hashCode} such that windows that are logically equal will
- * be treated as equal by {@code equals()} and {@code hashCode()}.
- */
-public abstract class BoundedWindow {
- // The min and max timestamps that won't overflow when they are converted to
- // usec.
- public static final Instant TIMESTAMP_MIN_VALUE =
- new Instant(TimeUnit.MICROSECONDS.toMillis(Long.MIN_VALUE));
- public static final Instant TIMESTAMP_MAX_VALUE =
- new Instant(TimeUnit.MICROSECONDS.toMillis(Long.MAX_VALUE));
-
- /**
- * Returns the inclusive upper bound of timestamps for values in this window.
- */
- public abstract Instant maxTimestamp();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/CalendarWindows.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/CalendarWindows.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/CalendarWindows.java
deleted file mode 100644
index de5140f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/CalendarWindows.java
+++ /dev/null
@@ -1,348 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-
-import org.joda.time.DateTime;
-import org.joda.time.DateTimeZone;
-import org.joda.time.Days;
-import org.joda.time.Instant;
-import org.joda.time.Months;
-import org.joda.time.Years;
-
-/**
- * A collection of {@link WindowFn}s that windows values into calendar-based
- * windows such as spans of days, months, or years.
- *
- * <p>For example, to group data into quarters that change on the 15th, use
- * {@code CalendarWindows.months(3).withStartingMonth(2014, 1).beginningOnDay(15)}.
- */
-public class CalendarWindows {
-
- /**
- * Returns a {@link WindowFn} that windows elements into periods measured by days.
- *
- * <p>For example, {@code CalendarWindows.days(1)} will window elements into
- * separate windows for each day.
- */
- public static DaysWindows days(int number) {
- return new DaysWindows(number, new DateTime(0, DateTimeZone.UTC), DateTimeZone.UTC);
- }
-
- /**
- * Returns a {@link WindowFn} that windows elements into periods measured by weeks.
- *
- * <p>For example, {@code CalendarWindows.weeks(1, DateTimeConstants.TUESDAY)} will
- * window elements into week-long windows starting on Tuesdays.
- */
- public static DaysWindows weeks(int number, int startDayOfWeek) {
- return new DaysWindows(
- 7 * number,
- new DateTime(0, DateTimeZone.UTC).withDayOfWeek(startDayOfWeek),
- DateTimeZone.UTC);
- }
-
- /**
- * Returns a {@link WindowFn} that windows elements into periods measured by months.
- *
- * <p>For example,
- * {@code CalendarWindows.months(8).withStartingMonth(2014, 1).beginningOnDay(10)}
- * will window elements into 8 month windows where that start on the 10th day of month,
- * and the first window begins in January 2014.
- */
- public static MonthsWindows months(int number) {
- return new MonthsWindows(number, 1, new DateTime(0, DateTimeZone.UTC), DateTimeZone.UTC);
- }
-
- /**
- * Returns a {@link WindowFn} that windows elements into periods measured by years.
- *
- * <p>For example,
- * {@code CalendarWindows.years(1).withTimeZone(DateTimeZone.forId("America/Los_Angeles"))}
- * will window elements into year-long windows that start at midnight on Jan 1, in the
- * America/Los_Angeles time zone.
- */
- public static YearsWindows years(int number) {
- return new YearsWindows(number, 1, 1, new DateTime(0, DateTimeZone.UTC), DateTimeZone.UTC);
- }
-
- /**
- * A {@link WindowFn} that windows elements into periods measured by days.
- *
- * <p>By default, periods of multiple days are measured starting at the
- * epoch. This can be overridden with {@link #withStartingDay}.
- *
- * <p>The time zone used to determine calendar boundaries is UTC, unless this
- * is overridden with the {@link #withTimeZone} method.
- */
- public static class DaysWindows extends PartitioningWindowFn<Object, IntervalWindow> {
- public DaysWindows withStartingDay(int year, int month, int day) {
- return new DaysWindows(
- number, new DateTime(year, month, day, 0, 0, timeZone), timeZone);
- }
-
- public DaysWindows withTimeZone(DateTimeZone timeZone) {
- return new DaysWindows(
- number, startDate.withZoneRetainFields(timeZone), timeZone);
- }
-
- ////////////////////////////////////////////////////////////////////////////
-
- private int number;
- private DateTime startDate;
- private DateTimeZone timeZone;
-
- private DaysWindows(int number, DateTime startDate, DateTimeZone timeZone) {
- this.number = number;
- this.startDate = startDate;
- this.timeZone = timeZone;
- }
-
- @Override
- public IntervalWindow assignWindow(Instant timestamp) {
- DateTime datetime = new DateTime(timestamp, timeZone);
-
- int dayOffset = Days.daysBetween(startDate, datetime).getDays() / number * number;
-
- DateTime begin = startDate.plusDays(dayOffset);
- DateTime end = begin.plusDays(number);
-
- return new IntervalWindow(begin.toInstant(), end.toInstant());
- }
-
- @Override
- public Coder<IntervalWindow> windowCoder() {
- return IntervalWindow.getCoder();
- }
-
- @Override
- public boolean isCompatible(WindowFn<?, ?> other) {
- if (!(other instanceof DaysWindows)) {
- return false;
- }
- DaysWindows that = (DaysWindows) other;
- return number == that.number
- && startDate == that.startDate
- && timeZone == that.timeZone;
- }
-
- public int getNumber() {
- return number;
- }
-
- public DateTime getStartDate() {
- return startDate;
- }
-
- public DateTimeZone getTimeZone() {
- return timeZone;
- }
-
- }
-
- /**
- * A {@link WindowFn} that windows elements into periods measured by months.
- *
- * <p>By default, periods of multiple months are measured starting at the
- * epoch. This can be overridden with {@link #withStartingMonth}.
- *
- * <p>Months start on the first day of each calendar month, unless overridden by
- * {@link #beginningOnDay}.
- *
- * <p>The time zone used to determine calendar boundaries is UTC, unless this
- * is overridden with the {@link #withTimeZone} method.
- */
- public static class MonthsWindows extends PartitioningWindowFn<Object, IntervalWindow> {
- public MonthsWindows beginningOnDay(int dayOfMonth) {
- return new MonthsWindows(
- number, dayOfMonth, startDate, timeZone);
- }
-
- public MonthsWindows withStartingMonth(int year, int month) {
- return new MonthsWindows(
- number, dayOfMonth, new DateTime(year, month, 1, 0, 0, timeZone), timeZone);
- }
-
- public MonthsWindows withTimeZone(DateTimeZone timeZone) {
- return new MonthsWindows(
- number, dayOfMonth, startDate.withZoneRetainFields(timeZone), timeZone);
- }
-
- ////////////////////////////////////////////////////////////////////////////
-
- private int number;
- private int dayOfMonth;
- private DateTime startDate;
- private DateTimeZone timeZone;
-
- private MonthsWindows(int number, int dayOfMonth, DateTime startDate, DateTimeZone timeZone) {
- this.number = number;
- this.dayOfMonth = dayOfMonth;
- this.startDate = startDate;
- this.timeZone = timeZone;
- }
-
- @Override
- public IntervalWindow assignWindow(Instant timestamp) {
- DateTime datetime = new DateTime(timestamp, timeZone);
-
- int monthOffset =
- Months.monthsBetween(startDate.withDayOfMonth(dayOfMonth), datetime).getMonths()
- / number * number;
-
- DateTime begin = startDate.withDayOfMonth(dayOfMonth).plusMonths(monthOffset);
- DateTime end = begin.plusMonths(number);
-
- return new IntervalWindow(begin.toInstant(), end.toInstant());
- }
-
- @Override
- public Coder<IntervalWindow> windowCoder() {
- return IntervalWindow.getCoder();
- }
-
- @Override
- public boolean isCompatible(WindowFn<?, ?> other) {
- if (!(other instanceof MonthsWindows)) {
- return false;
- }
- MonthsWindows that = (MonthsWindows) other;
- return number == that.number
- && dayOfMonth == that.dayOfMonth
- && startDate == that.startDate
- && timeZone == that.timeZone;
- }
-
- public int getNumber() {
- return number;
- }
-
- public int getDayOfMonth() {
- return dayOfMonth;
- }
-
- public DateTime getStartDate() {
- return startDate;
- }
-
- public DateTimeZone getTimeZone() {
- return timeZone;
- }
-
- }
-
- /**
- * A {@link WindowFn} that windows elements into periods measured by years.
- *
- * <p>By default, periods of multiple years are measured starting at the
- * epoch. This can be overridden with {@link #withStartingYear}.
- *
- * <p>Years start on the first day of each calendar year, unless overridden by
- * {@link #beginningOnDay}.
- *
- * <p>The time zone used to determine calendar boundaries is UTC, unless this
- * is overridden with the {@link #withTimeZone} method.
- */
- public static class YearsWindows extends PartitioningWindowFn<Object, IntervalWindow> {
- public YearsWindows beginningOnDay(int monthOfYear, int dayOfMonth) {
- return new YearsWindows(
- number, monthOfYear, dayOfMonth, startDate, timeZone);
- }
-
- public YearsWindows withStartingYear(int year) {
- return new YearsWindows(
- number, monthOfYear, dayOfMonth, new DateTime(year, 1, 1, 0, 0, timeZone), timeZone);
- }
-
- public YearsWindows withTimeZone(DateTimeZone timeZone) {
- return new YearsWindows(
- number, monthOfYear, dayOfMonth, startDate.withZoneRetainFields(timeZone), timeZone);
- }
-
- ////////////////////////////////////////////////////////////////////////////
-
- private int number;
- private int monthOfYear;
- private int dayOfMonth;
- private DateTime startDate;
- private DateTimeZone timeZone;
-
- private YearsWindows(
- int number, int monthOfYear, int dayOfMonth, DateTime startDate, DateTimeZone timeZone) {
- this.number = number;
- this.monthOfYear = monthOfYear;
- this.dayOfMonth = dayOfMonth;
- this.startDate = startDate;
- this.timeZone = timeZone;
- }
-
- @Override
- public IntervalWindow assignWindow(Instant timestamp) {
- DateTime datetime = new DateTime(timestamp, timeZone);
-
- DateTime offsetStart = startDate.withMonthOfYear(monthOfYear).withDayOfMonth(dayOfMonth);
-
- int yearOffset =
- Years.yearsBetween(offsetStart, datetime).getYears() / number * number;
-
- DateTime begin = offsetStart.plusYears(yearOffset);
- DateTime end = begin.plusYears(number);
-
- return new IntervalWindow(begin.toInstant(), end.toInstant());
- }
-
- @Override
- public Coder<IntervalWindow> windowCoder() {
- return IntervalWindow.getCoder();
- }
-
- @Override
- public boolean isCompatible(WindowFn<?, ?> other) {
- if (!(other instanceof YearsWindows)) {
- return false;
- }
- YearsWindows that = (YearsWindows) other;
- return number == that.number
- && monthOfYear == that.monthOfYear
- && dayOfMonth == that.dayOfMonth
- && startDate == that.startDate
- && timeZone == that.timeZone;
- }
-
- public DateTimeZone getTimeZone() {
- return timeZone;
- }
-
- public DateTime getStartDate() {
- return startDate;
- }
-
- public int getDayOfMonth() {
- return dayOfMonth;
- }
-
- public int getMonthOfYear() {
- return monthOfYear;
- }
-
- public int getNumber() {
- return number;
- }
-
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/DefaultTrigger.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/DefaultTrigger.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/DefaultTrigger.java
deleted file mode 100644
index 9ac4abd..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/DefaultTrigger.java
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.util.TimeDomain;
-
-import org.joda.time.Instant;
-
-import java.util.List;
-
-/**
- * A trigger that is equivalent to {@code Repeatedly.forever(AfterWatermark.pastEndOfWindow())}.
- * See {@link Repeatedly#forever} and {@link AfterWatermark#pastEndOfWindow} for more details.
- *
- * @param <W> The type of windows being triggered/encoded.
- */
-@Experimental(Experimental.Kind.TRIGGER)
-public class DefaultTrigger<W extends BoundedWindow> extends Trigger<W>{
-
- private DefaultTrigger() {
- super(null);
- }
-
- /**
- * Returns the default trigger.
- */
- public static <W extends BoundedWindow> DefaultTrigger<W> of() {
- return new DefaultTrigger<W>();
- }
-
- @Override
- public void onElement(OnElementContext c) throws Exception {
- // If the end of the window has already been reached, then we are already ready to fire
- // and do not need to set a wake-up timer.
- if (!endOfWindowReached(c)) {
- c.setTimer(c.window().maxTimestamp(), TimeDomain.EVENT_TIME);
- }
- }
-
- @Override
- public void onMerge(OnMergeContext c) throws Exception {
- // If the end of the window has already been reached, then we are already ready to fire
- // and do not need to set a wake-up timer.
- if (!endOfWindowReached(c)) {
- c.setTimer(c.window().maxTimestamp(), TimeDomain.EVENT_TIME);
- }
- }
-
- @Override
- public void clear(TriggerContext c) throws Exception { }
-
- @Override
- public Instant getWatermarkThatGuaranteesFiring(W window) {
- return window.maxTimestamp();
- }
-
- @Override
- public boolean isCompatible(Trigger<?> other) {
- // Semantically, all default triggers are identical
- return other instanceof DefaultTrigger;
- }
-
- @Override
- public Trigger<W> getContinuationTrigger(List<Trigger<W>> continuationTriggers) {
- return this;
- }
-
- @Override
- public boolean shouldFire(Trigger<W>.TriggerContext context) throws Exception {
- return endOfWindowReached(context);
- }
-
- private boolean endOfWindowReached(Trigger<W>.TriggerContext context) {
- return context.currentEventTime() != null
- && context.currentEventTime().isAfter(context.window().maxTimestamp());
- }
-
- @Override
- public void onFire(Trigger<W>.TriggerContext context) throws Exception { }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/FixedWindows.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/FixedWindows.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/FixedWindows.java
deleted file mode 100644
index 12a0f1b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/FixedWindows.java
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-
-import java.util.Objects;
-
-/**
- * A {@link WindowFn} that windows values into fixed-size timestamp-based windows.
- *
- * <p>For example, in order to partition the data into 10 minute windows:
- * <pre> {@code
- * PCollection<Integer> items = ...;
- * PCollection<Integer> windowedItems = items.apply(
- * Window.<Integer>into(FixedWindows.of(Duration.standardMinutes(10))));
- * } </pre>
- */
-public class FixedWindows extends PartitioningWindowFn<Object, IntervalWindow> {
-
- /**
- * Size of this window.
- */
- private final Duration size;
-
- /**
- * Offset of this window. Windows start at time
- * N * size + offset, where 0 is the epoch.
- */
- private final Duration offset;
-
- /**
- * Partitions the timestamp space into half-open intervals of the form
- * [N * size, (N + 1) * size), where 0 is the epoch.
- */
- public static FixedWindows of(Duration size) {
- return new FixedWindows(size, Duration.ZERO);
- }
-
- /**
- * Partitions the timestamp space into half-open intervals of the form
- * [N * size + offset, (N + 1) * size + offset),
- * where 0 is the epoch.
- *
- * @throws IllegalArgumentException if offset is not in [0, size)
- */
- public FixedWindows withOffset(Duration offset) {
- return new FixedWindows(size, offset);
- }
-
- private FixedWindows(Duration size, Duration offset) {
- if (offset.isShorterThan(Duration.ZERO) || !offset.isShorterThan(size)) {
- throw new IllegalArgumentException(
- "FixedWindows WindowingStrategies must have 0 <= offset < size");
- }
- this.size = size;
- this.offset = offset;
- }
-
- @Override
- public IntervalWindow assignWindow(Instant timestamp) {
- long start = timestamp.getMillis()
- - timestamp.plus(size).minus(offset).getMillis() % size.getMillis();
- return new IntervalWindow(new Instant(start), size);
- }
-
- @Override
- public Coder<IntervalWindow> windowCoder() {
- return IntervalWindow.getCoder();
- }
-
- @Override
- public boolean isCompatible(WindowFn<?, ?> other) {
- return this.equals(other);
- }
-
- public Duration getSize() {
- return size;
- }
-
- public Duration getOffset() {
- return offset;
- }
-
- @Override
- public boolean equals(Object object) {
- if (!(object instanceof FixedWindows)) {
- return false;
- }
- FixedWindows other = (FixedWindows) object;
- return getOffset().equals(other.getOffset())
- && getSize().equals(other.getSize());
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(size, offset);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/GlobalWindow.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/GlobalWindow.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/GlobalWindow.java
deleted file mode 100644
index d7fc396..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/GlobalWindow.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.coders.AtomicCoder;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- * The default window into which all data is placed (via {@link GlobalWindows}).
- */
-public class GlobalWindow extends BoundedWindow {
- /**
- * Singleton instance of {@link GlobalWindow}.
- */
- public static final GlobalWindow INSTANCE = new GlobalWindow();
-
- // Triggers use maxTimestamp to set timers' timestamp. Timers fires when
- // the watermark passes their timestamps. So, the maxTimestamp needs to be
- // smaller than the TIMESTAMP_MAX_VALUE.
- // One standard day is subtracted from TIMESTAMP_MAX_VALUE to make sure
- // the maxTimestamp is smaller than TIMESTAMP_MAX_VALUE even after rounding up
- // to seconds or minutes.
- private static final Instant END_OF_GLOBAL_WINDOW =
- TIMESTAMP_MAX_VALUE.minus(Duration.standardDays(1));
-
- @Override
- public Instant maxTimestamp() {
- return END_OF_GLOBAL_WINDOW;
- }
-
- private GlobalWindow() {}
-
- /**
- * {@link Coder} for encoding and decoding {@code GlobalWindow}s.
- */
- public static class Coder extends AtomicCoder<GlobalWindow> {
- public static final Coder INSTANCE = new Coder();
-
- @Override
- public void encode(GlobalWindow window, OutputStream outStream, Context context) {}
-
- @Override
- public GlobalWindow decode(InputStream inStream, Context context) {
- return GlobalWindow.INSTANCE;
- }
-
- private Coder() {}
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/GlobalWindows.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/GlobalWindows.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/GlobalWindows.java
deleted file mode 100644
index d3d949c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/GlobalWindows.java
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-
-import org.joda.time.Instant;
-
-import java.util.Collection;
-import java.util.Collections;
-
-/**
- * Default {@link WindowFn} that assigns all data to the same window.
- */
-public class GlobalWindows extends NonMergingWindowFn<Object, GlobalWindow> {
-
- private static final Collection<GlobalWindow> GLOBAL_WINDOWS =
- Collections.singletonList(GlobalWindow.INSTANCE);
-
- @Override
- public Collection<GlobalWindow> assignWindows(AssignContext c) {
- return GLOBAL_WINDOWS;
- }
-
- @Override
- public boolean isCompatible(WindowFn<?, ?> o) {
- return o instanceof GlobalWindows;
- }
-
- @Override
- public Coder<GlobalWindow> windowCoder() {
- return GlobalWindow.Coder.INSTANCE;
- }
-
- @Override
- public GlobalWindow getSideInputWindow(BoundedWindow window) {
- return GlobalWindow.INSTANCE;
- }
-
- @Override
- public boolean assignsToSingleWindow() {
- return true;
- }
-
- @Override
- public Instant getOutputTime(Instant inputTimestamp, GlobalWindow window) {
- return inputTimestamp;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/IntervalWindow.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/IntervalWindow.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/IntervalWindow.java
deleted file mode 100644
index 58287c7..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/IntervalWindow.java
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.coders.AtomicCoder;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.DurationCoder;
-import com.google.cloud.dataflow.sdk.coders.InstantCoder;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-import org.joda.time.ReadableDuration;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- * An implementation of {@link BoundedWindow} that represents an interval from
- * {@link #start} (inclusive) to {@link #end} (exclusive).
- */
-public class IntervalWindow extends BoundedWindow
- implements Comparable<IntervalWindow> {
- /**
- * Start of the interval, inclusive.
- */
- private final Instant start;
-
- /**
- * End of the interval, exclusive.
- */
- private final Instant end;
-
- /**
- * Creates a new IntervalWindow that represents the half-open time
- * interval [start, end).
- */
- public IntervalWindow(Instant start, Instant end) {
- this.start = start;
- this.end = end;
- }
-
- public IntervalWindow(Instant start, ReadableDuration size) {
- this.start = start;
- this.end = start.plus(size);
- }
-
- /**
- * Returns the start of this window, inclusive.
- */
- public Instant start() {
- return start;
- }
-
- /**
- * Returns the end of this window, exclusive.
- */
- public Instant end() {
- return end;
- }
-
- /**
- * Returns the largest timestamp that can be included in this window.
- */
- @Override
- public Instant maxTimestamp() {
- // end not inclusive
- return end.minus(1);
- }
-
- /**
- * Returns whether this window contains the given window.
- */
- public boolean contains(IntervalWindow other) {
- return !this.start.isAfter(other.start) && !this.end.isBefore(other.end);
- }
-
- /**
- * Returns whether this window is disjoint from the given window.
- */
- public boolean isDisjoint(IntervalWindow other) {
- return !this.end.isAfter(other.start) || !other.end.isAfter(this.start);
- }
-
- /**
- * Returns whether this window intersects the given window.
- */
- public boolean intersects(IntervalWindow other) {
- return !isDisjoint(other);
- }
-
- /**
- * Returns the minimal window that includes both this window and
- * the given window.
- */
- public IntervalWindow span(IntervalWindow other) {
- return new IntervalWindow(
- new Instant(Math.min(start.getMillis(), other.start.getMillis())),
- new Instant(Math.max(end.getMillis(), other.end.getMillis())));
- }
-
- @Override
- public boolean equals(Object o) {
- return (o instanceof IntervalWindow)
- && ((IntervalWindow) o).end.isEqual(end)
- && ((IntervalWindow) o).start.isEqual(start);
- }
-
- @Override
- public int hashCode() {
- // The end values are themselves likely to be arithmetic sequence, which
- // is a poor distribution to use for a hashtable, so we
- // add a highly non-linear transformation.
- return (int)
- (start.getMillis() + modInverse((int) (end.getMillis() << 1) + 1));
- }
-
- /**
- * Compute the inverse of (odd) x mod 2^32.
- */
- private int modInverse(int x) {
- // Cube gives inverse mod 2^4, as x^4 == 1 (mod 2^4) for all odd x.
- int inverse = x * x * x;
- // Newton iteration doubles correct bits at each step.
- inverse *= 2 - x * inverse;
- inverse *= 2 - x * inverse;
- inverse *= 2 - x * inverse;
- return inverse;
- }
-
- @Override
- public String toString() {
- return "[" + start + ".." + end + ")";
- }
-
- @Override
- public int compareTo(IntervalWindow o) {
- if (start.isEqual(o.start)) {
- return end.compareTo(o.end);
- }
- return start.compareTo(o.start);
- }
-
- /**
- * Returns a {@link Coder} suitable for {@link IntervalWindow}.
- */
- public static Coder<IntervalWindow> getCoder() {
- return IntervalWindowCoder.of();
- }
-
- /**
- * Encodes an {@link IntervalWindow} as a pair of its upper bound and duration.
- */
- private static class IntervalWindowCoder extends AtomicCoder<IntervalWindow> {
-
- private static final IntervalWindowCoder INSTANCE =
- new IntervalWindowCoder();
-
- private static final Coder<Instant> instantCoder = InstantCoder.of();
- private static final Coder<ReadableDuration> durationCoder = DurationCoder.of();
-
- @JsonCreator
- public static IntervalWindowCoder of() {
- return INSTANCE;
- }
-
- @Override
- public void encode(IntervalWindow window,
- OutputStream outStream,
- Context context)
- throws IOException, CoderException {
- instantCoder.encode(window.end, outStream, context.nested());
- durationCoder.encode(new Duration(window.start, window.end), outStream, context.nested());
- }
-
- @Override
- public IntervalWindow decode(InputStream inStream, Context context)
- throws IOException, CoderException {
- Instant end = instantCoder.decode(inStream, context.nested());
- ReadableDuration duration = durationCoder.decode(inStream, context.nested());
- return new IntervalWindow(end.minus(duration), end);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/InvalidWindows.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/InvalidWindows.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/InvalidWindows.java
deleted file mode 100644
index 596f4e7..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/InvalidWindows.java
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-
-import org.joda.time.Instant;
-
-import java.util.Collection;
-
-/**
- * A {@link WindowFn} that represents an invalid pipeline state.
- *
- * @param <W> window type
- */
-public class InvalidWindows<W extends BoundedWindow> extends WindowFn<Object, W> {
- private String cause;
- private WindowFn<?, W> originalWindowFn;
-
- public InvalidWindows(String cause, WindowFn<?, W> originalWindowFn) {
- this.originalWindowFn = originalWindowFn;
- this.cause = cause;
- }
-
- /**
- * Returns the reason that this {@code WindowFn} is invalid.
- */
- public String getCause() {
- return cause;
- }
-
- /**
- * Returns the original windowFn that this InvalidWindows replaced.
- */
- public WindowFn<?, W> getOriginalWindowFn() {
- return originalWindowFn;
- }
-
- @Override
- public Collection<W> assignWindows(AssignContext c) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public void mergeWindows(MergeContext c) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public Coder<W> windowCoder() {
- return originalWindowFn.windowCoder();
- }
-
- /**
- * {@code InvalidWindows} objects with the same {@code originalWindowFn} are compatible.
- */
- @Override
- public boolean isCompatible(WindowFn<?, ?> other) {
- return getClass() == other.getClass()
- && getOriginalWindowFn().isCompatible(
- ((InvalidWindows<?>) other).getOriginalWindowFn());
- }
-
- @Override
- public W getSideInputWindow(BoundedWindow window) {
- throw new UnsupportedOperationException("InvalidWindows is not allowed in side inputs");
- }
-
- @Override
- public Instant getOutputTime(Instant inputTimestamp, W window) {
- return inputTimestamp;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/MergeOverlappingIntervalWindows.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/MergeOverlappingIntervalWindows.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/MergeOverlappingIntervalWindows.java
deleted file mode 100644
index 4e06234..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/MergeOverlappingIntervalWindows.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-
-/**
- * A utility function for merging overlapping {@link IntervalWindow}s.
- */
-public class MergeOverlappingIntervalWindows {
-
- /**
- * Merge overlapping {@link IntervalWindow}s.
- */
- public static void mergeWindows(WindowFn<?, IntervalWindow>.MergeContext c) throws Exception {
- // Merge any overlapping windows into a single window.
- // Sort the list of existing windows so we only have to
- // traverse the list once rather than considering all
- // O(n^2) window pairs.
- List<IntervalWindow> sortedWindows = new ArrayList<>();
- for (IntervalWindow window : c.windows()) {
- sortedWindows.add(window);
- }
- Collections.sort(sortedWindows);
- List<MergeCandidate> merges = new ArrayList<>();
- MergeCandidate current = new MergeCandidate();
- for (IntervalWindow window : sortedWindows) {
- if (current.intersects(window)) {
- current.add(window);
- } else {
- merges.add(current);
- current = new MergeCandidate(window);
- }
- }
- merges.add(current);
- for (MergeCandidate merge : merges) {
- merge.apply(c);
- }
- }
-
- private static class MergeCandidate {
- private IntervalWindow union;
- private final List<IntervalWindow> parts;
- public MergeCandidate() {
- parts = new ArrayList<>();
- }
- public MergeCandidate(IntervalWindow window) {
- union = window;
- parts = new ArrayList<>(Arrays.asList(window));
- }
- public boolean intersects(IntervalWindow window) {
- return union == null || union.intersects(window);
- }
- public void add(IntervalWindow window) {
- union = union == null ? window : union.span(window);
- parts.add(window);
- }
- public void apply(WindowFn<?, IntervalWindow>.MergeContext c) throws Exception {
- if (parts.size() > 1) {
- c.merge(parts, union);
- }
- }
-
- @Override
- public String toString() {
- return "MergeCandidate[union=" + union + ", parts=" + parts + "]";
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/NonMergingWindowFn.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/NonMergingWindowFn.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/NonMergingWindowFn.java
deleted file mode 100644
index 8aa66fc..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/NonMergingWindowFn.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-/**
- * Abstract base class for {@link WindowFn}s that do not merge windows.
- *
- * @param <T> type of elements being windowed
- * @param <W> {@link BoundedWindow} subclass used to represent the windows used by this
- * {@code WindowFn}
- */
-public abstract class NonMergingWindowFn<T, W extends BoundedWindow>
- extends WindowFn<T, W> {
- @Override
- public final void mergeWindows(MergeContext c) { }
-
- @Override
- public final boolean isNonMerging() {
- return true;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/OrFinallyTrigger.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/OrFinallyTrigger.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/OrFinallyTrigger.java
deleted file mode 100644
index 652092a..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/OrFinallyTrigger.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.util.ExecutableTrigger;
-import com.google.common.annotations.VisibleForTesting;
-
-import org.joda.time.Instant;
-
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * Executes the {@code actual} trigger until it finishes or until the {@code until} trigger fires.
- */
-class OrFinallyTrigger<W extends BoundedWindow> extends Trigger<W> {
-
- private static final int ACTUAL = 0;
- private static final int UNTIL = 1;
-
- @VisibleForTesting OrFinallyTrigger(Trigger<W> actual, Trigger.OnceTrigger<W> until) {
- super(Arrays.asList(actual, until));
- }
-
- @Override
- public void onElement(OnElementContext c) throws Exception {
- c.trigger().subTrigger(ACTUAL).invokeOnElement(c);
- c.trigger().subTrigger(UNTIL).invokeOnElement(c);
- }
-
- @Override
- public void onMerge(OnMergeContext c) throws Exception {
- for (ExecutableTrigger<W> subTrigger : c.trigger().subTriggers()) {
- subTrigger.invokeOnMerge(c);
- }
- updateFinishedState(c);
- }
-
- @Override
- public Instant getWatermarkThatGuaranteesFiring(W window) {
- // This trigger fires once either the trigger or the until trigger fires.
- Instant actualDeadline = subTriggers.get(ACTUAL).getWatermarkThatGuaranteesFiring(window);
- Instant untilDeadline = subTriggers.get(UNTIL).getWatermarkThatGuaranteesFiring(window);
- return actualDeadline.isBefore(untilDeadline) ? actualDeadline : untilDeadline;
- }
-
- @Override
- public Trigger<W> getContinuationTrigger(List<Trigger<W>> continuationTriggers) {
- // Use OrFinallyTrigger instead of AfterFirst because the continuation of ACTUAL
- // may not be a OnceTrigger.
- return Repeatedly.forever(
- new OrFinallyTrigger<W>(
- continuationTriggers.get(ACTUAL),
- (Trigger.OnceTrigger<W>) continuationTriggers.get(UNTIL)));
- }
-
- @Override
- public boolean shouldFire(Trigger<W>.TriggerContext context) throws Exception {
- return context.trigger().subTrigger(ACTUAL).invokeShouldFire(context)
- || context.trigger().subTrigger(UNTIL).invokeShouldFire(context);
- }
-
- @Override
- public void onFire(Trigger<W>.TriggerContext context) throws Exception {
- ExecutableTrigger<W> actualSubtrigger = context.trigger().subTrigger(ACTUAL);
- ExecutableTrigger<W> untilSubtrigger = context.trigger().subTrigger(UNTIL);
-
- if (untilSubtrigger.invokeShouldFire(context)) {
- untilSubtrigger.invokeOnFire(context);
- actualSubtrigger.invokeClear(context);
- } else {
- // If until didn't fire, then the actual must have (or it is forbidden to call
- // onFire) so we are done only if actual is done.
- actualSubtrigger.invokeOnFire(context);
- // Do not clear the until trigger, because it tracks data across firings.
- }
- updateFinishedState(context);
- }
-
- private void updateFinishedState(TriggerContext c) throws Exception {
- boolean anyStillFinished = false;
- for (ExecutableTrigger<W> subTrigger : c.trigger().subTriggers()) {
- anyStillFinished |= c.forTrigger(subTrigger).trigger().isFinished();
- }
- c.trigger().setFinished(anyStillFinished);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/OutputTimeFn.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/OutputTimeFn.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/OutputTimeFn.java
deleted file mode 100644
index c5d943d..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/OutputTimeFn.java
+++ /dev/null
@@ -1,319 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.common.collect.Ordering;
-
-import org.joda.time.Instant;
-
-import java.io.Serializable;
-import java.util.Objects;
-
-/**
- * <b><i>(Experimental)</i></b> A function from timestamps of input values to the timestamp for a
- * computed value.
- *
- * <p>The function is represented via three components:
- * <ol>
- * <li>{@link #assignOutputTime} calculates an output timestamp for any input
- * value in a particular window.</li>
- * <li>The output timestamps for all non-late input values within a window are combined
- * according to {@link #combine combine()}, a commutative and associative operation on
- * the output timestamps.</li>
- * <li>The output timestamp when windows merge is provided by {@link #merge merge()}.</li>
- * </ol>
- *
- * <p>This abstract class cannot be subclassed directly, by design: it may grow
- * in consumer-compatible ways that require mutually-exclusive default implementations. To
- * create a concrete subclass, extend {@link OutputTimeFn.Defaults} or
- * {@link OutputTimeFn.DependsOnlyOnWindow}. Note that as long as this class remains
- * experimental, we may also choose to change it in arbitrary backwards-incompatible ways.
- *
- * @param <W> the type of window. Contravariant: methods accepting any subtype of
- * {@code OutputTimeFn<W>} should use the parameter type {@code OutputTimeFn<? super W>}.
- */
-@Experimental(Experimental.Kind.OUTPUT_TIME)
-public abstract class OutputTimeFn<W extends BoundedWindow> implements Serializable {
-
- /**
- * Private constructor to prevent subclassing other than provided base classes.
- */
- private OutputTimeFn() { }
-
- /**
- * Returns the output timestamp to use for data depending on the given
- * {@code inputTimestamp} in the specified {@code window}.
- *
- *
- * <p>The result of this method must be between {@code inputTimestamp} and
- * {@code window.maxTimestamp()} (inclusive on both sides).
- *
- * <p>This function must be monotonic across input timestamps. Specifically, if {@code A < B},
- * then {@code assignOutputTime(A, window) <= assignOutputTime(B, window)}.
- *
- * <p>For a {@link WindowFn} that doesn't produce overlapping windows, this can (and typically
- * should) just return {@code inputTimestamp}. In the presence of overlapping windows, it is
- * suggested that the result in later overlapping windows is past the end of earlier windows
- * so that the later windows don't prevent the watermark from
- * progressing past the end of the earlier window.
- *
- * <p>See the overview of {@link OutputTimeFn} for the consistency properties required
- * between {@link #assignOutputTime}, {@link #combine}, and {@link #merge}.
- */
- public abstract Instant assignOutputTime(Instant inputTimestamp, W window);
-
- /**
- * Combines the given output times, which must be from the same window, into an output time
- * for a computed value.
- *
- * <ul>
- * <li>{@code combine} must be commutative: {@code combine(a, b).equals(combine(b, a))}.</li>
- * <li>{@code combine} must be associative:
- * {@code combine(a, combine(b, c)).equals(combine(combine(a, b), c))}.</li>
- * </ul>
- */
- public abstract Instant combine(Instant outputTime, Instant otherOutputTime);
-
- /**
- * Merges the given output times, presumed to be combined output times for windows that
- * are merging, into an output time for the {@code resultWindow}.
- *
- * <p>When windows {@code w1} and {@code w2} merge to become a new window {@code w1plus2},
- * then {@link #merge} must be implemented such that the output time is the same as
- * if all timestamps were assigned in {@code w1plus2}. Formally:
- *
- * <p>{@code fn.merge(w1plus2, fn.assignOutputTime(t1, w1), fn.assignOutputTime(t2, w2))}
- *
- * <p>must be equal to
- *
- * <p>{@code fn.combine(fn.assignOutputTime(t1, w1plus2), fn.assignOutputTime(t2, w1plus2))}
- *
- * <p>If the assigned time depends only on the window, the correct implementation of
- * {@link #merge merge()} necessarily returns the result of
- * {@link #assignOutputTime assignOutputTime(t1, w1plus2)}
- * (which equals {@link #assignOutputTime assignOutputTime(t2, w1plus2)}).
- * Defaults for this case are provided by {@link DependsOnlyOnWindow}.
- *
- * <p>For many other {@link OutputTimeFn} implementations, such as taking the earliest or latest
- * timestamp, this will be the same as {@link #combine combine()}. Defaults for this
- * case are provided by {@link Defaults}.
- */
- public abstract Instant merge(W intoWindow, Iterable<? extends Instant> mergingTimestamps);
-
- /**
- * Returns {@code true} if the result of combination of many output timestamps actually depends
- * only on the earliest.
- *
- * <p>This may allow optimizations when it is very efficient to retrieve the earliest timestamp
- * to be combined.
- */
- public abstract boolean dependsOnlyOnEarliestInputTimestamp();
-
- /**
- * Returns {@code true} if the result does not depend on what outputs were combined but only
- * the window they are in. The canonical example is if all timestamps are sure to
- * be the end of the window.
- *
- * <p>This may allow optimizations, since it is typically very efficient to retrieve the window
- * and combining output timestamps is not necessary.
- *
- * <p>If the assigned output time for an implementation depends only on the window, consider
- * extending {@link DependsOnlyOnWindow}, which returns {@code true} here and also provides
- * a framework for easily implementing a correct {@link #merge}, {@link #combine} and
- * {@link #assignOutputTime}.
- */
- public abstract boolean dependsOnlyOnWindow();
-
- /**
- * <b><i>(Experimental)</i></b> Default method implementations for {@link OutputTimeFn} where the
- * output time depends on the input element timestamps and possibly the window.
- *
- * <p>To complete an implementation, override {@link #assignOutputTime}, at a minimum.
- *
- * <p>By default, {@link #combine} and {@link #merge} return the earliest timestamp of their
- * inputs.
- */
- public abstract static class Defaults<W extends BoundedWindow> extends OutputTimeFn<W> {
-
- protected Defaults() {
- super();
- }
-
- /**
- * {@inheritDoc}
- *
- * @return the earlier of the two timestamps.
- */
- @Override
- public Instant combine(Instant outputTimestamp, Instant otherOutputTimestamp) {
- return Ordering.natural().min(outputTimestamp, otherOutputTimestamp);
- }
-
- /**
- * {@inheritDoc}
- *
- * @return the result of {@link #combine combine(outputTimestamp, otherOutputTimestamp)},
- * by default.
- */
- @Override
- public Instant merge(W resultWindow, Iterable<? extends Instant> mergingTimestamps) {
- return OutputTimeFns.combineOutputTimes(this, mergingTimestamps);
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code false}. An {@link OutputTimeFn} that depends only on the window should extend
- * {@link OutputTimeFn.DependsOnlyOnWindow}.
- */
- @Override
- public final boolean dependsOnlyOnWindow() {
- return false;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code false} by default.
- */
- @Override
- public boolean dependsOnlyOnEarliestInputTimestamp() {
- return false;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true} if the two {@link OutputTimeFn} instances have the same class, by
- * default.
- */
- @Override
- public boolean equals(Object other) {
- if (other == null) {
- return false;
- }
-
- return this.getClass().equals(other.getClass());
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(getClass());
- }
- }
-
- /**
- * <b><i>(Experimental)</i></b> Default method implementations for {@link OutputTimeFn} when the
- * output time depends only on the window.
- *
- * <p>To complete an implementation, override {@link #assignOutputTime(BoundedWindow)}.
- */
- public abstract static class DependsOnlyOnWindow<W extends BoundedWindow>
- extends OutputTimeFn<W> {
-
- protected DependsOnlyOnWindow() {
- super();
- }
-
- /**
- * Returns the output timestamp to use for data in the specified {@code window}.
- *
- * <p>Note that the result of this method must be between the maximum possible input timestamp
- * in {@code window} and {@code window.maxTimestamp()} (inclusive on both sides).
- *
- * <p>For example, using {@code Sessions.withGapDuration(gapDuration)}, we know that all input
- * timestamps must lie at least {@code gapDuration} from the end of the session, so
- * {@code window.maxTimestamp() - gapDuration} is an acceptable assigned timestamp.
- *
- * @see #assignOutputTime(Instant, BoundedWindow)
- */
- protected abstract Instant assignOutputTime(W window);
-
- /**
- * {@inheritDoc}
- *
- * @return the result of {@link #assignOutputTime(BoundedWindow) assignOutputTime(window)}.
- */
- @Override
- public final Instant assignOutputTime(Instant timestamp, W window) {
- return assignOutputTime(window);
- }
-
- /**
- * {@inheritDoc}
- *
- * @return the same timestamp as both argument timestamps, which are necessarily equal.
- */
- @Override
- public final Instant combine(Instant outputTimestamp, Instant otherOutputTimestamp) {
- return outputTimestamp;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return the result of
- * {@link #assignOutputTime(BoundedWindow) assignOutputTime(resultWindow)}.
- */
- @Override
- public final Instant merge(W resultWindow, Iterable<? extends Instant> mergingTimestamps) {
- return assignOutputTime(resultWindow);
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}.
- */
- @Override
- public final boolean dependsOnlyOnWindow() {
- return true;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. Since the output time depends only on the window, it can
- * certainly be ascertained given a single input timestamp.
- */
- @Override
- public final boolean dependsOnlyOnEarliestInputTimestamp() {
- return true;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true} if the two {@link OutputTimeFn} instances have the same class, by
- * default.
- */
- @Override
- public boolean equals(Object other) {
- if (other == null) {
- return false;
- }
-
- return this.getClass().equals(other.getClass());
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(getClass());
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/OutputTimeFns.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/OutputTimeFns.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/OutputTimeFns.java
deleted file mode 100644
index dcc0f5b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/OutputTimeFns.java
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Ordering;
-
-import org.joda.time.Instant;
-
-import javax.annotation.Nullable;
-
-/**
- * <b><i>(Experimental)</i></b> Static utility methods and provided implementations for
- * {@link OutputTimeFn}.
- */
-@Experimental(Experimental.Kind.OUTPUT_TIME)
-public class OutputTimeFns {
- /**
- * The policy of outputting at the earliest of the input timestamps for non-late input data
- * that led to a computed value.
- *
- * <p>For example, suppose <i>v</i><sub>1</sub> through <i>v</i><sub>n</sub> are all on-time
- * elements being aggregated via some function {@code f} into
- * {@code f}(<i>v</i><sub>1</sub>, ..., <i>v</i><sub>n</sub>). When emitted, the output
- * timestamp of the result will be the earliest of the event time timestamps.
- *
- * <p>If data arrives late, it has no effect on the output timestamp.
- */
- public static OutputTimeFn<BoundedWindow> outputAtEarliestInputTimestamp() {
- return new OutputAtEarliestInputTimestamp();
- }
-
- /**
- * The policy of holding the watermark to the latest of the input timestamps
- * for non-late input data that led to a computed value.
- *
- * <p>For example, suppose <i>v</i><sub>1</sub> through <i>v</i><sub>n</sub> are all on-time
- * elements being aggregated via some function {@code f} into
- * {@code f}(<i>v</i><sub>1</sub>, ..., <i>v</i><sub>n</sub>). When emitted, the output
- * timestamp of the result will be the latest of the event time timestamps.
- *
- * <p>If data arrives late, it has no effect on the output timestamp.
- */
- public static OutputTimeFn<BoundedWindow> outputAtLatestInputTimestamp() {
- return new OutputAtLatestInputTimestamp();
- }
-
- /**
- * The policy of outputting with timestamps at the end of the window.
- *
- * <p>Note that this output timestamp depends only on the window. See
- * {@link OutputTimeFn#dependsOnlyOnWindow()}.
- *
- * <p>When windows merge, instead of using {@link OutputTimeFn#combine} to obtain an output
- * timestamp for the results in the new window, it is mandatory to obtain a new output
- * timestamp from {@link OutputTimeFn#assignOutputTime} with the new window and an arbitrary
- * timestamp (because it is guaranteed that the timestamp is irrelevant).
- *
- * <p>For non-merging window functions, this {@link OutputTimeFn} works transparently.
- */
- public static OutputTimeFn<BoundedWindow> outputAtEndOfWindow() {
- return new OutputAtEndOfWindow();
- }
-
- /**
- * Applies the given {@link OutputTimeFn} to the given output times, obtaining
- * the output time for a value computed. See {@link OutputTimeFn#combine} for
- * a full specification.
- *
- * @throws IllegalArgumentException if {@code outputTimes} is empty.
- */
- public static Instant combineOutputTimes(
- OutputTimeFn<?> outputTimeFn, Iterable<? extends Instant> outputTimes) {
- checkArgument(
- !Iterables.isEmpty(outputTimes),
- "Collection of output times must not be empty in %s.combineOutputTimes",
- OutputTimeFns.class.getName());
-
- @Nullable
- Instant combinedOutputTime = null;
- for (Instant outputTime : outputTimes) {
- combinedOutputTime =
- combinedOutputTime == null
- ? outputTime : outputTimeFn.combine(combinedOutputTime, outputTime);
- }
- return combinedOutputTime;
- }
-
- /**
- * See {@link #outputAtEarliestInputTimestamp}.
- */
- private static class OutputAtEarliestInputTimestamp extends OutputTimeFn.Defaults<BoundedWindow> {
- @Override
- public Instant assignOutputTime(Instant inputTimestamp, BoundedWindow window) {
- return inputTimestamp;
- }
-
- @Override
- public Instant combine(Instant outputTime, Instant otherOutputTime) {
- return Ordering.natural().min(outputTime, otherOutputTime);
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. The result of any combine will be the earliest input timestamp.
- */
- @Override
- public boolean dependsOnlyOnEarliestInputTimestamp() {
- return true;
- }
- }
-
- /**
- * See {@link #outputAtLatestInputTimestamp}.
- */
- private static class OutputAtLatestInputTimestamp extends OutputTimeFn.Defaults<BoundedWindow> {
- @Override
- public Instant assignOutputTime(Instant inputTimestamp, BoundedWindow window) {
- return inputTimestamp;
- }
-
- @Override
- public Instant combine(Instant outputTime, Instant otherOutputTime) {
- return Ordering.natural().max(outputTime, otherOutputTime);
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code false}.
- */
- @Override
- public boolean dependsOnlyOnEarliestInputTimestamp() {
- return false;
- }
- }
-
- private static class OutputAtEndOfWindow extends OutputTimeFn.DependsOnlyOnWindow<BoundedWindow> {
-
- /**
- * {@inheritDoc}
- *
- * @return {@code window.maxTimestamp()}.
- */
- @Override
- protected Instant assignOutputTime(BoundedWindow window) {
- return window.maxTimestamp();
- }
- }
-}
[47/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/JAXBCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/JAXBCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/JAXBCoder.java
deleted file mode 100644
index 2b0190b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/JAXBCoder.java
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.cloud.dataflow.sdk.util.Structs;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.io.FilterInputStream;
-import java.io.FilterOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-import javax.xml.bind.JAXBContext;
-import javax.xml.bind.JAXBException;
-import javax.xml.bind.Marshaller;
-import javax.xml.bind.Unmarshaller;
-
-/**
- * A coder for JAXB annotated objects. This coder uses JAXB marshalling/unmarshalling mechanisms
- * to encode/decode the objects. Users must provide the {@code Class} of the JAXB annotated object.
- *
- * @param <T> type of JAXB annotated objects that will be serialized.
- */
-public class JAXBCoder<T> extends AtomicCoder<T> {
-
- private final Class<T> jaxbClass;
- private transient Marshaller jaxbMarshaller = null;
- private transient Unmarshaller jaxbUnmarshaller = null;
-
- public Class<T> getJAXBClass() {
- return jaxbClass;
- }
-
- private JAXBCoder(Class<T> jaxbClass) {
- this.jaxbClass = jaxbClass;
- }
-
- /**
- * Create a coder for a given type of JAXB annotated objects.
- *
- * @param jaxbClass the {@code Class} of the JAXB annotated objects.
- */
- public static <T> JAXBCoder<T> of(Class<T> jaxbClass) {
- return new JAXBCoder<>(jaxbClass);
- }
-
- @Override
- public void encode(T value, OutputStream outStream, Context context)
- throws CoderException, IOException {
- try {
- if (jaxbMarshaller == null) {
- JAXBContext jaxbContext = JAXBContext.newInstance(jaxbClass);
- jaxbMarshaller = jaxbContext.createMarshaller();
- }
-
- jaxbMarshaller.marshal(value, new FilterOutputStream(outStream) {
- // JAXB closes the underyling stream so we must filter out those calls.
- @Override
- public void close() throws IOException {
- }
- });
- } catch (JAXBException e) {
- throw new CoderException(e);
- }
- }
-
- @Override
- public T decode(InputStream inStream, Context context) throws CoderException, IOException {
- try {
- if (jaxbUnmarshaller == null) {
- JAXBContext jaxbContext = JAXBContext.newInstance(jaxbClass);
- jaxbUnmarshaller = jaxbContext.createUnmarshaller();
- }
-
- @SuppressWarnings("unchecked")
- T obj = (T) jaxbUnmarshaller.unmarshal(new FilterInputStream(inStream) {
- // JAXB closes the underyling stream so we must filter out those calls.
- @Override
- public void close() throws IOException {
- }
- });
- return obj;
- } catch (JAXBException e) {
- throw new CoderException(e);
- }
- }
-
- @Override
- public String getEncodingId() {
- return getJAXBClass().getName();
- }
-
- ////////////////////////////////////////////////////////////////////////////////////
- // JSON Serialization details below
-
- private static final String JAXB_CLASS = "jaxb_class";
-
- /**
- * Constructor for JSON deserialization only.
- */
- @JsonCreator
- public static <T> JAXBCoder<T> of(
- @JsonProperty(JAXB_CLASS) String jaxbClassName) {
- try {
- @SuppressWarnings("unchecked")
- Class<T> jaxbClass = (Class<T>) Class.forName(jaxbClassName);
- return of(jaxbClass);
- } catch (ClassNotFoundException e) {
- throw new IllegalArgumentException(e);
- }
- }
-
- @Override
- public CloudObject asCloudObject() {
- CloudObject result = super.asCloudObject();
- Structs.addString(result, JAXB_CLASS, jaxbClass.getName());
- return result;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/KvCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/KvCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/KvCoder.java
deleted file mode 100644
index 33085cf..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/KvCoder.java
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import static com.google.cloud.dataflow.sdk.util.Structs.addBoolean;
-
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObserver;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.common.base.Preconditions;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * A {@code KvCoder} encodes {@link KV}s.
- *
- * @param <K> the type of the keys of the KVs being transcoded
- * @param <V> the type of the values of the KVs being transcoded
- */
-public class KvCoder<K, V> extends KvCoderBase<KV<K, V>> {
- public static <K, V> KvCoder<K, V> of(Coder<K> keyCoder,
- Coder<V> valueCoder) {
- return new KvCoder<>(keyCoder, valueCoder);
- }
-
- @JsonCreator
- public static KvCoder<?, ?> of(
- @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
- List<Coder<?>> components) {
- Preconditions.checkArgument(components.size() == 2,
- "Expecting 2 components, got " + components.size());
- return of(components.get(0), components.get(1));
- }
-
- public static <K, V> List<Object> getInstanceComponents(
- KV<K, V> exampleValue) {
- return Arrays.asList(
- exampleValue.getKey(),
- exampleValue.getValue());
- }
-
- public Coder<K> getKeyCoder() {
- return keyCoder;
- }
-
- public Coder<V> getValueCoder() {
- return valueCoder;
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private final Coder<K> keyCoder;
- private final Coder<V> valueCoder;
-
- private KvCoder(Coder<K> keyCoder, Coder<V> valueCoder) {
- this.keyCoder = keyCoder;
- this.valueCoder = valueCoder;
- }
-
- @Override
- public void encode(KV<K, V> kv, OutputStream outStream, Context context)
- throws IOException, CoderException {
- if (kv == null) {
- throw new CoderException("cannot encode a null KV");
- }
- Context nestedContext = context.nested();
- keyCoder.encode(kv.getKey(), outStream, nestedContext);
- valueCoder.encode(kv.getValue(), outStream, nestedContext);
- }
-
- @Override
- public KV<K, V> decode(InputStream inStream, Context context)
- throws IOException, CoderException {
- Context nestedContext = context.nested();
- K key = keyCoder.decode(inStream, nestedContext);
- V value = valueCoder.decode(inStream, nestedContext);
- return KV.of(key, value);
- }
-
- @Override
- public List<? extends Coder<?>> getCoderArguments() {
- return Arrays.asList(keyCoder, valueCoder);
- }
-
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- verifyDeterministic("Key coder must be deterministic", getKeyCoder());
- verifyDeterministic("Value coder must be deterministic", getValueCoder());
- }
-
- @Override
- public boolean consistentWithEquals() {
- return keyCoder.consistentWithEquals() && valueCoder.consistentWithEquals();
- }
-
- @Override
- public Object structuralValue(KV<K, V> kv) throws Exception {
- if (consistentWithEquals()) {
- return kv;
- } else {
- return KV.of(getKeyCoder().structuralValue(kv.getKey()),
- getValueCoder().structuralValue(kv.getValue()));
- }
- }
-
- @Override
- public CloudObject asCloudObject() {
- CloudObject result = super.asCloudObject();
- addBoolean(result, PropertyNames.IS_PAIR_LIKE, true);
- return result;
- }
-
- /**
- * Returns whether both keyCoder and valueCoder are considered not expensive.
- */
- @Override
- public boolean isRegisterByteSizeObserverCheap(KV<K, V> kv, Context context) {
- return keyCoder.isRegisterByteSizeObserverCheap(kv.getKey(),
- context.nested())
- && valueCoder.isRegisterByteSizeObserverCheap(kv.getValue(),
- context.nested());
- }
-
- /**
- * Notifies ElementByteSizeObserver about the byte size of the
- * encoded value using this coder.
- */
- @Override
- public void registerByteSizeObserver(
- KV<K, V> kv, ElementByteSizeObserver observer, Context context)
- throws Exception {
- if (kv == null) {
- throw new CoderException("cannot encode a null KV");
- }
- keyCoder.registerByteSizeObserver(
- kv.getKey(), observer, context.nested());
- valueCoder.registerByteSizeObserver(
- kv.getValue(), observer, context.nested());
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/KvCoderBase.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/KvCoderBase.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/KvCoderBase.java
deleted file mode 100644
index 4a12ee0..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/KvCoderBase.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.util.List;
-
-/**
- * An abstract base class for KvCoder. Works around a Jackson2 bug tickled when building
- * {@link KvCoder} directly (as of this writing, Jackson2 walks off the end of
- * an array when it tries to deserialize a class with multiple generic type
- * parameters). This class should be removed when possible.
- *
- * @param <T> the type of values being transcoded
- */
-@Deprecated
-public abstract class KvCoderBase<T> extends StandardCoder<T> {
-  /**
-   * A constructor used only for decoding from JSON. The actual work is handed to
-   * {@link KvCoder#of(List)}.
-   *
-   * @param typeId present in the JSON encoding, but unused
-   * @param isPairLike present in the JSON encoding, but unused
-   * @param components the component coders forwarded to {@link KvCoder#of(List)}
-   */
-  @Deprecated
-  @JsonCreator
-  public static KvCoderBase<?> of(
-      // N.B. typeId has to be declared as a parameter here, because the deserializer is
-      // handed a field named "@type" as part of its input.
-      //
-      // Were the field left unconsumed, Jackson2 would see both an unconsumed field and a
-      // returned value of a derived type, and would try to update the returned value with
-      // the leftover field data. The standard JsonDeserializer has no mechanism for
-      // updating already-constructed values, so it would throw and deserialization
-      // would fail.
-      @JsonProperty(value = "@type", required = false) String typeId,
-      @JsonProperty(value = PropertyNames.IS_PAIR_LIKE, required = false) boolean isPairLike,
-      @JsonProperty(PropertyNames.COMPONENT_ENCODINGS) List<Coder<?>> components) {
-    KvCoder<?, ?> delegate = KvCoder.of(components);
-    return delegate;
-  }
-
-  protected KvCoderBase() {}
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/ListCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/ListCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/ListCoder.java
deleted file mode 100644
index bc74404..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/ListCoder.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.common.base.Preconditions;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.util.List;
-
-/**
- * A {@link Coder} for {@link List}, using the format of {@link IterableLikeCoder}.
- *
- * @param <T> the type of the elements of the Lists being transcoded
- */
-public class ListCoder<T> extends IterableLikeCoder<T, List<T>> {
-
- public static <T> ListCoder<T> of(Coder<T> elemCoder) {
- return new ListCoder<>(elemCoder);
- }
-
- /////////////////////////////////////////////////////////////////////////////
- // Internal operations below here.
-
- @Override
- protected final List<T> decodeToIterable(List<T> decodedElements) {
- return decodedElements;
- }
-
- @JsonCreator
- public static ListCoder<?> of(
- @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
- List<Coder<?>> components) {
- Preconditions.checkArgument(components.size() == 1,
- "Expecting 1 component, got " + components.size());
- return of((Coder<?>) components.get(0));
- }
-
- /**
- * Returns the first element in this list if it is non-empty,
- * otherwise returns {@code null}.
- */
- public static <T> List<Object> getInstanceComponents(List<T> exampleValue) {
- return getInstanceComponentsHelper(exampleValue);
- }
-
- protected ListCoder(Coder<T> elemCoder) {
- super(elemCoder, "List");
- }
-
- /**
- * List sizes are always known, so ListIterable may be deterministic while
- * the general IterableLikeCoder is not.
- */
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- verifyDeterministic(
- "ListCoder.elemCoder must be deterministic", getElemCoder());
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/MapCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/MapCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/MapCoder.java
deleted file mode 100644
index b6f3103..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/MapCoder.java
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObserver;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Maps;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-
-/**
- * A {@link Coder} for {@link Map Maps} that encodes them according to provided
- * coders for keys and values. The encoding is the entry count written as an {@code int},
- * followed by the key and value of every entry in the map's iteration order.
- *
- * @param <K> the type of the keys of the KVs being transcoded
- * @param <V> the type of the values of the KVs being transcoded
- */
-public class MapCoder<K, V> extends MapCoderBase<Map<K, V>> {
-  /**
-   * Produces a MapCoder with the given keyCoder and valueCoder.
-   */
-  public static <K, V> MapCoder<K, V> of(
-      Coder<K> keyCoder,
-      Coder<V> valueCoder) {
-    return new MapCoder<>(keyCoder, valueCoder);
-  }
-
-  /**
-   * JSON factory: expects exactly two component coders, the key coder followed by the value
-   * coder.
-   *
-   * @throws IllegalArgumentException if {@code components} does not hold exactly two coders
-   */
-  @JsonCreator
-  public static MapCoder<?, ?> of(
-      @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
-      List<Coder<?>> components) {
-    Preconditions.checkArgument(components.size() == 2,
-        "Expecting 2 components, got " + components.size());
-    return of((Coder<?>) components.get(0), (Coder<?>) components.get(1));
-  }
-
-  /**
-   * Returns the key and value for an arbitrary element of this map,
-   * if it is non-empty, otherwise returns {@code null}.
-   */
-  public static <K, V> List<Object> getInstanceComponents(
-      Map<K, V> exampleValue) {
-    if (exampleValue.isEmpty()) {
-      return null;
-    }
-    // Any entry will do; take whichever one the map iterates first.
-    Map.Entry<K, V> entry = exampleValue.entrySet().iterator().next();
-    return Arrays.asList(entry.getKey(), entry.getValue());
-  }
-
-  /**
-   * Returns the coder used for keys.
-   */
-  public Coder<K> getKeyCoder() {
-    return keyCoder;
-  }
-
-  /**
-   * Returns the coder used for values.
-   */
-  public Coder<V> getValueCoder() {
-    return valueCoder;
-  }
-
-  /////////////////////////////////////////////////////////////////////////////
-
-  Coder<K> keyCoder;
-  Coder<V> valueCoder;
-
-  MapCoder(Coder<K> keyCoder, Coder<V> valueCoder) {
-    this.keyCoder = keyCoder;
-    this.valueCoder = valueCoder;
-  }
-
-  /**
-   * Writes the entry count as an {@code int}, then each entry's key and value in the nested
-   * context, and finally flushes the stream.
-   *
-   * @throws CoderException if {@code map} is {@code null}
-   */
-  @Override
-  public void encode(
-      Map<K, V> map,
-      OutputStream outStream,
-      Context context)
-      throws IOException, CoderException  {
-    if (map == null) {
-      throw new CoderException("cannot encode a null Map");
-    }
-    DataOutputStream dataOutStream = new DataOutputStream(outStream);
-    dataOutStream.writeInt(map.size());
-    for (Entry<K, V> entry : map.entrySet()) {
-      keyCoder.encode(entry.getKey(), outStream, context.nested());
-      valueCoder.encode(entry.getValue(), outStream, context.nested());
-    }
-    dataOutStream.flush();
-  }
-
-  /**
-   * Reads the entry count, then that many key/value pairs in the nested context, and collects
-   * them into a new hash map.
-   */
-  @Override
-  public Map<K, V> decode(InputStream inStream, Context context)
-      throws IOException, CoderException {
-    DataInputStream dataInStream = new DataInputStream(inStream);
-    int size = dataInStream.readInt();
-    Map<K, V> retval = Maps.newHashMapWithExpectedSize(size);
-    for (int i = 0; i < size; ++i) {
-      K key = keyCoder.decode(inStream, context.nested());
-      V value = valueCoder.decode(inStream, context.nested());
-      retval.put(key, value);
-    }
-    return retval;
-  }
-
-  /**
-   * {@inheritDoc}
-   *
-   * @return a {@link List} containing the key coder at index 0 and the value coder at index 1.
-   */
-  @Override
-  public List<? extends Coder<?>> getCoderArguments() {
-    return Arrays.asList(keyCoder, valueCoder);
-  }
-
-  /**
-   * {@inheritDoc}
-   *
-   * @throws NonDeterministicException always. Not all maps have a deterministic encoding.
-   *         For example, {@code HashMap} comparison does not depend on element order, so
-   *         two {@code HashMap} instances may be equal but produce different encodings.
-   */
-  @Override
-  public void verifyDeterministic() throws NonDeterministicException {
-    throw new NonDeterministicException(this,
-        "Ordering of entries in a Map may be non-deterministic.");
-  }
-
-  /**
-   * Reports 4 bytes for the entry count, then lets the key and value coders report the size of
-   * every entry.
-   *
-   * @throws CoderException if {@code map} is {@code null}; checked before the observer is
-   *         touched, matching what {@link #encode} does for the same input
-   */
-  @Override
-  public void registerByteSizeObserver(
-      Map<K, V> map, ElementByteSizeObserver observer, Context context)
-      throws Exception {
-    if (map == null) {
-      throw new CoderException("cannot encode a null Map");
-    }
-    observer.update(4L);
-    for (Entry<K, V> entry : map.entrySet()) {
-      keyCoder.registerByteSizeObserver(
-          entry.getKey(), observer, context.nested());
-      valueCoder.registerByteSizeObserver(
-          entry.getValue(), observer, context.nested());
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/MapCoderBase.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/MapCoderBase.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/MapCoderBase.java
deleted file mode 100644
index d32406c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/MapCoderBase.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.util.List;
-
-/**
- * An abstract base class for MapCoder. Works around a Jackson2 bug tickled when building
- * {@link MapCoder} directly (as of this writing, Jackson2 walks off the end of
- * an array when it tries to deserialize a class with multiple generic type
- * parameters). This should be removed in favor of a better workaround.
- *
- * @param <T> the type of values being transcoded
- */
-@Deprecated
-public abstract class MapCoderBase<T> extends StandardCoder<T> {
-  /**
-   * A factory used for decoding from JSON. The actual work is handed to
-   * {@link MapCoder#of(List)}.
-   *
-   * @param typeId present in the JSON encoding, but unused
-   * @param components the component coders forwarded to {@link MapCoder#of(List)}
-   */
-  @Deprecated
-  @JsonCreator
-  public static MapCoderBase<?> of(
-      // N.B. typeId has to be declared as a parameter here, because the deserializer is
-      // handed a field named "@type" as part of its input.
-      //
-      // Were the field left unconsumed, Jackson2 would see both an unconsumed field and a
-      // returned value of a derived type, and would try to update the returned value with
-      // the leftover field data. The standard JsonDeserializer has no mechanism for
-      // updating already-constructed values, so it would throw and deserialization
-      // would fail.
-      @JsonProperty(value = "@type", required = false) String typeId,
-      @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
-      List<Coder<?>> components) {
-    MapCoder<?, ?> delegate = MapCoder.of(components);
-    return delegate;
-  }
-
-  protected MapCoderBase() {}
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/NullableCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/NullableCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/NullableCoder.java
deleted file mode 100644
index 5598a71..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/NullableCoder.java
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObserver;
-import com.google.common.base.Optional;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.ImmutableList;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.List;
-
-import javax.annotation.Nullable;
-
-/**
- * A {@link NullableCoder} encodes nullable values of type {@code T} using a nested
- * {@code Coder<T>} that does not tolerate {@code null} values. {@link NullableCoder} uses
- * exactly 1 byte per entry to indicate whether the value is {@code null}, then adds the encoding
- * of the inner coder for non-null values.
- *
- * @param <T> the type of the values being transcoded
- */
-public class NullableCoder<T> extends StandardCoder<T> {
-  /**
-   * Wraps {@code valueCoder} so that {@code null} values can be encoded as well.
-   */
-  public static <T> NullableCoder<T> of(Coder<T> valueCoder) {
-    return new NullableCoder<>(valueCoder);
-  }
-
-  /**
-   * JSON factory: expects exactly one component coder, the nested value coder.
-   *
-   * @throws IllegalArgumentException if {@code components} does not hold exactly one coder
-   */
-  @JsonCreator
-  public static NullableCoder<?> of(
-      @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
-      List<Coder<?>> components) {
-    Preconditions.checkArgument(components.size() == 1,
-        "Expecting 1 components, got " + components.size());
-    return of(components.get(0));
-  }
-
-  /////////////////////////////////////////////////////////////////////////////
-
-  private final Coder<T> valueCoder;
-  // Marker byte written ahead of every value.
-  private static final int ENCODE_NULL = 0;
-  private static final int ENCODE_PRESENT = 1;
-
-  private NullableCoder(Coder<T> valueCoder) {
-    this.valueCoder = valueCoder;
-  }
-
-  /**
-   * Writes the marker byte and, for a non-null {@code value}, the nested coder's encoding in
-   * the nested context.
-   */
-  @Override
-  public void encode(@Nullable T value, OutputStream outStream, Context context)
-      throws IOException, CoderException  {
-    if (value == null) {
-      outStream.write(ENCODE_NULL);
-    } else {
-      outStream.write(ENCODE_PRESENT);
-      valueCoder.encode(value, outStream, context.nested());
-    }
-  }
-
-  /**
-   * Reads the marker byte and returns {@code null} or the nested coder's decoded value
-   * accordingly.
-   *
-   * @throws CoderException if the marker is neither of the two expected values; this includes
-   *     the {@code -1} that {@link InputStream#read()} returns at end of stream
-   */
-  @Override
-  @Nullable
-  public T decode(InputStream inStream, Context context) throws IOException, CoderException {
-    int b = inStream.read();
-    if (b == ENCODE_NULL) {
-      return null;
-    } else if (b != ENCODE_PRESENT) {
-      throw new CoderException(String.format(
-          "NullableCoder expects either a byte valued %s (null) or %s (present), got %s",
-          ENCODE_NULL, ENCODE_PRESENT, b));
-    }
-    return valueCoder.decode(inStream, context.nested());
-  }
-
-  /**
-   * Returns a single-element list holding the nested value coder.
-   */
-  @Override
-  public List<Coder<T>> getCoderArguments() {
-    return ImmutableList.of(valueCoder);
-  }
-
-  /**
-   * {@code NullableCoder} is deterministic if the nested {@code Coder} is.
-   *
-   * {@inheritDoc}
-   */
-  @Override
-  public void verifyDeterministic() throws NonDeterministicException {
-    verifyDeterministic("Value coder must be deterministic", valueCoder);
-  }
-
-  /**
-   * {@code NullableCoder} is consistent with equals if the nested {@code Coder} is.
-   *
-   * {@inheritDoc}
-   */
-  @Override
-  public boolean consistentWithEquals() {
-    return valueCoder.consistentWithEquals();
-  }
-
-  /**
-   * Returns an absent {@link Optional} for {@code null}, otherwise an {@link Optional} holding
-   * the nested coder's structural value.
-   */
-  @Override
-  public Object structuralValue(@Nullable T value) throws Exception {
-    if (value == null) {
-      return Optional.absent();
-    }
-    return Optional.of(valueCoder.structuralValue(value));
-  }
-
-  /**
-   * Overridden to short-circuit the default {@code StandardCoder} behavior of encoding and
-   * counting the bytes. The size is known (1 byte) when {@code value} is {@code null}, otherwise
-   * the size is 1 byte plus the size of nested {@code Coder}'s encoding of {@code value}.
-   *
-   * {@inheritDoc}
-   */
-  @Override
-  public void registerByteSizeObserver(
-      @Nullable T value, ElementByteSizeObserver observer, Context context) throws Exception {
-    observer.update(1);
-    if (value != null) {
-      valueCoder.registerByteSizeObserver(value, observer, context.nested());
-    }
-  }
-
-  /**
-   * Overridden to short-circuit the default {@code StandardCoder} behavior of encoding and
-   * counting the bytes. The size is known (1 byte) when {@code value} is {@code null}, otherwise
-   * the size is 1 byte plus the size of nested {@code Coder}'s encoding of {@code value}.
-   *
-   * {@inheritDoc}
-   */
-  @Override
-  protected long getEncodedElementByteSize(@Nullable T value, Context context) throws Exception {
-    if (value == null) {
-      return 1;
-    }
-
-    if (valueCoder instanceof StandardCoder) {
-      // If valueCoder is a StandardCoder then we can ask it directly for the encoded size of
-      // the value, adding 1 byte to count the null indicator.
-      return 1  + ((StandardCoder<T>) valueCoder)
-          .getEncodedElementByteSize(value, context.nested());
-    }
-
-    // If valueCoder is not a StandardCoder then fall back to the default StandardCoder behavior
-    // of encoding and counting the bytes. The encoding will include the null indicator byte.
-    return super.getEncodedElementByteSize(value, context);
-  }
-
-  /**
-   * {@code NullableCoder} is cheap if {@code valueCoder} is cheap. A {@code null} value is
-   * always cheap: only the 1-byte marker is written for it.
-   *
-   * {@inheritDoc}
-   */
-  @Override
-  public boolean isRegisterByteSizeObserverCheap(@Nullable T value, Context context) {
-    if (value == null) {
-      // The nested coder does not tolerate null, so it must not be asked about it.
-      return true;
-    }
-    return valueCoder.isRegisterByteSizeObserverCheap(value, context.nested());
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/Proto2Coder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/Proto2Coder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/Proto2Coder.java
deleted file mode 100644
index ef91ba9..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/Proto2Coder.java
+++ /dev/null
@@ -1,361 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.coders;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.google.cloud.dataflow.sdk.coders.protobuf.ProtoCoder;
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.cloud.dataflow.sdk.util.Structs;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-import com.google.protobuf.ExtensionRegistry;
-import com.google.protobuf.Message;
-import com.google.protobuf.Parser;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.lang.reflect.InvocationTargetException;
-import java.lang.reflect.Method;
-import java.lang.reflect.Modifier;
-import java.util.Collections;
-import java.util.List;
-import java.util.Objects;
-
-import javax.annotation.Nullable;
-
-/**
- * A {@link Coder} using Google Protocol Buffers 2 binary format.
- *
- * <p>To learn more about Protocol Buffers, visit:
- * <a href="https://developers.google.com/protocol-buffers">https://developers.google.com/protocol-buffers</a>
- *
- * <p>To use, specify the {@link Coder} type on a PCollection containing Protocol Buffers messages.
- *
- * <pre>
- * {@code
- * PCollection<MyProto.Message> records =
- * input.apply(...)
- * .setCoder(Proto2Coder.of(MyProto.Message.class));
- * }
- * </pre>
- *
- * <p>Custom message extensions are also supported, but the coder must be made
- * aware of them explicitly:
- *
- * <pre>
- * {@code
- * PCollection<MyProto.Message> records =
- * input.apply(...)
- * .setCoder(Proto2Coder.of(MyProto.Message.class)
- * .withExtensionsFrom(MyProto.class));
- * }
- * </pre>
- *
- * @param <T> the type of elements handled by this coder, must extend {@code Message}
- * @deprecated Use {@link ProtoCoder}.
- */
-@Deprecated
-public class Proto2Coder<T extends Message> extends AtomicCoder<T> {
-
- /** The class of Protobuf message to be encoded. */
- private final Class<T> protoMessageClass;
-
- /**
- * All extension host classes included in this Proto2Coder; their extensions are included in
- * the {@link ExtensionRegistry} used during encoding and decoding.
- */
- private final List<Class<?>> extensionHostClasses;
-
- private Proto2Coder(Class<T> protoMessageClass, List<Class<?>> extensionHostClasses) {
- this.protoMessageClass = protoMessageClass;
- // Mutable copy: addExtensionsFrom() appends to this list, but callers pass immutable ones.
- this.extensionHostClasses = Lists.newArrayList(extensionHostClasses);
- }
-
- /** Provides a {@code Proto2Coder} for any type that is a subtype of {@link Message}. */
- private static final CoderProvider PROVIDER =
- new CoderProvider() {
- @Override
- public <T> Coder<T> getCoder(TypeDescriptor<T> type) throws CannotProvideCoderException {
- if (!type.isSubtypeOf(new TypeDescriptor<Message>() {})) {
- throw new CannotProvideCoderException(
- String.format(
- "Cannot provide Proto2Coder because %s "
- + "is not a subclass of protocol buffer Message",
- type));
- }
- // The subtype check above is what makes the two unchecked casts below safe.
- @SuppressWarnings("unchecked")
- TypeDescriptor<? extends Message> messageType = (TypeDescriptor<? extends Message>) type;
- @SuppressWarnings("unchecked")
- Coder<T> coder = (Coder<T>) Proto2Coder.of(messageType);
- return coder;
- }
- };
-
- public static CoderProvider coderProvider() {
- return PROVIDER;
- }
-
- /**
- * Returns a {@code Proto2Coder} for the given Protobuf message class.
- */
- public static <T extends Message> Proto2Coder<T> of(Class<T> protoMessageClass) {
- return new Proto2Coder<T>(protoMessageClass, Collections.<Class<?>>emptyList());
- }
-
- /**
- * Returns a {@code Proto2Coder} for the raw type of the given Protobuf message type descriptor.
- */
- public static <T extends Message> Proto2Coder<T> of(TypeDescriptor<T> protoMessageType) {
- @SuppressWarnings("unchecked")
- Class<T> protoMessageClass = (Class<T>) protoMessageType.getRawType();
- return of(protoMessageClass);
- }
-
- /**
- * Produces a {@code Proto2Coder} like this one, but with the extensions from
- * the given classes registered.
- *
- * @param moreExtensionHosts an iterable of classes that define a static
- * method {@code registerAllExtensions(ExtensionRegistry)}
- */
- public Proto2Coder<T> withExtensionsFrom(Iterable<Class<?>> moreExtensionHosts) {
- for (Class<?> extensionHost : moreExtensionHosts) {
- // Attempt to access the required method, to make sure it's present.
- try {
- Method registerAllExtensions =
- extensionHost.getDeclaredMethod("registerAllExtensions", ExtensionRegistry.class);
- checkArgument(
- Modifier.isStatic(registerAllExtensions.getModifiers()),
- "Method registerAllExtensions() must be static for use with Proto2Coder");
- } catch (NoSuchMethodException | SecurityException e) {
- throw new IllegalArgumentException(e);
- }
- }
-
- return new Proto2Coder<T>(
- protoMessageClass,
- new ImmutableList.Builder<Class<?>>()
- .addAll(extensionHostClasses)
- .addAll(moreExtensionHosts)
- .build());
- }
-
- /**
- * See {@link #withExtensionsFrom(Iterable)}.
- */
- public Proto2Coder<T> withExtensionsFrom(Class<?>... extensionHosts) {
- return withExtensionsFrom(ImmutableList.copyOf(extensionHosts));
- }
-
- /**
- * Adds custom Protobuf extensions to the coder. Returns {@code this}
- * for method chaining.
- *
- * @param extensionHosts each must be a class that defines a static
- * method named {@code registerAllExtensions}
- * @deprecated use {@link #withExtensionsFrom}
- */
- @Deprecated
- public Proto2Coder<T> addExtensionsFrom(Class<?>... extensionHosts) {
- return addExtensionsFrom(ImmutableList.copyOf(extensionHosts));
- }
-
- /**
- * Adds custom Protobuf extensions to the coder by appending to its internal host list in place,
- * which therefore must be mutable. Returns {@code this} for method chaining.
- *
- * @param extensionHosts each must be a class that defines a static
- * method named {@code registerAllExtensions}
- * @deprecated use {@link #withExtensionsFrom}
- */
- @Deprecated
- public Proto2Coder<T> addExtensionsFrom(Iterable<Class<?>> extensionHosts) {
- for (Class<?> extensionHost : extensionHosts) {
- try {
- // Attempt to access the declared method, to make sure it's present.
- extensionHost.getDeclaredMethod("registerAllExtensions", ExtensionRegistry.class);
- } catch (NoSuchMethodException e) {
- throw new IllegalArgumentException(e);
- }
- extensionHostClasses.add(extensionHost);
- }
- // The memoized extension registry needs to be recomputed because we have mutated this object.
- synchronized (this) {
- memoizedExtensionRegistry = null;
- getExtensionRegistry();
- }
- return this;
- }
-
- @Override
- public void encode(T value, OutputStream outStream, Context context) throws IOException {
- if (value == null) {
- throw new CoderException("cannot encode a null " + protoMessageClass.getSimpleName());
- }
- if (!context.isWholeStream) {
- value.writeDelimitedTo(outStream);
- return;
- }
- value.writeTo(outStream);
- }
-
- @Override
- public T decode(InputStream inStream, Context context) throws IOException {
- if (context.isWholeStream) {
- return getParser().parseFrom(inStream, getExtensionRegistry());
- } else {
- return getParser().parseDelimitedFrom(inStream, getExtensionRegistry());
- }
- }
-
- @Override
- public boolean equals(Object other) {
- if (this == other) {
- return true;
- }
- if (!(other instanceof Proto2Coder)) {
- return false;
- }
- Proto2Coder<?> that = (Proto2Coder<?>) other;
- return protoMessageClass.equals(that.protoMessageClass)
- && Sets.newHashSet(extensionHostClasses).equals(Sets.newHashSet(that.extensionHostClasses));
- }
-
- @Override
- public int hashCode() {
- // Hash the hosts as a set, matching equals(), which ignores host order and duplicates.
- return Objects.hash(protoMessageClass, Sets.newHashSet(extensionHostClasses));
- }
-
- /**
- * The encoding identifier is designed to support evolution as per the design of Protocol
- * Buffers. In order to use this class effectively, carefully follow the advice in the Protocol
- * Buffers documentation at
- * <a href="https://developers.google.com/protocol-buffers/docs/proto#updating">Updating
- * A Message Type</a>.
- *
- * <p>In particular, the encoding identifier is guaranteed to be the same for {@code Proto2Coder}
- * instances of the same principal message class, and otherwise distinct. Loaded extensions do not
- * affect the id, nor does it encode the full schema.
- *
- * <p>When modifying a message class, here are the broadest guidelines; see the above link
- * for greater detail.
- *
- * <ul>
- * <li>Do not change the numeric tags for any fields.
- * <li>Never remove a <code>required</code> field.
- * <li>Only add <code>optional</code> or <code>repeated</code> fields, with sensible defaults.
- * <li>When changing the type of a field, consult the Protocol Buffers documentation to ensure
- * the new and old types are interchangeable.
- * </ul>
- *
- * <p>Code consuming this message class should be prepared to support <i>all</i> versions of
- * the class until it is certain that no remaining serialized instances exist.
- *
- * <p>If backwards incompatible changes must be made, the best recourse is to change the name
- * of your Protocol Buffers message class.
- */
- @Override
- public String getEncodingId() {
- return protoMessageClass.getName();
- }
-
- private transient Parser<T> memoizedParser;
-
- private Parser<T> getParser() {
- if (memoizedParser == null) {
- try {
- @SuppressWarnings("unchecked")
- T protoMessageInstance = (T) protoMessageClass.getMethod("getDefaultInstance").invoke(null);
- @SuppressWarnings("unchecked")
- Parser<T> tParser = (Parser<T>) protoMessageInstance.getParserForType();
- memoizedParser = tParser;
- } catch (IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
- throw new IllegalArgumentException(e);
- }
- }
- return memoizedParser;
- }
-
- private transient ExtensionRegistry memoizedExtensionRegistry;
-
- private synchronized ExtensionRegistry getExtensionRegistry() {
- if (memoizedExtensionRegistry == null) {
- ExtensionRegistry registry = ExtensionRegistry.newInstance();
- for (Class<?> extensionHost : extensionHostClasses) {
- try {
- extensionHost
- .getDeclaredMethod("registerAllExtensions", ExtensionRegistry.class)
- .invoke(null, registry);
- } catch (IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
- throw new IllegalStateException(e);
- }
- }
- memoizedExtensionRegistry = registry.getUnmodifiable();
- }
- return memoizedExtensionRegistry;
- }
-
- ////////////////////////////////////////////////////////////////////////////////////
- // JSON Serialization details below
-
- private static final String PROTO_MESSAGE_CLASS = "proto_message_class";
- private static final String PROTO_EXTENSION_HOSTS = "proto_extension_hosts";
-
- /**
- * Constructor for JSON deserialization only.
- */
- @JsonCreator
- public static <T extends Message> Proto2Coder<T> of(
- @JsonProperty(PROTO_MESSAGE_CLASS) String protoMessageClassName,
- @Nullable @JsonProperty(PROTO_EXTENSION_HOSTS) List<String> extensionHostClassNames) {
-
- try {
- @SuppressWarnings("unchecked")
- Class<T> protoMessageClass = (Class<T>) Class.forName(protoMessageClassName);
- List<Class<?>> extensionHostClasses = Lists.newArrayList();
- if (extensionHostClassNames != null) {
- for (String extensionHostClassName : extensionHostClassNames) {
- extensionHostClasses.add(Class.forName(extensionHostClassName));
- }
- }
- return of(protoMessageClass).withExtensionsFrom(extensionHostClasses);
- } catch (ClassNotFoundException e) {
- throw new IllegalArgumentException(e);
- }
- }
-
- @Override
- public CloudObject asCloudObject() {
- CloudObject result = super.asCloudObject();
- Structs.addString(result, PROTO_MESSAGE_CLASS, protoMessageClass.getName());
- List<CloudObject> extensionHostClassNames = Lists.newArrayList();
- for (Class<?> clazz : extensionHostClasses) {
- extensionHostClassNames.add(CloudObject.forString(clazz.getName()));
- }
- Structs.addList(result, PROTO_EXTENSION_HOSTS, extensionHostClassNames);
- return result;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/SerializableCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/SerializableCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/SerializableCoder.java
deleted file mode 100644
index 593c9f0..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/SerializableCoder.java
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.ObjectInputStream;
-import java.io.ObjectOutputStream;
-import java.io.ObjectStreamClass;
-import java.io.OutputStream;
-import java.io.Serializable;
-
-/**
- * A {@link Coder} for Java classes that implement {@link Serializable}.
- *
- * <p>To use, specify the coder type on a PCollection:
- * <pre>
- * {@code
- * PCollection<MyRecord> records =
- * foo.apply(...).setCoder(SerializableCoder.of(MyRecord.class));
- * }
- * </pre>
- *
- * <p>{@link SerializableCoder} does not guarantee a deterministic encoding, as Java
- * serialization may produce different binary encodings for two equivalent
- * objects.
- *
- * @param <T> the type of elements handled by this coder
- */
-public class SerializableCoder<T extends Serializable> extends AtomicCoder<T> {
-
- /**
- * Returns a {@link SerializableCoder} instance for the provided element type.
- * @param <T> the element type
- */
- public static <T extends Serializable> SerializableCoder<T> of(TypeDescriptor<T> type) {
- @SuppressWarnings("unchecked")
- Class<T> clazz = (Class<T>) type.getRawType();
- return of(clazz);
- }
-
- /**
- * Returns a {@link SerializableCoder} instance for the provided element class.
- * @param <T> the element type
- */
- public static <T extends Serializable> SerializableCoder<T> of(Class<T> clazz) {
- return new SerializableCoder<>(clazz);
- }
-
- @JsonCreator
- @SuppressWarnings("unchecked")
- public static SerializableCoder<?> of(@JsonProperty("type") String classType)
- throws ClassNotFoundException {
- Class<?> clazz = Class.forName(classType);
- if (!Serializable.class.isAssignableFrom(clazz)) {
- throw new ClassNotFoundException(
- "Class " + classType + " does not implement Serializable");
- }
- return of((Class<? extends Serializable>) clazz);
- }
-
- /**
- * A {@link CoderProvider} that constructs a {@link SerializableCoder}
- * for any class that implements serializable.
- */
- public static final CoderProvider PROVIDER = new CoderProvider() {
- @Override
- public <T> Coder<T> getCoder(TypeDescriptor<T> typeDescriptor)
- throws CannotProvideCoderException {
- Class<?> clazz = typeDescriptor.getRawType();
- if (Serializable.class.isAssignableFrom(clazz)) {
- @SuppressWarnings("unchecked")
- Class<? extends Serializable> serializableClazz =
- (Class<? extends Serializable>) clazz;
- @SuppressWarnings("unchecked")
- Coder<T> coder = (Coder<T>) SerializableCoder.of(serializableClazz);
- return coder;
- } else {
- throw new CannotProvideCoderException(
- "Cannot provide SerializableCoder because " + typeDescriptor
- + " does not implement Serializable");
- }
- }
- };
-
-
- private final Class<T> type;
-
- protected SerializableCoder(Class<T> type) {
- this.type = type;
- }
-
- public Class<T> getRecordType() {
- return type;
- }
-
- @Override
- public void encode(T value, OutputStream outStream, Context context)
- throws IOException, CoderException {
- try {
- ObjectOutputStream oos = new ObjectOutputStream(outStream);
- oos.writeObject(value);
- oos.flush();
- } catch (IOException exn) {
- throw new CoderException("unable to serialize record " + value, exn);
- }
- }
-
- @Override
- public T decode(InputStream inStream, Context context)
- throws IOException, CoderException {
- try {
- ObjectInputStream ois = new ObjectInputStream(inStream);
- return type.cast(ois.readObject());
- } catch (ClassNotFoundException e) {
- throw new CoderException("unable to deserialize record", e);
- }
- }
-
- @Override
- public String getEncodingId() {
- return String.format("%s:%s",
- type.getName(),
- ObjectStreamClass.lookup(type).getSerialVersionUID());
- }
-
- @Override
- public CloudObject asCloudObject() {
- CloudObject result = super.asCloudObject();
- result.put("type", type.getName());
- return result;
- }
-
- /**
- * {@inheritDoc}
- *
- * @throws NonDeterministicException always. Java serialization is not
- * deterministic with respect to {@link Object#equals} for all types.
- */
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- throw new NonDeterministicException(this,
- "Java Serialization may be non-deterministic.");
- }
-
- @Override
- public boolean equals(Object other) {
- // The null check keeps equals(null) false, as Object.equals requires, instead of throwing.
- return other != null
- && getClass() == other.getClass()
- && type == ((SerializableCoder<?>) other).type;
- }
-
- @Override
- public int hashCode() {
- return type.hashCode();
- }
-
- // This coder inherits isRegisterByteSizeObserverCheap,
- // getEncodedElementByteSize and registerByteSizeObserver
- // from StandardCoder. Looks like we cannot do much better
- // in this case.
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/SetCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/SetCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/SetCoder.java
deleted file mode 100644
index 36b3606..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/SetCoder.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.common.base.Preconditions;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-/**
- * A {@link SetCoder} encodes any {@link Set} using the format of {@link IterableLikeCoder}. The
- * elements may not be in a deterministic order, depending on the {@code Set} implementation.
- *
- * @param <T> the type of the elements of the set
- */
-public class SetCoder<T> extends IterableLikeCoder<T, Set<T>> {
-
- /**
- * Produces a {@link SetCoder} with the given {@code elementCoder}.
- */
- public static <T> SetCoder<T> of(Coder<T> elementCoder) {
- return new SetCoder<>(elementCoder);
- }
-
- /**
- * Dynamically typed constructor for JSON deserialization.
- */
- @JsonCreator
- public static SetCoder<?> of(
- @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
- List<Object> components) {
- Preconditions.checkArgument(components.size() == 1,
- "Expecting 1 component, got " + components.size());
- return of((Coder<?>) components.get(0));
- }
-
- /**
- * {@inheritDoc}
- *
- * @throws NonDeterministicException always. Sets are not ordered, but
- * they are encoded in the order of an arbitrary iteration.
- */
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- throw new NonDeterministicException(this,
- "Ordering of elements in a set may be non-deterministic.");
- }
-
- /**
- * Returns the first element in this set if it is non-empty,
- * otherwise returns {@code null}.
- */
- public static <T> List<Object> getInstanceComponents(
- Set<T> exampleValue) {
- return getInstanceComponentsHelper(exampleValue);
- }
-
- /////////////////////////////////////////////////////////////////////////////
- // Internal operations below here.
-
- /**
- * {@inheritDoc}
- *
- * @return A new {@link Set} built from the elements in the {@link List} decoded by
- * {@link IterableLikeCoder}.
- */
- @Override
- protected final Set<T> decodeToIterable(List<T> decodedElements) {
- return new HashSet<>(decodedElements);
- }
-
- protected SetCoder(Coder<T> elemCoder) {
- super(elemCoder, "Set");
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/StandardCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/StandardCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/StandardCoder.java
deleted file mode 100644
index faa9861..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/StandardCoder.java
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import static com.google.cloud.dataflow.sdk.util.Structs.addList;
-import static com.google.cloud.dataflow.sdk.util.Structs.addString;
-import static com.google.cloud.dataflow.sdk.util.Structs.addStringList;
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObserver;
-import com.google.common.collect.Lists;
-import com.google.common.io.ByteStreams;
-import com.google.common.io.CountingOutputStream;
-
-import java.io.ByteArrayOutputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-
-/**
- * An abstract base class to implement a {@link Coder} that defines equality, hashing, and printing
- * via the class name and recursively using {@link #getComponents}.
- *
- * <p>To extend {@link StandardCoder}, override the following methods as appropriate:
- *
- * <ul>
- * <li>{@link #getComponents}: the default implementation returns {@link #getCoderArguments}.</li>
- * <li>{@link #getEncodedElementByteSize} and
- * {@link #isRegisterByteSizeObserverCheap}: the
- * default implementation encodes values to bytes and counts the bytes, which is considered
- * expensive.</li>
- * <li>{@link #getEncodingId} and {@link #getAllowedEncodings}: by default, the encoding id
- * is the empty string, so only the canonical name of the subclass will be used for
- * compatibility checks, and no other encoding ids are allowed.</li>
- * </ul>
- */
-public abstract class StandardCoder<T> implements Coder<T> {
- protected StandardCoder() {}
-
- @Override
- public String getEncodingId() {
- return "";
- }
-
- @Override
- public Collection<String> getAllowedEncodings() {
- return Collections.emptyList();
- }
-
- /**
- * Returns the list of {@link Coder Coders} that are components of this {@link Coder}: the
- * result of {@link #getCoderArguments}, or an empty list when that result is {@code null}.
- */
- public List<? extends Coder<?>> getComponents() {
- List<? extends Coder<?>> coderArguments = getCoderArguments();
- if (coderArguments != null) {
- return coderArguments;
- }
- return Collections.emptyList();
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true} if the two {@link StandardCoder} instances have the
- * same class and equal components.
- */
- @Override
- public boolean equals(Object o) {
- if (o == null || this.getClass() != o.getClass()) {
- return false;
- }
- StandardCoder<?> that = (StandardCoder<?>) o;
- return this.getComponents().equals(that.getComponents());
- }
-
- @Override
- public int hashCode() {
- return getClass().hashCode() * 31 + getComponents().hashCode();
- }
-
- /**
- * Returns the class name without its package, followed by the parenthesized, comma-separated
- * component coders when there are any.
- */
- @Override
- public String toString() {
- String className = getClass().getName();
- String simpleName = className.substring(className.lastIndexOf('.') + 1);
- List<? extends Coder<?>> componentCoders = getComponents();
- if (componentCoders.isEmpty()) {
- return simpleName;
- }
- StringBuilder rendered = new StringBuilder(simpleName).append("(");
- String separator = "";
- for (Coder<?> componentCoder : componentCoders) {
- rendered.append(separator).append(componentCoder.toString());
- separator = ", ";
- }
- return rendered.append(")").toString();
- }
-
- @Override
- public CloudObject asCloudObject() {
- CloudObject result = CloudObject.forClass(getClass());
-
- List<? extends Coder<?>> components = getComponents();
- if (!components.isEmpty()) {
- List<CloudObject> cloudComponents = new ArrayList<>(components.size());
- for (Coder<?> coder : components) {
- cloudComponents.add(coder.asCloudObject());
- }
- addList(result, PropertyNames.COMPONENT_ENCODINGS, cloudComponents);
- }
-
- String encodingId = getEncodingId();
- checkNotNull(encodingId, "Coder.getEncodingId() must not return null.");
- if (!encodingId.isEmpty()) {
- addString(result, PropertyNames.ENCODING_ID, encodingId);
- }
-
- Collection<String> allowedEncodings = getAllowedEncodings();
- if (!allowedEncodings.isEmpty()) {
- addStringList(result, PropertyNames.ALLOWED_ENCODINGS, Lists.newArrayList(allowedEncodings));
- }
-
- return result;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code false} unless it is overridden. {@link StandardCoder#registerByteSizeObserver}
- * invokes {@link #getEncodedElementByteSize} which requires re-encoding an element
- * unless it is overridden. This is considered expensive.
- */
- @Override
- public boolean isRegisterByteSizeObserverCheap(T value, Context context) {
- return false;
- }
-
- /**
- * Returns the size in bytes of the encoded value using this coder.
- */
- protected long getEncodedElementByteSize(T value, Context context)
- throws Exception {
- try {
- CountingOutputStream os = new CountingOutputStream(ByteStreams.nullOutputStream());
- encode(value, os, context);
- return os.getCount();
- } catch (Exception exn) {
- throw new IllegalArgumentException(
- "Unable to encode element '" + value + "' with coder '" + this + "'.", exn);
- }
- }
-
- /**
- * {@inheritDoc}
- *
- * <p>For {@link StandardCoder} subclasses, this notifies {@code observer} about the byte size
- * of the encoded value using this coder as returned by {@link #getEncodedElementByteSize}.
- */
- @Override
- public void registerByteSizeObserver(
- T value, ElementByteSizeObserver observer, Context context)
- throws Exception {
- observer.update(getEncodedElementByteSize(value, context));
- }
-
- protected void verifyDeterministic(String message, Iterable<Coder<?>> coders)
- throws NonDeterministicException {
- for (Coder<?> coder : coders) {
- try {
- coder.verifyDeterministic();
- } catch (NonDeterministicException e) {
- throw new NonDeterministicException(this, message, e);
- }
- }
- }
-
- protected void verifyDeterministic(String message, Coder<?>... coders)
- throws NonDeterministicException {
- verifyDeterministic(message, Arrays.asList(coders));
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code false} for {@link StandardCoder} unless overridden.
- */
- @Override
- public boolean consistentWithEquals() {
- return false;
- }
-
- @Override
- public Object structuralValue(T value) throws Exception {
- if (value != null && consistentWithEquals()) {
- return value;
- }
- // Otherwise use the bytes of the outer-context encoding as the structural value.
- try {
- ByteArrayOutputStream encoded = new ByteArrayOutputStream();
- encode(value, encoded, Context.OUTER);
- return new StructuralByteArray(encoded.toByteArray());
- } catch (Exception exn) {
- throw new IllegalArgumentException(
- "Unable to encode element '" + value + "' with coder '" + this + "'.", exn);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/StringDelegateCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/StringDelegateCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/StringDelegateCoder.java
deleted file mode 100644
index 1fc1247..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/StringDelegateCoder.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.coders.protobuf.ProtoCoder;
-
-import java.lang.reflect.InvocationTargetException;
-
-/**
- * A {@link Coder} that wraps a {@code Coder<String>}
- * and encodes/decodes values via string representations.
- *
- * <p>To decode, the input byte stream is decoded to
- * a {@link String}, and this is passed to the single-argument
- * constructor for {@code T}.
- *
- * <p>To encode, the input value is converted via {@code toString()},
- * and this string is encoded.
- *
- * <p>In order for this to operate correctly for a class {@code Clazz},
- * it must be the case for any instance {@code x} that
- * {@code x.equals(new Clazz(x.toString()))}.
- *
- * <p>This method of encoding is not designed for ease of evolution of {@code Clazz};
- * it should only be used in cases where the class is stable or the encoding is not
- * important. If evolution of the class is important, see {@link ProtoCoder}, {@link AvroCoder},
- * or {@link JAXBCoder}.
- *
- * @param <T> The type of objects coded.
- */
-public class StringDelegateCoder<T> extends DelegateCoder<T, String> {
- public static <T> StringDelegateCoder<T> of(Class<T> clazz) {
- return new StringDelegateCoder<T>(clazz);
- }
-
- @Override
- public String toString() {
- return "StringDelegateCoder(" + clazz + ")";
- }
-
- private final Class<T> clazz;
-
- protected StringDelegateCoder(final Class<T> clazz) {
- super(StringUtf8Coder.of(),
- new CodingFunction<T, String>() {
- @Override
- public String apply(T input) {
- return input.toString();
- }
- },
- new CodingFunction<String, T>() {
- @Override
- public T apply(String input) throws
- NoSuchMethodException,
- InstantiationException,
- IllegalAccessException,
- InvocationTargetException {
- return clazz.getConstructor(String.class).newInstance(input);
- }
- });
-
- this.clazz = clazz;
- }
-
- /**
- * The encoding id is the fully qualified name of the encoded/decoded class.
- */
- @Override
- public String getEncodingId() {
- return clazz.getName();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/StringUtf8Coder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/StringUtf8Coder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/StringUtf8Coder.java
deleted file mode 100644
index 179840c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/StringUtf8Coder.java
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.util.ExposedByteArrayOutputStream;
-import com.google.cloud.dataflow.sdk.util.StreamUtils;
-import com.google.cloud.dataflow.sdk.util.VarInt;
-import com.google.common.base.Utf8;
-import com.google.common.io.ByteStreams;
-import com.google.common.io.CountingOutputStream;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.UTFDataFormatException;
-import java.nio.charset.StandardCharsets;
-
-/**
- * A {@link Coder} that encodes {@link String Strings} in UTF-8 encoding.
- * If in a nested context, prefixes the string with an integer length field,
- * encoded via a {@link VarIntCoder}.
- */
-public class StringUtf8Coder extends AtomicCoder<String> {
-
- @JsonCreator
- public static StringUtf8Coder of() {
- return INSTANCE;
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private static final StringUtf8Coder INSTANCE = new StringUtf8Coder();
-
- private static void writeString(String value, DataOutputStream dos)
- throws IOException {
- byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
- VarInt.encode(bytes.length, dos);
- dos.write(bytes);
- }
-
- private static String readString(DataInputStream dis) throws IOException {
- int len = VarInt.decodeInt(dis);
- if (len < 0) {
- throw new CoderException("Invalid encoded string length: " + len);
- }
- byte[] bytes = new byte[len];
- dis.readFully(bytes);
- return new String(bytes, StandardCharsets.UTF_8);
- }
-
- private StringUtf8Coder() {}
-
- @Override
- public void encode(String value, OutputStream outStream, Context context)
- throws IOException {
- if (value == null) {
- throw new CoderException("cannot encode a null String");
- }
- if (context.isWholeStream) {
- byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
- if (outStream instanceof ExposedByteArrayOutputStream) {
- ((ExposedByteArrayOutputStream) outStream).writeAndOwn(bytes);
- } else {
- outStream.write(bytes);
- }
- } else {
- writeString(value, new DataOutputStream(outStream));
- }
- }
-
- @Override
- public String decode(InputStream inStream, Context context)
- throws IOException {
- if (context.isWholeStream) {
- byte[] bytes = StreamUtils.getBytes(inStream);
- return new String(bytes, StandardCharsets.UTF_8);
- } else {
- try {
- return readString(new DataInputStream(inStream));
- } catch (EOFException | UTFDataFormatException exn) {
- // These exceptions correspond to decoding problems, so change
- // what kind of exception they're branded as.
- throw new CoderException(exn);
- }
- }
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. This coder is injective.
- */
- @Override
- public boolean consistentWithEquals() {
- return true;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return the byte size of the UTF-8 encoding of the a string or, in a nested context,
- * the byte size of the encoding plus the encoded length prefix.
- */
- @Override
- protected long getEncodedElementByteSize(String value, Context context)
- throws Exception {
- if (value == null) {
- throw new CoderException("cannot encode a null String");
- }
- if (context.isWholeStream) {
- return Utf8.encodedLength(value);
- } else {
- CountingOutputStream countingStream =
- new CountingOutputStream(ByteStreams.nullOutputStream());
- DataOutputStream stream = new DataOutputStream(countingStream);
- writeString(value, stream);
- return countingStream.getCount();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/StructuralByteArray.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/StructuralByteArray.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/StructuralByteArray.java
deleted file mode 100644
index ea18eb9..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/StructuralByteArray.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.coders;
-
-import static com.google.api.client.util.Base64.encodeBase64String;
-
-import java.util.Arrays;
-
-/**
- * A wrapper around a byte[] that uses structural, value-based
- * equality rather than byte[]'s normal object identity.
- */
-public class StructuralByteArray {
- byte[] value;
-
- public StructuralByteArray(byte[] value) {
- this.value = value;
- }
-
- public byte[] getValue() {
- return value;
- }
-
- @Override
- public boolean equals(Object o) {
- if (o instanceof StructuralByteArray) {
- StructuralByteArray that = (StructuralByteArray) o;
- return Arrays.equals(this.value, that.value);
- } else {
- return false;
- }
- }
-
- @Override
- public int hashCode() {
- return Arrays.hashCode(value);
- }
-
- @Override
- public String toString() {
- return "base64:" + encodeBase64String(value);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/TableRowJsonCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/TableRowJsonCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/TableRowJsonCoder.java
deleted file mode 100644
index bed88b0..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/TableRowJsonCoder.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.api.services.bigquery.model.TableRow;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.SerializationFeature;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- * A {@link Coder} that encodes BigQuery {@link TableRow} objects in their native JSON format.
- */
-public class TableRowJsonCoder extends AtomicCoder<TableRow> {
-
- @JsonCreator
- public static TableRowJsonCoder of() {
- return INSTANCE;
- }
-
- @Override
- public void encode(TableRow value, OutputStream outStream, Context context)
- throws IOException {
- String strValue = MAPPER.writeValueAsString(value);
- StringUtf8Coder.of().encode(strValue, outStream, context);
- }
-
- @Override
- public TableRow decode(InputStream inStream, Context context)
- throws IOException {
- String strValue = StringUtf8Coder.of().decode(inStream, context);
- return MAPPER.readValue(strValue, TableRow.class);
- }
-
- @Override
- protected long getEncodedElementByteSize(TableRow value, Context context)
- throws Exception {
- String strValue = MAPPER.writeValueAsString(value);
- return StringUtf8Coder.of().getEncodedElementByteSize(strValue, context);
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- // FAIL_ON_EMPTY_BEANS is disabled in order to handle null values in
- // TableRow.
- private static final ObjectMapper MAPPER =
- new ObjectMapper().disable(SerializationFeature.FAIL_ON_EMPTY_BEANS);
-
- private static final TableRowJsonCoder INSTANCE = new TableRowJsonCoder();
-
- private TableRowJsonCoder() { }
-
- /**
- * {@inheritDoc}
- *
- * @throws NonDeterministicException always. A {@link TableRow} can hold arbitrary
- * {@link Object} instances, which makes the encoding non-deterministic.
- */
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- throw new NonDeterministicException(this,
- "TableCell can hold arbitrary instances, which may be non-deterministic.");
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/TextualIntegerCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/TextualIntegerCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/TextualIntegerCoder.java
deleted file mode 100644
index 9250c68..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/TextualIntegerCoder.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- * A {@link Coder} that encodes {@code Integer Integers} as the ASCII bytes of
- * their textual, decimal, representation.
- */
-public class TextualIntegerCoder extends AtomicCoder<Integer> {
-
- @JsonCreator
- public static TextualIntegerCoder of() {
- return new TextualIntegerCoder();
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- protected TextualIntegerCoder() {}
-
- @Override
- public void encode(Integer value, OutputStream outStream, Context context)
- throws IOException, CoderException {
- if (value == null) {
- throw new CoderException("cannot encode a null Integer");
- }
- String textualValue = value.toString();
- StringUtf8Coder.of().encode(textualValue, outStream, context);
- }
-
- @Override
- public Integer decode(InputStream inStream, Context context)
- throws IOException, CoderException {
- String textualValue = StringUtf8Coder.of().decode(inStream, context);
- try {
- return Integer.valueOf(textualValue);
- } catch (NumberFormatException exn) {
- throw new CoderException("error when decoding a textual integer", exn);
- }
- }
-
- @Override
- protected long getEncodedElementByteSize(Integer value, Context context) throws Exception {
- if (value == null) {
- throw new CoderException("cannot encode a null Integer");
- }
- String textualValue = value.toString();
- return StringUtf8Coder.of().getEncodedElementByteSize(textualValue, context);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/VarIntCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/VarIntCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/VarIntCoder.java
deleted file mode 100644
index 18ec250..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/VarIntCoder.java
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.util.VarInt;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.UTFDataFormatException;
-
-/**
- * A {@link Coder} that encodes {@link Integer Integers} using between 1 and 5 bytes. Negative
- * numbers always take 5 bytes, so {@link BigEndianIntegerCoder} may be preferable for
- * integers that are known to often be large or negative.
- */
-public class VarIntCoder extends AtomicCoder<Integer> {
-
- @JsonCreator
- public static VarIntCoder of() {
- return INSTANCE;
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private static final VarIntCoder INSTANCE =
- new VarIntCoder();
-
- private VarIntCoder() {}
-
- @Override
- public void encode(Integer value, OutputStream outStream, Context context)
- throws IOException, CoderException {
- if (value == null) {
- throw new CoderException("cannot encode a null Integer");
- }
- VarInt.encode(value.intValue(), outStream);
- }
-
- @Override
- public Integer decode(InputStream inStream, Context context)
- throws IOException, CoderException {
- try {
- return VarInt.decodeInt(inStream);
- } catch (EOFException | UTFDataFormatException exn) {
- // These exceptions correspond to decoding problems, so change
- // what kind of exception they're branded as.
- throw new CoderException(exn);
- }
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. {@link VarIntCoder} is injective.
- */
- @Override
- public boolean consistentWithEquals() {
- return true;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. {@link #getEncodedElementByteSize} is cheap.
- */
- @Override
- public boolean isRegisterByteSizeObserverCheap(Integer value, Context context) {
- return true;
- }
-
- @Override
- protected long getEncodedElementByteSize(Integer value, Context context)
- throws Exception {
- if (value == null) {
- throw new CoderException("cannot encode a null Integer");
- }
- return VarInt.getLength(value.longValue());
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/VarLongCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/VarLongCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/VarLongCoder.java
deleted file mode 100644
index 520245e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/VarLongCoder.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.util.VarInt;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.UTFDataFormatException;
-
-/**
- * A {@link Coder} that encodes {@link Long Longs} using between 1 and 10 bytes. Negative
- * numbers always take 10 bytes, so {@link BigEndianLongCoder} may be preferable for
- * longs that are known to often be large or negative.
- */
-public class VarLongCoder extends AtomicCoder<Long> {
-
- @JsonCreator
- public static VarLongCoder of() {
- return INSTANCE;
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private static final VarLongCoder INSTANCE = new VarLongCoder();
-
- private VarLongCoder() {}
-
- @Override
- public void encode(Long value, OutputStream outStream, Context context)
- throws IOException, CoderException {
- if (value == null) {
- throw new CoderException("cannot encode a null Long");
- }
- VarInt.encode(value.longValue(), outStream);
- }
-
- @Override
- public Long decode(InputStream inStream, Context context)
- throws IOException, CoderException {
- try {
- return VarInt.decodeLong(inStream);
- } catch (EOFException | UTFDataFormatException exn) {
- // These exceptions correspond to decoding problems, so change
- // what kind of exception they're branded as.
- throw new CoderException(exn);
- }
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. {@link VarLongCoder} is injective.
- */
- @Override
- public boolean consistentWithEquals() {
- return true;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. {@link #getEncodedElementByteSize} is cheap.
- */
- @Override
- public boolean isRegisterByteSizeObserverCheap(Long value, Context context) {
- return true;
- }
-
- @Override
- protected long getEncodedElementByteSize(Long value, Context context)
- throws Exception {
- if (value == null) {
- throw new CoderException("cannot encode a null Long");
- }
- return VarInt.getLength(value.longValue());
- }
-}
[41/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/Read.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/Read.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/Read.java
deleted file mode 100644
index cde8769..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/Read.java
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import static com.google.cloud.dataflow.sdk.util.StringUtils.approximateSimpleName;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.util.SerializableUtils;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded;
-import com.google.cloud.dataflow.sdk.values.PInput;
-
-import org.joda.time.Duration;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import javax.annotation.Nullable;
-
-/**
- * A {@link PTransform} for reading from a {@link Source}.
- *
- * <p>Usage example:
- * <pre>
- * Pipeline p = Pipeline.create();
- * p.apply(Read.from(new MySource().withFoo("foo").withBar("bar"))
- * .named("foobar"));
- * </pre>
- */
-public class Read {
- /**
- * Returns a new {@code Read} {@code PTransform} builder with the given name.
- */
- public static Builder named(String name) {
- return new Builder(name);
- }
-
- /**
- * Returns a new {@code Read.Bounded} {@code PTransform} reading from the given
- * {@code BoundedSource}.
- */
- public static <T> Bounded<T> from(BoundedSource<T> source) {
- return new Bounded<>(null, source);
- }
-
- /**
- * Returns a new {@code Read.Unbounded} {@code PTransform} reading from the given
- * {@code UnboundedSource}.
- */
- public static <T> Unbounded<T> from(UnboundedSource<T, ?> source) {
- return new Unbounded<>(null, source);
- }
-
- /**
- * Helper class for building {@code Read} transforms.
- */
- public static class Builder {
- private final String name;
-
- private Builder(String name) {
- this.name = name;
- }
-
- /**
- * Returns a new {@code Read.Bounded} {@code PTransform} reading from the given
- * {@code BoundedSource}.
- */
- public <T> Bounded<T> from(BoundedSource<T> source) {
- return new Bounded<>(name, source);
- }
-
- /**
- * Returns a new {@code Read.Unbounded} {@code PTransform} reading from the given
- * {@code UnboundedSource}.
- */
- public <T> Unbounded<T> from(UnboundedSource<T, ?> source) {
- return new Unbounded<>(name, source);
- }
- }
-
- /**
- * {@link PTransform} that reads from a {@link BoundedSource}.
- */
- public static class Bounded<T> extends PTransform<PInput, PCollection<T>> {
- private final BoundedSource<T> source;
-
- private Bounded(@Nullable String name, BoundedSource<T> source) {
- super(name);
- this.source = SerializableUtils.ensureSerializable(source);
- }
-
- /**
- * Returns a new {@code Bounded} {@code PTransform} that's like this one but
- * has the given name.
- *
- * <p>Does not modify this object.
- */
- public Bounded<T> named(String name) {
- return new Bounded<T>(name, source);
- }
-
- @Override
- protected Coder<T> getDefaultOutputCoder() {
- return source.getDefaultOutputCoder();
- }
-
- @Override
- public final PCollection<T> apply(PInput input) {
- source.validate();
-
- return PCollection.<T>createPrimitiveOutputInternal(input.getPipeline(),
- WindowingStrategy.globalDefault(), IsBounded.BOUNDED)
- .setCoder(getDefaultOutputCoder());
- }
-
- /**
- * Returns the {@code BoundedSource} used to create this {@code Read} {@code PTransform}.
- */
- public BoundedSource<T> getSource() {
- return source;
- }
-
- @Override
- public String getKindString() {
- return "Read(" + approximateSimpleName(source.getClass()) + ")";
- }
-
- static {
- registerDefaultTransformEvaluator();
- }
-
- @SuppressWarnings({"rawtypes", "unchecked"})
- private static void registerDefaultTransformEvaluator() {
- DirectPipelineRunner.registerDefaultTransformEvaluator(
- Bounded.class,
- new DirectPipelineRunner.TransformEvaluator<Bounded>() {
- @Override
- public void evaluate(
- Bounded transform, DirectPipelineRunner.EvaluationContext context) {
- evaluateReadHelper(transform, context);
- }
-
- private <T> void evaluateReadHelper(
- Read.Bounded<T> transform, DirectPipelineRunner.EvaluationContext context) {
- try {
- List<DirectPipelineRunner.ValueWithMetadata<T>> output = new ArrayList<>();
- BoundedSource<T> source = transform.getSource();
- try (BoundedSource.BoundedReader<T> reader =
- source.createReader(context.getPipelineOptions())) {
- for (boolean available = reader.start();
- available;
- available = reader.advance()) {
- output.add(
- DirectPipelineRunner.ValueWithMetadata.of(
- WindowedValue.timestampedValueInGlobalWindow(
- reader.getCurrent(), reader.getCurrentTimestamp())));
- }
- }
- context.setPCollectionValuesWithMetadata(context.getOutput(transform), output);
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- }
- });
- }
- }
-
- /**
- * {@link PTransform} that reads from a {@link UnboundedSource}.
- */
- public static class Unbounded<T> extends PTransform<PInput, PCollection<T>> {
- private final UnboundedSource<T, ?> source;
-
- private Unbounded(@Nullable String name, UnboundedSource<T, ?> source) {
- super(name);
- this.source = SerializableUtils.ensureSerializable(source);
- }
-
- /**
- * Returns a new {@code Unbounded} {@code PTransform} that's like this one but
- * has the given name.
- *
- * <p>Does not modify this object.
- */
- public Unbounded<T> named(String name) {
- return new Unbounded<T>(name, source);
- }
-
- /**
- * Returns a new {@link BoundedReadFromUnboundedSource} that reads a bounded amount
- * of data from the given {@link UnboundedSource}. The bound is specified as a number
- * of records to read.
- *
- * <p>This may take a long time to execute if the splits of this source are slow to read
- * records.
- */
- public BoundedReadFromUnboundedSource<T> withMaxNumRecords(long maxNumRecords) {
- return new BoundedReadFromUnboundedSource<T>(source, maxNumRecords, null);
- }
-
- /**
- * Returns a new {@link BoundedReadFromUnboundedSource} that reads a bounded amount
- * of data from the given {@link UnboundedSource}. The bound is specified as an amount
- * of time to read for. Each split of the source will read for this much time.
- */
- public BoundedReadFromUnboundedSource<T> withMaxReadTime(Duration maxReadTime) {
- return new BoundedReadFromUnboundedSource<T>(source, Long.MAX_VALUE, maxReadTime);
- }
-
- @Override
- protected Coder<T> getDefaultOutputCoder() {
- return source.getDefaultOutputCoder();
- }
-
- @Override
- public final PCollection<T> apply(PInput input) {
- source.validate();
-
- return PCollection.<T>createPrimitiveOutputInternal(
- input.getPipeline(), WindowingStrategy.globalDefault(), IsBounded.UNBOUNDED);
- }
-
- /**
- * Returns the {@code UnboundedSource} used to create this {@code Read} {@code PTransform}.
- */
- public UnboundedSource<T, ?> getSource() {
- return source;
- }
-
- @Override
- public String getKindString() {
- return "Read(" + approximateSimpleName(source.getClass()) + ")";
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/ShardNameTemplate.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/ShardNameTemplate.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/ShardNameTemplate.java
deleted file mode 100644
index 7270012..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/ShardNameTemplate.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-/**
- * Standard shard naming templates.
- *
- * <p>Shard naming templates are strings that may contain placeholders for
- * the shard number and shard count. When constructing a filename for a
- * particular shard number, the upper-case letters 'S' and 'N' are replaced
- * with the 0-padded shard number and shard count respectively.
- *
- * <p>Left-padding of the numbers enables lexicographical sorting of the
- * resulting filenames. If the shard number or count are too large for the
- * space provided in the template, then the result may no longer sort
- * lexicographically. For example, a shard template of "S-of-N", for 200
- * shards, will result in outputs named "0-of-200", ... "10-of-200",
- * "100-of-200", etc.
- *
- * <p>Shard numbers start with 0, so the last shard number is the shard count
- * minus one. For example, the template "-SSSSS-of-NNNNN" will be
- * instantiated as "-00000-of-01000" for the first shard (shard 0) of a
- * 1000-way sharded output.
- *
- * <p>A shard name template is typically provided along with a name prefix
- * and suffix, which allows constructing complex paths that have embedded
- * shard information. For example, outputs in the form
- * "gs://bucket/path-01-of-99.txt" could be constructed by providing the
- * individual components:
- *
- * <pre>{@code
- * pipeline.apply(
- * TextIO.Write.to("gs://bucket/path")
- * .withShardNameTemplate("-SS-of-NN")
- * .withSuffix(".txt"))
- * }</pre>
- *
- * <p>In the example above, you could make parts of the output configurable
- * by users without the user having to specify all components of the output
- * name.
- *
- * <p>If a shard name template does not contain any repeating 'S', then
- * the output shard count must be 1, as otherwise the same filename would be
- * generated for multiple shards.
- */
-public class ShardNameTemplate {
- /**
- * Shard name containing the index and max.
- *
- * <p>Eg: [prefix]-00000-of-00100[suffix] and
- * [prefix]-00001-of-00100[suffix]
- */
- public static final String INDEX_OF_MAX = "-SSSSS-of-NNNNN";
-
- /**
- * Shard is a file within a directory.
- *
- * <p>Eg: [prefix]/part-00000[suffix] and [prefix]/part-00001[suffix]
- */
- public static final String DIRECTORY_CONTAINER = "/part-SSSSS";
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/Sink.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/Sink.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/Sink.java
deleted file mode 100644
index a5649ce..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/Sink.java
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import java.io.Serializable;
-
-/**
- * A {@code Sink} represents a resource that can be written to using the {@link Write} transform.
- *
- * <p>A parallel write to a {@code Sink} consists of three phases:
- * <ol>
- * <li>A sequential <i>initialization</i> phase (e.g., creating a temporary output directory, etc.)
- * <li>A <i>parallel write</i> phase where workers write bundles of records
- * <li>A sequential <i>finalization</i> phase (e.g., committing the writes, merging output files,
- * etc.)
- * </ol>
- *
- * <p>The {@link Write} transform can be used in a Dataflow pipeline to perform this write.
- * Specifically, a Write transform can be applied to a {@link PCollection} {@code p} by:
- *
- * <p>{@code p.apply(Write.to(new MySink()));}
- *
- * <p>Implementing a {@link Sink} and the corresponding write operations requires extending three
- * abstract classes:
- *
- * <ul>
- * <li>{@link Sink}: an immutable logical description of the location/resource to write to.
- * Depending on the type of sink, it may contain fields such as the path to an output directory
- * on a filesystem, a database table name, etc. Implementors of {@link Sink} must
- * implement two methods: {@link Sink#validate} and {@link Sink#createWriteOperation}.
- * {@link Sink#validate Validate} is called by the Write transform at pipeline creation, and should
- * validate that the Sink can be written to. The createWriteOperation method is also called at
- * pipeline creation, and should return a WriteOperation object that defines how to write to the
- * Sink. Note that implementations of Sink must be serializable and Sinks must be immutable.
- *
- * <li>{@link WriteOperation}: The WriteOperation implements the <i>initialization</i> and
- * <i>finalization</i> phases of a write. Implementors of {@link WriteOperation} must implement
- * corresponding {@link WriteOperation#initialize} and {@link WriteOperation#finalize} methods. A
- * WriteOperation must also implement {@link WriteOperation#createWriter} that creates Writers,
- * {@link WriteOperation#getWriterResultCoder} that returns a {@link Coder} for the result of a
- * parallel write, and a {@link WriteOperation#getSink} that returns the Sink that the write
- * operation corresponds to. See below for more information about these methods and restrictions on
- * their implementation.
- *
- * <li>{@link Writer}: A Writer writes a bundle of records. Writer defines four methods:
- * {@link Writer#open}, which is called once at the start of writing a bundle; {@link Writer#write},
- * which writes a single record from the bundle; {@link Writer#close}, which is called once at the
- * end of writing a bundle; and {@link Writer#getWriteOperation}, which returns the write operation
- * that the writer belongs to.
- * </ul>
- *
- * <h2>WriteOperation</h2>
- * <p>{@link WriteOperation#initialize} and {@link WriteOperation#finalize} are conceptually called
- * once: at the beginning and end of a Write transform. However, implementors must ensure that these
- * methods are idempotent, as they may be called multiple times on different machines in the case of
- * failure/retry or for redundancy.
- *
- * <p>The finalize method of WriteOperation is passed an Iterable of a writer result type. This
- * writer result type should encode the result of a write and, in most cases, some encoding of the
- * unique bundle id.
- *
- * <p>All implementations of {@link WriteOperation} must be serializable.
- *
- * <p>WriteOperation may have mutable state. For instance, {@link WriteOperation#initialize} may
- * mutate the object state. These mutations will be visible in {@link WriteOperation#createWriter}
- * and {@link WriteOperation#finalize} because the object will be serialized after initialize and
- * deserialized before these calls. However, it is not serialized again after createWriter is
- * called, as createWriter will be called within workers to create Writers for the bundles that are
- * distributed to these workers. Therefore, createWriter should not mutate the WriteOperation
- * state (as these mutations will not be visible in finalize).
- *
- * <h2>Bundle Ids:</h2>
- * <p>In order to ensure fault-tolerance, a bundle may be executed multiple times (e.g., in the
- * event of failure/retry or for redundancy). However, exactly one of these executions will have its
- * result passed to the WriteOperation's finalize method. Each call to {@link Writer#open} is passed
- * a unique <i>bundle id</i> when it is called by the Write transform, so even redundant or retried
- * bundles will have a unique way of identifying their output.
- *
- * <p>The bundle id should be used to guarantee that a bundle's output is unique. This uniqueness
- * guarantee is important; if a bundle is to be output to a file, for example, the name of the file
- * must be unique to avoid conflicts with other Writers. The bundle id should be encoded in the
- * writer result returned by the Writer and subsequently used by the WriteOperation's finalize
- * method to identify the results of successful writes.
- *
- * <p>For example, consider the scenario where a Writer writes files containing serialized records
- * and the WriteOperation's finalization step is to merge or rename these output files. In this
- * case, a Writer may use its unique id to name its output file (to avoid conflicts) and return the
- * name of the file it wrote as its writer result. The WriteOperation will then receive an Iterable
- * of output file names that it can then merge or rename using some bundle naming scheme.
- *
- * <h2>Writer Results:</h2>
- * <p>{@link WriteOperation}s and {@link Writer}s must agree on a writer result type that will be
- * returned by a Writer after it writes a bundle. This type can be a client-defined object or an
- * existing type; {@link WriteOperation#getWriterResultCoder} should return a {@link Coder} for the
- * type.
- *
- * <p>A note about thread safety: Any use of static members or methods in Writer should be thread
- * safe, as different instances of Writer objects may be created in different threads on the same
- * worker.
- *
- * @param <T> the type that will be written to the Sink.
- */
-@Experimental(Experimental.Kind.SOURCE_SINK)
-public abstract class Sink<T> implements Serializable {
- /**
- * Ensures that the sink is valid and can be written to before the write operation begins. One
- * should use {@link com.google.common.base.Preconditions} to implement this method.
- */
- public abstract void validate(PipelineOptions options);
-
- /**
- * Returns an instance of a {@link WriteOperation} that can write to this Sink.
- */
- public abstract WriteOperation<T, ?> createWriteOperation(PipelineOptions options);
-
- /**
- * A {@link WriteOperation} defines the process of a parallel write of objects to a Sink.
- *
- * <p>The {@code WriteOperation} defines how to perform initialization and finalization of a
- * parallel write to a sink as well as how to create a {@link Sink.Writer} object that can write
- * a bundle to the sink.
- *
- * <p>Since operations in Dataflow may be run multiple times for redundancy or fault-tolerance,
- * the initialization and finalization defined by a WriteOperation <b>must be idempotent</b>.
- *
- * <p>{@code WriteOperation}s may be mutable; a {@code WriteOperation} is serialized after the
- * call to {@code initialize} method and deserialized before calls to
- * {@code createWriter} and {@code finalize}. However, it is not
- * reserialized after {@code createWriter}, so {@code createWriter} should not mutate the
- * state of the {@code WriteOperation}.
- *
- * <p>See {@link Sink} for more detailed documentation about the process of writing to a Sink.
- *
- * @param <T> The type of objects to write
- * @param <WriteT> The result of a per-bundle write
- */
- public abstract static class WriteOperation<T, WriteT> implements Serializable {
- /**
- * Performs initialization before writing to the sink. Called before writing begins.
- */
- public abstract void initialize(PipelineOptions options) throws Exception;
-
- /**
- * Given an Iterable of results from bundle writes, performs finalization after writing and
- * closes the sink. Called after all bundle writes are complete.
- *
- * <p>The results that are passed to finalize are those returned by bundles that completed
- * successfully. Although bundles may have been run multiple times (for fault-tolerance), only
- * one writer result will be passed to finalize for each bundle. An implementation of finalize
- * should perform clean up of any failed and successfully retried bundles. Note that these
- * failed bundles will not have their writer result passed to finalize, so finalize should be
- * capable of locating any temporary/partial output written by failed bundles.
- *
- * <p>A best practice is to make finalize atomic. If this is impossible given the semantics
- * of the sink, finalize should be idempotent, as it may be called multiple times in the case of
- * failure/retry or for redundancy.
- *
- * <p>Note that the iteration order of the writer results is not guaranteed to be consistent if
- * finalize is called multiple times.
- *
- * @param writerResults an Iterable of results from successful bundle writes.
- */
- public abstract void finalize(Iterable<WriteT> writerResults, PipelineOptions options)
- throws Exception;
-
- /**
- * Creates a new {@link Sink.Writer} to write a bundle of the input to the sink.
- *
- * <p>The bundle id that the writer will use to uniquely identify its output will be passed to
- * {@link Writer#open}.
- *
- * <p>Must not mutate the state of the WriteOperation.
- */
- public abstract Writer<T, WriteT> createWriter(PipelineOptions options) throws Exception;
-
- /**
- * Returns the Sink that this write operation writes to.
- */
- public abstract Sink<T> getSink();
-
- /**
- * Returns a coder for the writer result type. This default implementation returns {@code null}.
- */
- public Coder<WriteT> getWriterResultCoder() {
- return null;
- }
- }
-
- /**
- * A Writer writes a bundle of elements from a PCollection to a sink. {@link Writer#open} is
- * called before writing begins and {@link Writer#close} is called after all elements in the
- * bundle have been written. {@link Writer#write} writes an element to the sink.
- *
- * <p>Note that any access to static members or methods of a Writer must be thread-safe, as
- * multiple instances of a Writer may be instantiated in different threads on the same worker.
- *
- * <p>See {@link Sink} for more detailed documentation about the process of writing to a Sink.
- *
- * @param <T> The type of object to write
- * @param <WriteT> The writer results type (e.g., the bundle's output filename, as String)
- */
- public abstract static class Writer<T, WriteT> {
- /**
- * Performs bundle initialization. For example, creates a temporary file for writing or
- * initializes any state that will be used across calls to {@link Writer#write}.
- *
- * <p>The unique id that is given to open should be used to ensure that the writer's output does
- * not interfere with the output of other Writers, as a bundle may be executed many times for
- * fault tolerance. See {@link Sink} for more information about bundle ids.
- */
- public abstract void open(String uId) throws Exception;
-
- /**
- * Called for each value in the bundle.
- */
- public abstract void write(T value) throws Exception;
-
- /**
- * Finishes writing the bundle. Closes any resources used for writing the bundle.
- *
- * <p>Returns a writer result that will be used in the {@link Sink.WriteOperation}'s
- * finalization. The result should contain some way to identify the output of this bundle (using
- * the bundle id). {@link WriteOperation#finalize} will use the writer result to identify
- * successful writes. See {@link Sink} for more information about bundle ids.
- *
- * @return the writer result
- */
- public abstract WriteT close() throws Exception;
-
- /**
- * Returns the write operation this writer belongs to.
- */
- public abstract WriteOperation<T, WriteT> getWriteOperation();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/Source.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/Source.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/Source.java
deleted file mode 100644
index 4a02078..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/Source.java
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-
-import org.joda.time.Instant;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.NoSuchElementException;
-
-/**
- * Base class for defining input formats and creating a {@code Source} for reading the input.
- *
- * <p>This class is not intended to be subclassed directly. Instead, to define
- * a bounded source (a source which produces a finite amount of input), subclass
- * {@link BoundedSource}; to define an unbounded source, subclass {@link UnboundedSource}.
- *
- * <p>A {@code Source} passed to a {@code Read} transform must be
- * {@code Serializable}. This allows the {@code Source} instance
- * created in this "main program" to be sent (in serialized form) to
- * remote worker machines and reconstituted for each batch of elements
- * of the input {@code PCollection} being processed or for each source splitting
- * operation. A {@code Source} can have instance variable state, and
- * non-transient instance variable state will be serialized in the main program
- * and then deserialized on remote worker machines.
- *
- * <p>{@code Source} classes MUST be effectively immutable. The only acceptable use of
- * mutable fields is to cache the results of expensive operations, and such fields MUST be
- * marked {@code transient}.
- *
- * <p>{@code Source} objects should override {@link Object#toString}, as it will be
- * used in important error and debugging messages.
- *
- * @param <T> Type of elements read by the source.
- */
-@Experimental(Experimental.Kind.SOURCE_SINK)
-public abstract class Source<T> implements Serializable {
- /**
- * Checks that this source is valid, before it can be used in a pipeline.
- *
- * <p>It is recommended to use {@link com.google.common.base.Preconditions} for implementing
- * this method.
- */
- public abstract void validate();
-
- /**
- * Returns the default {@code Coder} to use for the data read from this source.
- */
- public abstract Coder<T> getDefaultOutputCoder();
-
- /**
- * The interface that readers of custom input sources must implement.
- *
- * <p>This interface is deliberately distinct from {@link java.util.Iterator} because
- * the current model tends to be easier to program and more efficient in practice
- * for iterating over sources such as files, databases etc. (rather than pure collections).
- *
- * <p>Reading data from the {@link Reader} must obey the following access pattern:
- * <ul>
- * <li> One call to {@link #start}
- * <ul><li>If {@link #start} returned true, any number of calls to {@code getCurrent}*
- * methods</ul>
- * <li> Repeatedly, a call to {@link #advance}. This may be called regardless
- * of what the previous {@link #start}/{@link #advance} returned.
- * <ul><li>If {@link #advance} returned true, any number of calls to {@code getCurrent}*
- * methods</ul>
- * </ul>
- *
- * <p>For example, if the reader is reading a fixed set of data:
- * <pre>
- * try {
- * for (boolean available = reader.start(); available; available = reader.advance()) {
- * T item = reader.getCurrent();
- * Instant timestamp = reader.getCurrentTimestamp();
- * ...
- * }
- * } finally {
- * reader.close();
- * }
- * </pre>
- *
- * <p>If the set of data being read is continually growing:
- * <pre>
- * try {
- * boolean available = reader.start();
- * while (true) {
- * if (available) {
- * T item = reader.getCurrent();
- * Instant timestamp = reader.getCurrentTimestamp();
- * ...
- * resetExponentialBackoff();
- * } else {
- * exponentialBackoff();
- * }
- * available = reader.advance();
- * }
- * } finally {
- * reader.close();
- * }
- * </pre>
- *
- * <p>Note: this interface is a work-in-progress and may change.
- *
- * <p>{@code Reader} functions other than {@link #getCurrentSource} do not need to be thread-safe;
- * they may only be accessed by a single thread at once. However, {@link #getCurrentSource} needs
- * to be thread-safe, and other functions should assume that its returned value can change
- * asynchronously.
- */
- public abstract static class Reader<T> implements AutoCloseable {
- /**
- * Initializes the reader and advances the reader to the first record.
- *
- * <p>This method should be called exactly once. The invocation should occur prior to calling
- * {@link #advance} or {@link #getCurrent}. This method may perform expensive operations that
- * are needed to initialize the reader.
- *
- * @return {@code true} if a record was read, {@code false} if there is no more input available.
- */
- public abstract boolean start() throws IOException;
-
- /**
- * Advances the reader to the next valid record.
- *
- * <p>It is an error to call this without having called {@link #start} first.
- *
- * @return {@code true} if a record was read, {@code false} if there is no more input available.
- */
- public abstract boolean advance() throws IOException;
-
- /**
- * Returns the value of the data item that was read by the last {@link #start} or
- * {@link #advance} call. The returned value must be effectively immutable and remain valid
- * indefinitely.
- *
- * <p>Multiple calls to this method without an intervening call to {@link #advance} should
- * return the same result.
- *
- * @throws java.util.NoSuchElementException if {@link #start} was never called, or if
- * the last {@link #start} or {@link #advance} returned {@code false}.
- */
- public abstract T getCurrent() throws NoSuchElementException;
-
- /**
- * Returns the timestamp associated with the current data item.
- *
- * <p>If the source does not support timestamps, this should return
- * {@code BoundedWindow.TIMESTAMP_MIN_VALUE}.
- *
- * <p>Multiple calls to this method without an intervening call to {@link #advance} should
- * return the same result.
- *
- * @throws NoSuchElementException if the reader is at the beginning of the input and
- * {@link #start} or {@link #advance} wasn't called, or if the last {@link #start} or
- * {@link #advance} returned {@code false}.
- */
- public abstract Instant getCurrentTimestamp() throws NoSuchElementException;
-
- /**
- * Closes the reader. The reader cannot be used after this method is called.
- */
- @Override
- public abstract void close() throws IOException;
-
- /**
- * Returns a {@code Source} describing the same input that this {@code Reader} currently reads
- * (including items already read).
- *
- * <p>Usually, an implementation will simply return the immutable {@link Source} object from
- * which the current {@link Reader} was constructed, or delegate to the base class.
- * However, when using or implementing this method on a {@link BoundedSource.BoundedReader},
- * special considerations apply, see documentation for
- * {@link BoundedSource.BoundedReader#getCurrentSource}.
- */
- public abstract Source<T> getCurrentSource();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/TextIO.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/TextIO.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/TextIO.java
deleted file mode 100644
index d342f25..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/TextIO.java
+++ /dev/null
@@ -1,992 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import static com.google.common.base.Preconditions.checkState;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.Coder.Context;
-import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
-import com.google.cloud.dataflow.sdk.coders.VoidCoder;
-import com.google.cloud.dataflow.sdk.io.Read.Bounded;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
-import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
-import com.google.cloud.dataflow.sdk.util.MimeTypes;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PDone;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Preconditions;
-import com.google.protobuf.ByteString;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-import java.nio.channels.Channels;
-import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.SeekableByteChannel;
-import java.nio.channels.WritableByteChannel;
-import java.nio.charset.StandardCharsets;
-import java.util.NoSuchElementException;
-import java.util.regex.Pattern;
-
-import javax.annotation.Nullable;
-
-/**
- * {@link PTransform}s for reading and writing text files.
- *
- * <p>To read a {@link PCollection} from one or more text files, use {@link TextIO.Read}.
- * You can instantiate a transform using {@link TextIO.Read#from(String)} to specify
- * the path of the file(s) to read from (e.g., a local filename or
- * filename pattern if running locally, or a Google Cloud Storage
- * filename or filename pattern of the form
- * {@code "gs://<bucket>/<filepath>"}). You may optionally call
- * {@link TextIO.Read#named(String)} to specify the name of the pipeline step.
- *
- * <p>By default, {@link TextIO.Read} returns a {@link PCollection} of {@link String Strings},
- * each corresponding to one line of an input UTF-8 text file. To convert directly from the raw
- * bytes (split into lines delimited by '\n', '\r', or '\r\n') to another object of type {@code T},
- * supply a {@code Coder<T>} using {@link TextIO.Read#withCoder(Coder)}.
- *
- * <p>See the following examples:
- *
- * <pre>{@code
- * Pipeline p = ...;
- *
- * // A simple Read of a local file (only runs locally):
- * PCollection<String> lines =
- * p.apply(TextIO.Read.from("/local/path/to/file.txt"));
- *
- * // A fully-specified Read from a GCS file (runs locally and via the
- * // Google Cloud Dataflow service):
- * PCollection<Integer> numbers =
- * p.apply(TextIO.Read.named("ReadNumbers")
- * .from("gs://my_bucket/path/to/numbers-*.txt")
- * .withCoder(TextualIntegerCoder.of()));
- * }</pre>
- *
- * <p>To write a {@link PCollection} to one or more text files, use
- * {@link TextIO.Write}, specifying {@link TextIO.Write#to(String)} to specify
- * the path of the file to write to (e.g., a local filename or sharded
- * filename pattern if running locally, or a Google Cloud Storage
- * filename or sharded filename pattern of the form
- * {@code "gs://<bucket>/<filepath>"}). You can optionally name the resulting transform using
- * {@link TextIO.Write#named(String)}, and you can use {@link TextIO.Write#withCoder(Coder)}
- * to specify the Coder to use to encode the Java values into text lines.
- *
- * <p>Any existing files with the same names as generated output files
- * will be overwritten.
- *
- * <p>For example:
- * <pre>{@code
- * // A simple Write to a local file (only runs locally):
- * PCollection<String> lines = ...;
- * lines.apply(TextIO.Write.to("/path/to/file.txt"));
- *
- * // A fully-specified Write to a sharded GCS file (runs locally and via the
- * // Google Cloud Dataflow service):
- * PCollection<Integer> numbers = ...;
- * numbers.apply(TextIO.Write.named("WriteNumbers")
- * .to("gs://my_bucket/path/to/numbers")
- * .withSuffix(".txt")
- * .withCoder(TextualIntegerCoder.of()));
- * }</pre>
- *
- * <h3>Permissions</h3>
- * <p>When run using the {@link DirectPipelineRunner}, your pipeline can read and write text files
- * on your local drive and remote text files on Google Cloud Storage that you have access to using
- * your {@code gcloud} credentials. When running in the Dataflow service using
- * {@link DataflowPipelineRunner}, the pipeline can only read and write files from GCS. For more
- * information about permissions, see the Cloud Dataflow documentation on
- * <a href="https://cloud.google.com/dataflow/security-and-permissions">Security and
- * Permissions</a>.
- */
-public class TextIO {
- /** The default coder, which returns each line of the input file as a string. */
- public static final Coder<String> DEFAULT_TEXT_CODER = StringUtf8Coder.of();
-
- /**
- * A {@link PTransform} that reads from a text file (or multiple text
- * files matching a pattern) and returns a {@link PCollection} containing
- * the decoding of each of the lines of the text file(s). The
- * default decoding just returns each line as a {@link String}, but you may call
- * {@link #withCoder(Coder)} to change the return type.
- */
- public static class Read {
- /**
- * Returns a transform for reading text files that uses the given step name.
- */
- public static Bound<String> named(String name) {
- return new Bound<>(DEFAULT_TEXT_CODER).named(name);
- }
-
- /**
- * Returns a transform for reading text files that reads from the file(s)
- * with the given filename or filename pattern. This can be a local path (if running locally),
- * or a Google Cloud Storage filename or filename pattern of the form
- * {@code "gs://<bucket>/<filepath>"} (if running locally or via the Google Cloud Dataflow
- * service). Standard <a href="http://docs.oracle.com/javase/tutorial/essential/io/find.html"
- * >Java Filesystem glob patterns</a> ("*", "?", "[..]") are supported.
- */
- public static Bound<String> from(String filepattern) {
- return new Bound<>(DEFAULT_TEXT_CODER).from(filepattern);
- }
-
- /**
- * Returns a transform for reading text files that uses the given
- * {@code Coder<T>} to decode each of the lines of the file into a
- * value of type {@code T}.
- *
- * <p>By default, uses {@link StringUtf8Coder}, which just
- * returns the text lines as Java strings.
- *
- * @param <T> the type of the decoded elements, and the elements
- * of the resulting PCollection
- */
- public static <T> Bound<T> withCoder(Coder<T> coder) {
- return new Bound<>(coder);
- }
-
- /**
- * Returns a transform for reading text files that has GCS path validation on
- * pipeline creation disabled.
- *
- * <p>This can be useful in the case where the GCS input does not
- * exist at the pipeline creation time, but is expected to be
- * available at execution time.
- */
- public static Bound<String> withoutValidation() {
- return new Bound<>(DEFAULT_TEXT_CODER).withoutValidation();
- }
-
- /**
- * Returns a transform for reading text files that decompresses all input files
- * using the specified compression type.
- *
- * <p>If no compression type is specified, the default is {@link TextIO.CompressionType#AUTO}.
- * In this mode, the compression type of the file is determined by its extension
- * (e.g., {@code *.gz} is gzipped, {@code *.bz2} is bzipped, and all other extensions are
- * uncompressed).
- */
- public static Bound<String> withCompressionType(TextIO.CompressionType compressionType) {
- return new Bound<>(DEFAULT_TEXT_CODER).withCompressionType(compressionType);
- }
-
- // TODO: strippingNewlines, etc.
-
- /**
- * A {@link PTransform} that reads from one or more text files and returns a bounded
- * {@link PCollection} containing one element for each line of the input files.
- *
- * @param <T> the type of each of the elements of the resulting
- * {@link PCollection}. By default, each line is returned as a {@link String}, however you
- * may use {@link #withCoder(Coder)} to supply a {@code Coder<T>} to produce a
- * {@code PCollection<T>} instead.
- */
- public static class Bound<T> extends PTransform<PInput, PCollection<T>> {
- /** The filepattern to read from. */
- @Nullable private final String filepattern;
-
- /** The Coder to use to decode each line. */
- private final Coder<T> coder;
-
- /** An option to indicate if input validation is desired. Default is true. */
- private final boolean validate;
-
- /** Option to indicate the input source's compression type. Default is AUTO. */
- private final TextIO.CompressionType compressionType;
-
- Bound(Coder<T> coder) {
- this(null, null, coder, true, TextIO.CompressionType.AUTO);
- }
-
- private Bound(String name, String filepattern, Coder<T> coder, boolean validate,
- TextIO.CompressionType compressionType) {
- super(name);
- this.coder = coder;
- this.filepattern = filepattern;
- this.validate = validate;
- this.compressionType = compressionType;
- }
-
- /**
- * Returns a new transform for reading from text files that's like this one but
- * with the given step name.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> named(String name) {
- return new Bound<>(name, filepattern, coder, validate, compressionType);
- }
-
- /**
- * Returns a new transform for reading from text files that's like this one but
- * that reads from the file(s) with the given name or pattern. See {@link TextIO.Read#from}
- * for a description of filepatterns.
- *
- * <p>Does not modify this object.
-
- */
- public Bound<T> from(String filepattern) {
- return new Bound<>(name, filepattern, coder, validate, compressionType);
- }
-
- /**
- * Returns a new transform for reading from text files that's like this one but
- * that uses the given {@link Coder Coder<X>} to decode each of the
- * lines of the file into a value of type {@code X}.
- *
- * <p>Does not modify this object.
- *
- * @param <X> the type of the decoded elements, and the
- * elements of the resulting PCollection
- */
- public <X> Bound<X> withCoder(Coder<X> coder) {
- return new Bound<>(name, filepattern, coder, validate, compressionType);
- }
-
- /**
- * Returns a new transform for reading from text files that's like this one but
- * that has GCS path validation on pipeline creation disabled.
- *
- * <p>This can be useful in the case where the GCS input does not
- * exist at the pipeline creation time, but is expected to be
- * available at execution time.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> withoutValidation() {
- return new Bound<>(name, filepattern, coder, false, compressionType);
- }
-
- /**
- * Returns a new transform for reading from text files that's like this one but
- * reads from input sources using the specified compression type.
- *
- * <p>If no compression type is specified, the default is {@link TextIO.CompressionType#AUTO}.
- * See {@link TextIO.Read#withCompressionType} for more details.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> withCompressionType(TextIO.CompressionType compressionType) {
- return new Bound<>(name, filepattern, coder, validate, compressionType);
- }
-
- @Override
- public PCollection<T> apply(PInput input) {
- if (filepattern == null) {
- throw new IllegalStateException("need to set the filepattern of a TextIO.Read transform");
- }
-
- if (validate) {
- try {
- checkState(
- !IOChannelUtils.getFactory(filepattern).match(filepattern).isEmpty(),
- "Unable to find any files matching %s",
- filepattern);
- } catch (IOException e) {
- throw new IllegalStateException(
- String.format("Failed to validate %s", filepattern), e);
- }
- }
-
- // Create a source specific to the requested compression type.
- final Bounded<T> read;
- switch(compressionType) {
- case UNCOMPRESSED:
- read = com.google.cloud.dataflow.sdk.io.Read.from(
- new TextSource<T>(filepattern, coder));
- break;
- case AUTO:
- read = com.google.cloud.dataflow.sdk.io.Read.from(
- CompressedSource.from(new TextSource<T>(filepattern, coder)));
- break;
- case BZIP2:
- read = com.google.cloud.dataflow.sdk.io.Read.from(
- CompressedSource.from(new TextSource<T>(filepattern, coder))
- .withDecompression(CompressedSource.CompressionMode.BZIP2));
- break;
- case GZIP:
- read = com.google.cloud.dataflow.sdk.io.Read.from(
- CompressedSource.from(new TextSource<T>(filepattern, coder))
- .withDecompression(CompressedSource.CompressionMode.GZIP));
- break;
- default:
- throw new IllegalArgumentException("Unknown compression mode: " + compressionType);
- }
-
- PCollection<T> pcol = input.getPipeline().apply("Read", read);
- // Honor the default output coder that would have been used by this PTransform.
- pcol.setCoder(getDefaultOutputCoder());
- return pcol;
- }
-
- @Override
- protected Coder<T> getDefaultOutputCoder() {
- return coder;
- }
-
- public String getFilepattern() {
- return filepattern;
- }
-
- public boolean needsValidation() {
- return validate;
- }
-
- public TextIO.CompressionType getCompressionType() {
- return compressionType;
- }
- }
-
- /** Disallow construction of utility classes. */
- private Read() {}
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A {@link PTransform} that writes a {@link PCollection} to text file (or
- * multiple text files matching a sharding pattern), with each
- * element of the input collection encoded into its own line.
- */
- public static class Write {
- /**
- * Returns a transform for writing to text files with the given step name.
- */
- public static Bound<String> named(String name) {
- return new Bound<>(DEFAULT_TEXT_CODER).named(name);
- }
-
- /**
- * Returns a transform for writing to text files that writes to the file(s)
- * with the given prefix. This can be a local filename
- * (if running locally), or a Google Cloud Storage filename of
- * the form {@code "gs://<bucket>/<filepath>"}
- * (if running locally or via the Google Cloud Dataflow service).
- *
- * <p>The files written will begin with this prefix, followed by
- * a shard identifier (see {@link Bound#withNumShards(int)}, and end
- * in a common extension, if given by {@link Bound#withSuffix(String)}.
- */
- public static Bound<String> to(String prefix) {
- return new Bound<>(DEFAULT_TEXT_CODER).to(prefix);
- }
-
- /**
- * Returns a transform for writing to text files that appends the specified suffix
- * to the created files.
- */
- public static Bound<String> withSuffix(String nameExtension) {
- return new Bound<>(DEFAULT_TEXT_CODER).withSuffix(nameExtension);
- }
-
- /**
- * Returns a transform for writing to text files that uses the provided shard count.
- *
- * <p>Constraining the number of shards is likely to reduce
- * the performance of a pipeline. Setting this value is not recommended
- * unless you require a specific number of output files.
- *
- * @param numShards the number of shards to use, or 0 to let the system
- * decide.
- */
- public static Bound<String> withNumShards(int numShards) {
- return new Bound<>(DEFAULT_TEXT_CODER).withNumShards(numShards);
- }
-
- /**
- * Returns a transform for writing to text files that uses the given shard name
- * template.
- *
- * <p>See {@link ShardNameTemplate} for a description of shard templates.
- */
- public static Bound<String> withShardNameTemplate(String shardTemplate) {
- return new Bound<>(DEFAULT_TEXT_CODER).withShardNameTemplate(shardTemplate);
- }
-
- /**
- * Returns a transform for writing to text files that forces a single file as
- * output.
- */
- public static Bound<String> withoutSharding() {
- return new Bound<>(DEFAULT_TEXT_CODER).withoutSharding();
- }
-
- /**
- * Returns a transform for writing to text files that uses the given
- * {@link Coder} to encode each of the elements of the input
- * {@link PCollection} into an output text line.
- *
- * <p>By default, uses {@link StringUtf8Coder}, which writes input
- * Java strings directly as output lines.
- *
- * @param <T> the type of the elements of the input {@link PCollection}
- */
- public static <T> Bound<T> withCoder(Coder<T> coder) {
- return new Bound<>(coder);
- }
-
- /**
- * Returns a transform for writing to text files that has GCS path validation on
- * pipeline creation disabled.
- *
- * <p>This can be useful in the case where the GCS output location does
- * not exist at the pipeline creation time, but is expected to be available
- * at execution time.
- */
- public static Bound<String> withoutValidation() {
- return new Bound<>(DEFAULT_TEXT_CODER).withoutValidation();
- }
-
- // TODO: appendingNewlines, header, footer, etc.
-
- /**
- * A PTransform that writes a bounded PCollection to a text file (or
- * multiple text files matching a sharding pattern), with each
- * PCollection element being encoded into its own line.
- *
- * @param <T> the type of the elements of the input PCollection
- */
- public static class Bound<T> extends PTransform<PCollection<T>, PDone> {
- /** The prefix of each file written, combined with suffix and shardTemplate. */
- @Nullable private final String filenamePrefix;
- /** The suffix of each file written, combined with prefix and shardTemplate. */
- private final String filenameSuffix;
-
- /** The Coder to use to decode each line. */
- private final Coder<T> coder;
-
- /** Requested number of shards. 0 for automatic. */
- private final int numShards;
-
- /** The shard template of each file written, combined with prefix and suffix. */
- private final String shardTemplate;
-
- /** An option to indicate if output validation is desired. Default is true. */
- private final boolean validate;
-
- Bound(Coder<T> coder) {
- this(null, null, "", coder, 0, ShardNameTemplate.INDEX_OF_MAX, true);
- }
-
- private Bound(String name, String filenamePrefix, String filenameSuffix, Coder<T> coder,
- int numShards, String shardTemplate, boolean validate) {
- super(name);
- this.coder = coder;
- this.filenamePrefix = filenamePrefix;
- this.filenameSuffix = filenameSuffix;
- this.numShards = numShards;
- this.shardTemplate = shardTemplate;
- this.validate = validate;
- }
-
- /**
- * Returns a transform for writing to text files that's like this one but
- * with the given step name.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> named(String name) {
- return new Bound<>(name, filenamePrefix, filenameSuffix, coder, numShards,
- shardTemplate, validate);
- }
-
- /**
- * Returns a transform for writing to text files that's like this one but
- * that writes to the file(s) with the given filename prefix.
- *
- * <p>See {@link TextIO.Write#to(String) Write.to(String)} for more information.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> to(String filenamePrefix) {
- validateOutputComponent(filenamePrefix);
- return new Bound<>(name, filenamePrefix, filenameSuffix, coder, numShards,
- shardTemplate, validate);
- }
-
- /**
- * Returns a transform for writing to text files that that's like this one but
- * that writes to the file(s) with the given filename suffix.
- *
- * <p>Does not modify this object.
- *
- * @see ShardNameTemplate
- */
- public Bound<T> withSuffix(String nameExtension) {
- validateOutputComponent(nameExtension);
- return new Bound<>(name, filenamePrefix, nameExtension, coder, numShards,
- shardTemplate, validate);
- }
-
- /**
- * Returns a transform for writing to text files that's like this one but
- * that uses the provided shard count.
- *
- * <p>Constraining the number of shards is likely to reduce
- * the performance of a pipeline. Setting this value is not recommended
- * unless you require a specific number of output files.
- *
- * <p>Does not modify this object.
- *
- * @param numShards the number of shards to use, or 0 to let the system
- * decide.
- * @see ShardNameTemplate
- */
- public Bound<T> withNumShards(int numShards) {
- Preconditions.checkArgument(numShards >= 0);
- return new Bound<>(name, filenamePrefix, filenameSuffix, coder, numShards,
- shardTemplate, validate);
- }
-
- /**
- * Returns a transform for writing to text files that's like this one but
- * that uses the given shard name template.
- *
- * <p>Does not modify this object.
- *
- * @see ShardNameTemplate
- */
- public Bound<T> withShardNameTemplate(String shardTemplate) {
- return new Bound<>(name, filenamePrefix, filenameSuffix, coder, numShards,
- shardTemplate, validate);
- }
-
- /**
- * Returns a transform for writing to text files that's like this one but
- * that forces a single file as output.
- *
- * <p>Constraining the number of shards is likely to reduce
- * the performance of a pipeline. Using this setting is not recommended
- * unless you truly require a single output file.
- *
- * <p>This is a shortcut for
- * {@code .withNumShards(1).withShardNameTemplate("")}
- *
- * <p>Does not modify this object.
- */
- public Bound<T> withoutSharding() {
- return new Bound<>(name, filenamePrefix, filenameSuffix, coder, 1, "", validate);
- }
-
- /**
- * Returns a transform for writing to text files that's like this one
- * but that uses the given {@link Coder Coder<X>} to encode each of
- * the elements of the input {@link PCollection PCollection<X>} into an
- * output text line. Does not modify this object.
- *
- * @param <X> the type of the elements of the input {@link PCollection}
- */
- public <X> Bound<X> withCoder(Coder<X> coder) {
- return new Bound<>(name, filenamePrefix, filenameSuffix, coder, numShards,
- shardTemplate, validate);
- }
-
- /**
- * Returns a transform for writing to text files that's like this one but
- * that has GCS output path validation on pipeline creation disabled.
- *
- * <p>This can be useful in the case where the GCS output location does
- * not exist at the pipeline creation time, but is expected to be
- * available at execution time.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> withoutValidation() {
- return new Bound<>(name, filenamePrefix, filenameSuffix, coder, numShards,
- shardTemplate, false);
- }
-
- @Override
- public PDone apply(PCollection<T> input) {
- if (filenamePrefix == null) {
- throw new IllegalStateException(
- "need to set the filename prefix of a TextIO.Write transform");
- }
-
- // Note that custom sinks currently do not expose sharding controls.
- // Thus pipeline runner writers need to individually add support internally to
- // apply user requested sharding limits.
- return input.apply("Write", com.google.cloud.dataflow.sdk.io.Write.to(
- new TextSink<>(
- filenamePrefix, filenameSuffix, shardTemplate, coder)));
- }
-
- /**
- * Returns the current shard name template string.
- */
- public String getShardNameTemplate() {
- return shardTemplate;
- }
-
- @Override
- protected Coder<Void> getDefaultOutputCoder() {
- return VoidCoder.of();
- }
-
- public String getFilenamePrefix() {
- return filenamePrefix;
- }
-
- public String getShardTemplate() {
- return shardTemplate;
- }
-
- public int getNumShards() {
- return numShards;
- }
-
- public String getFilenameSuffix() {
- return filenameSuffix;
- }
-
- public Coder<T> getCoder() {
- return coder;
- }
-
- public boolean needsValidation() {
- return validate;
- }
- }
- }
-
- /**
- * Possible text file compression types.
- */
- public static enum CompressionType {
- /**
- * Automatically determine the compression type based on filename extension.
- */
- AUTO(""),
- /**
- * Uncompressed (i.e., may be split).
- */
- UNCOMPRESSED(""),
- /**
- * GZipped.
- */
- GZIP(".gz"),
- /**
- * BZipped.
- */
- BZIP2(".bz2");
-
- private String filenameSuffix;
-
- private CompressionType(String suffix) {
- this.filenameSuffix = suffix;
- }
-
- /**
- * Determine if a given filename matches a compression type based on its extension.
- * @param filename the filename to match
- * @return true iff the filename ends with the compression type's known extension.
- */
- public boolean matches(String filename) {
- return filename.toLowerCase().endsWith(filenameSuffix.toLowerCase());
- }
- }
-
- // Pattern which matches old-style shard output patterns, which are now
- // disallowed.
- private static final Pattern SHARD_OUTPUT_PATTERN = Pattern.compile("@([0-9]+|\\*)");
-
- private static void validateOutputComponent(String partialFilePattern) {
- Preconditions.checkArgument(
- !SHARD_OUTPUT_PATTERN.matcher(partialFilePattern).find(),
- "Output name components are not allowed to contain @* or @N patterns: "
- + partialFilePattern);
- }
-
- //////////////////////////////////////////////////////////////////////////////
-
- /** Disable construction of utility class. */
- private TextIO() {}
-
- /**
- * A {@link FileBasedSource} which can decode records delimited by new line characters.
- *
- * <p>This source splits the data into records using {@code UTF-8} {@code \n}, {@code \r}, or
- * {@code \r\n} as the delimiter. This source is not strict and supports decoding the last record
- * even if it is not delimited. Finally, no records are decoded if the stream is empty.
- *
- * <p>This source supports reading from any arbitrary byte position within the stream. If the
- * starting position is not {@code 0}, then bytes are skipped until the first delimiter is found
- * representing the beginning of the first record to be decoded.
- */
- @VisibleForTesting
- static class TextSource<T> extends FileBasedSource<T> {
- /** The Coder to use to decode each line. */
- private final Coder<T> coder;
-
- @VisibleForTesting
- TextSource(String fileSpec, Coder<T> coder) {
- super(fileSpec, 1L);
- this.coder = coder;
- }
-
- private TextSource(String fileName, long start, long end, Coder<T> coder) {
- super(fileName, 1L, start, end);
- this.coder = coder;
- }
-
- @Override
- protected FileBasedSource<T> createForSubrangeOfFile(String fileName, long start, long end) {
- return new TextSource<>(fileName, start, end, coder);
- }
-
- @Override
- protected FileBasedReader<T> createSingleFileReader(PipelineOptions options) {
- return new TextBasedReader<>(this);
- }
-
- @Override
- public boolean producesSortedKeys(PipelineOptions options) throws Exception {
- return false;
- }
-
- @Override
- public Coder<T> getDefaultOutputCoder() {
- return coder;
- }
-
- /**
- * A {@link com.google.cloud.dataflow.sdk.io.FileBasedSource.FileBasedReader FileBasedReader}
- * which can decode records delimited by new line characters.
- *
- * See {@link TextSource} for further details.
- */
- @VisibleForTesting
- static class TextBasedReader<T> extends FileBasedReader<T> {
- private static final int READ_BUFFER_SIZE = 8192;
- private final Coder<T> coder;
- private final ByteBuffer readBuffer = ByteBuffer.allocate(READ_BUFFER_SIZE);
- private ByteString buffer;
- private int startOfSeparatorInBuffer;
- private int endOfSeparatorInBuffer;
- private long startOfNextRecord;
- private boolean eof;
- private boolean elementIsPresent;
- private T currentValue;
- private ReadableByteChannel inChannel;
-
- private TextBasedReader(TextSource<T> source) {
- super(source);
- coder = source.coder;
- buffer = ByteString.EMPTY;
- }
-
- @Override
- protected long getCurrentOffset() throws NoSuchElementException {
- if (!elementIsPresent) {
- throw new NoSuchElementException();
- }
- return startOfNextRecord;
- }
-
- @Override
- public T getCurrent() throws NoSuchElementException {
- if (!elementIsPresent) {
- throw new NoSuchElementException();
- }
- return currentValue;
- }
-
- @Override
- protected void startReading(ReadableByteChannel channel) throws IOException {
- this.inChannel = channel;
- // If the first offset is greater than zero, we need to skip bytes until we see our
- // first separator.
- if (getCurrentSource().getStartOffset() > 0) {
- checkState(channel instanceof SeekableByteChannel,
- "%s only supports reading from a SeekableByteChannel when given a start offset"
- + " greater than 0.", TextSource.class.getSimpleName());
- long requiredPosition = getCurrentSource().getStartOffset() - 1;
- ((SeekableByteChannel) channel).position(requiredPosition);
- findSeparatorBounds();
- buffer = buffer.substring(endOfSeparatorInBuffer);
- startOfNextRecord = requiredPosition + endOfSeparatorInBuffer;
- endOfSeparatorInBuffer = 0;
- startOfSeparatorInBuffer = 0;
- }
- }
-
- /**
- * Locates the start position and end position of the next delimiter. Will
- * consume the channel till either EOF or the delimiter bounds are found.
- *
- * <p>This fills the buffer and updates the positions as follows:
- * <pre>{@code
- * ------------------------------------------------------
- * | element bytes | delimiter bytes | unconsumed bytes |
- * ------------------------------------------------------
- * 0 start of end of buffer
- * separator separator size
- * in buffer in buffer
- * }</pre>
- */
- private void findSeparatorBounds() throws IOException {
- int bytePositionInBuffer = 0;
- while (true) {
- if (!tryToEnsureNumberOfBytesInBuffer(bytePositionInBuffer + 1)) {
- startOfSeparatorInBuffer = endOfSeparatorInBuffer = bytePositionInBuffer;
- break;
- }
-
- byte currentByte = buffer.byteAt(bytePositionInBuffer);
-
- if (currentByte == '\n') {
- startOfSeparatorInBuffer = bytePositionInBuffer;
- endOfSeparatorInBuffer = startOfSeparatorInBuffer + 1;
- break;
- } else if (currentByte == '\r') {
- startOfSeparatorInBuffer = bytePositionInBuffer;
- endOfSeparatorInBuffer = startOfSeparatorInBuffer + 1;
-
- if (tryToEnsureNumberOfBytesInBuffer(bytePositionInBuffer + 2)) {
- currentByte = buffer.byteAt(bytePositionInBuffer + 1);
- if (currentByte == '\n') {
- endOfSeparatorInBuffer += 1;
- }
- }
- break;
- }
-
- // Move to the next byte in buffer.
- bytePositionInBuffer += 1;
- }
- }
-
- @Override
- protected boolean readNextRecord() throws IOException {
- startOfNextRecord += endOfSeparatorInBuffer;
- findSeparatorBounds();
-
- // If we have reached EOF file and consumed all of the buffer then we know
- // that there are no more records.
- if (eof && buffer.size() == 0) {
- elementIsPresent = false;
- return false;
- }
-
- decodeCurrentElement();
- return true;
- }
-
- /**
- * Decodes the current element updating the buffer to only contain the unconsumed bytes.
- *
- * This invalidates the currently stored {@code startOfSeparatorInBuffer} and
- * {@code endOfSeparatorInBuffer}.
- */
- private void decodeCurrentElement() throws IOException {
- ByteString dataToDecode = buffer.substring(0, startOfSeparatorInBuffer);
- currentValue = coder.decode(dataToDecode.newInput(), Context.OUTER);
- elementIsPresent = true;
- buffer = buffer.substring(endOfSeparatorInBuffer);
- }
-
- /**
- * Returns false if we were unable to ensure the minimum capacity by consuming the channel.
- */
- private boolean tryToEnsureNumberOfBytesInBuffer(int minCapacity) throws IOException {
- // While we aren't at EOF or haven't fulfilled the minimum buffer capacity,
- // attempt to read more bytes.
- while (buffer.size() <= minCapacity && !eof) {
- eof = inChannel.read(readBuffer) == -1;
- readBuffer.flip();
- buffer = buffer.concat(ByteString.copyFrom(readBuffer));
- readBuffer.clear();
- }
- // Return true if we were able to honor the minimum buffer capacity request
- return buffer.size() >= minCapacity;
- }
- }
- }
-
- /**
- * A {@link FileBasedSink} for text files. Produces text files with the new line separator
- * {@code '\n'} represented in {@code UTF-8} format as the record separator.
- * Each record (including the last) is terminated.
- */
- @VisibleForTesting
- static class TextSink<T> extends FileBasedSink<T> {
- private final Coder<T> coder;
-
- @VisibleForTesting
- TextSink(
- String baseOutputFilename, String extension, String fileNameTemplate, Coder<T> coder) {
- super(baseOutputFilename, extension, fileNameTemplate);
- this.coder = coder;
- }
-
- @Override
- public FileBasedSink.FileBasedWriteOperation<T> createWriteOperation(PipelineOptions options) {
- return new TextWriteOperation<>(this, coder);
- }
-
- /**
- * A {@link com.google.cloud.dataflow.sdk.io.FileBasedSink.FileBasedWriteOperation
- * FileBasedWriteOperation} for text files.
- */
- private static class TextWriteOperation<T> extends FileBasedWriteOperation<T> {
- private final Coder<T> coder;
-
- private TextWriteOperation(TextSink<T> sink, Coder<T> coder) {
- super(sink);
- this.coder = coder;
- }
-
- @Override
- public FileBasedWriter<T> createWriter(PipelineOptions options) throws Exception {
- return new TextWriter<>(this, coder);
- }
- }
-
- /**
- * A {@link com.google.cloud.dataflow.sdk.io.FileBasedSink.FileBasedWriter FileBasedWriter}
- * for text files.
- */
- private static class TextWriter<T> extends FileBasedWriter<T> {
- private static final byte[] NEWLINE = "\n".getBytes(StandardCharsets.UTF_8);
- private final Coder<T> coder;
- private OutputStream out;
-
- public TextWriter(FileBasedWriteOperation<T> writeOperation, Coder<T> coder) {
- super(writeOperation);
- this.mimeType = MimeTypes.TEXT;
- this.coder = coder;
- }
-
- @Override
- protected void prepareWrite(WritableByteChannel channel) throws Exception {
- out = Channels.newOutputStream(channel);
- }
-
- @Override
- public void write(T value) throws Exception {
- coder.encode(value, out, Context.OUTER);
- out.write(NEWLINE);
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/UnboundedSource.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/UnboundedSource.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/UnboundedSource.java
deleted file mode 100644
index e585151..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/UnboundedSource.java
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-
-import org.joda.time.Instant;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.NoSuchElementException;
-
-import javax.annotation.Nullable;
-
-/**
- * A {@link Source} that reads an unbounded amount of input and, because of that, supports
- * some additional operations such as checkpointing, watermarks, and record ids.
- *
- * <ul>
- * <li> Checkpointing allows sources to not re-read the same data again in the case of failures.
- * <li> Watermarks allow for downstream parts of the pipeline to know up to what point
- * in time the data is complete.
- * <li> Record ids allow for efficient deduplication of input records; many streaming sources
- * do not guarantee that a given record will only be read a single time.
- * </ul>
- *
- * <p>See {@link com.google.cloud.dataflow.sdk.transforms.windowing.Window} and
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.Trigger} for more information on
- * timestamps and watermarks.
- *
- * @param <OutputT> Type of records output by this source.
- * @param <CheckpointMarkT> Type of checkpoint marks used by the readers of this source.
- */
-public abstract class UnboundedSource<
- OutputT, CheckpointMarkT extends UnboundedSource.CheckpointMark> extends Source<OutputT> {
- /**
- * Returns a list of {@code UnboundedSource} objects representing the instances of this source
- * that should be used when executing the workflow. Each split should return a separate partition
- * of the input data.
- *
- * <p>For example, for a source reading from a growing directory of files, each split
- * could correspond to a prefix of file names.
- *
- * <p>Some sources are not splittable, such as reading from a single TCP stream. In that
- * case, only a single split should be returned.
- *
- * <p>Some data sources automatically partition their data among readers. For these types of
- * inputs, {@code n} identical replicas of the top-level source can be returned.
- *
- * <p>The size of the returned list should be as close to {@code desiredNumSplits}
- * as possible, but does not have to match exactly. A low number of splits
- * will limit the amount of parallelism in the source.
- */
- public abstract List<? extends UnboundedSource<OutputT, CheckpointMarkT>> generateInitialSplits(
- int desiredNumSplits, PipelineOptions options) throws Exception;
-
- /**
- * Create a new {@link UnboundedReader} to read from this source, resuming from the given
- * checkpoint if present.
- */
- public abstract UnboundedReader<OutputT> createReader(
- PipelineOptions options, @Nullable CheckpointMarkT checkpointMark);
-
- /**
- * Returns a {@link Coder} for encoding and decoding the checkpoints for this source, or
- * null if the checkpoints do not need to be durably committed.
- */
- @Nullable
- public abstract Coder<CheckpointMarkT> getCheckpointMarkCoder();
-
- /**
- * Returns whether this source requires explicit deduping.
- *
- * <p>This is needed if the underlying data source can return the same record multiple times,
- * such as a queuing system with a pull-ack model. Sources where the records read are uniquely
- * identified by the persisted state in the CheckpointMark do not need this.
- */
- public boolean requiresDeduping() {
- return false;
- }
-
- /**
- * A marker representing the progress and state of an
- * {@link com.google.cloud.dataflow.sdk.io.UnboundedSource.UnboundedReader}.
- *
- * <p>For example, this could be offsets in a set of files being read.
- */
- public interface CheckpointMark {
- /**
- * Perform any finalization that needs to happen after a bundle of data read from
- * the source has been processed and committed.
- *
- * <p>For example, this could be sending acknowledgement requests to an external
- * data source such as Pub/Sub.
- *
- * <p>This may be called from any thread, potentially at the same time as calls to the
- * {@code UnboundedReader} that created it.
- */
- void finalizeCheckpoint() throws IOException;
- }
-
- /**
- * A {@code Reader} that reads an unbounded amount of input.
- *
- * <p>A given {@code UnboundedReader} object will only be accessed by a single thread at once.
- */
- @Experimental(Experimental.Kind.SOURCE_SINK)
- public abstract static class UnboundedReader<OutputT> extends Source.Reader<OutputT> {
- private static final byte[] EMPTY = new byte[0];
-
- /**
- * Initializes the reader and advances the reader to the first record.
- *
- * <p>This method should be called exactly once. The invocation should occur prior to calling
- * {@link #advance} or {@link #getCurrent}. This method may perform expensive operations that
- * are needed to initialize the reader.
- *
- * <p>Returns {@code true} if a record was read, {@code false} if there is no more input
- * currently available. Future calls to {@link #advance} may return {@code true} once more data
- * is available. Regardless of the return value of {@code start}, {@code start} will not be
- * called again on the same {@code UnboundedReader} object; it will only be called again when a
- * new reader object is constructed for the same source, e.g. on recovery.
- */
- @Override
- public abstract boolean start() throws IOException;
-
- /**
- * Advances the reader to the next valid record.
- *
- * <p>Returns {@code true} if a record was read, {@code false} if there is no more input
- * available. Future calls to {@link #advance} may return {@code true} once more data is
- * available.
- */
- @Override
- public abstract boolean advance() throws IOException;
-
- /**
- * Returns a unique identifier for the current record. This should be the same for each
- * instance of the same logical record read from the underlying data source.
- *
- * <p>It is only necessary to override this if {@link #requiresDeduping} has been overridden to
- * return true.
- *
- * <p>For example, this could be a hash of the record contents, or a logical ID present in
- * the record. If this is generated as a hash of the record contents, it should be at least 16
- * bytes (128 bits) to avoid collisions.
- *
- * <p>This method has the same restrictions on when it can be called as {@link #getCurrent} and
- * {@link #getCurrentTimestamp}.
- *
- * @throws NoSuchElementException if the reader is at the beginning of the input and
- * {@link #start} or {@link #advance} wasn't called, or if the last {@link #start} or
- * {@link #advance} returned {@code false}.
- */
- public byte[] getCurrentRecordId() throws NoSuchElementException {
- if (getCurrentSource().requiresDeduping()) {
- throw new IllegalStateException(
- "getCurrentRecordId() must be overridden if requiresDeduping returns true()");
- }
- return EMPTY;
- }
-
- /**
- * Returns a timestamp before or at the timestamps of all future elements read by this reader.
- *
- * <p>This can be approximate. If records are read that violate this guarantee, they will be
- * considered late, which will affect how they will be processed. See
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.Window} for more information on
- * late data and how to handle it.
- *
- * <p>However, this value should be as late as possible. Downstream windows may not be able
- * to close until this watermark passes their end.
- *
- * <p>For example, a source may know that the records it reads will be in timestamp order. In
- * this case, the watermark can be the timestamp of the last record read. For a
- * source that does not have natural timestamps, timestamps can be set to the time of
- * reading, in which case the watermark is the current clock time.
- *
- * <p>See {@link com.google.cloud.dataflow.sdk.transforms.windowing.Window} and
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.Trigger} for more
- * information on timestamps and watermarks.
- *
- * <p>May be called after {@link #advance} or {@link #start} has returned false, but not before
- * {@link #start} has been called.
- */
- public abstract Instant getWatermark();
-
- /**
- * Returns a {@link CheckpointMark} representing the progress of this {@code UnboundedReader}.
- *
- * <p>The elements read up until this is called will be processed together as a bundle. Once
- * the result of this processing has been durably committed,
- * {@link CheckpointMark#finalizeCheckpoint} will be called on the {@link CheckpointMark}
- * object.
- *
- * <p>The returned object should not be modified.
- *
- * <p>May be called after {@link #advance} or {@link #start} has returned false, but not before
- * {@link #start} has been called.
- */
- public abstract CheckpointMark getCheckpointMark();
-
- /**
- * Constant representing an unknown amount of backlog.
- */
- public static final long BACKLOG_UNKNOWN = -1L;
-
- /**
- * Returns the size of the backlog of unread data in the underlying data source represented by
- * this split of this source.
- *
- * <p>One of this or {@link #getTotalBacklogBytes} should be overridden in order to allow the
- * runner to scale the amount of resources allocated to the pipeline.
- */
- public long getSplitBacklogBytes() {
- return BACKLOG_UNKNOWN;
- }
-
- /**
- * Returns the size of the backlog of unread data in the underlying data source represented by
- * all splits of this source.
- *
- * <p>One of this or {@link #getSplitBacklogBytes} should be overridden in order to allow the
- * runner to scale the amount of resources allocated to the pipeline.
- */
- public long getTotalBacklogBytes() {
- return BACKLOG_UNKNOWN;
- }
-
- /**
- * Returns the {@link UnboundedSource} that created this reader. This will not change over the
- * life of the reader.
- */
- @Override
- public abstract UnboundedSource<OutputT, ?> getCurrentSource();
- }
-}
[50/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/pom.xml
----------------------------------------------------------------------
diff --git a/sdk/pom.xml b/sdk/pom.xml
deleted file mode 100644
index 13fe950..0000000
--- a/sdk/pom.xml
+++ /dev/null
@@ -1,771 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.beam</groupId>
- <artifactId>parent</artifactId>
- <version>0.1.0-incubating-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <artifactId>java-sdk-all</artifactId>
- <name>Apache Beam :: SDK :: Java All</name>
- <description>Beam SDK Java All provides a simple, Java-based
- interface for processing virtually any size data. This
- artifact includes entire Apache Beam Java SDK.</description>
-
- <packaging>jar</packaging>
-
- <properties>
- <timestamp>${maven.build.timestamp}</timestamp>
- <maven.build.timestamp.format>yyyy-MM-dd HH:mm</maven.build.timestamp.format>
- <dataflow>com.google.cloud.dataflow</dataflow>
- <runIntegrationTestOnService>false</runIntegrationTestOnService>
- <testParallelValue>none</testParallelValue>
- <testGroups></testGroups>
- <dataflowProjectName></dataflowProjectName>
- </properties>
-
- <profiles>
- <profile>
- <id>DataflowPipelineTests</id>
- <properties>
- <runIntegrationTestOnService>true</runIntegrationTestOnService>
- <testGroups>com.google.cloud.dataflow.sdk.testing.RunnableOnService</testGroups>
- <testParallelValue>both</testParallelValue>
- </properties>
- </profile>
- </profiles>
-
- <build>
- <resources>
- <resource>
- <directory>src/main/resources</directory>
- <filtering>true</filtering>
- </resource>
- </resources>
-
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-compiler-plugin</artifactId>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- <executions>
- <execution>
- <goals><goal>analyze-only</goal></goals>
- <configuration>
- <failOnWarning>true</failOnWarning>
- </configuration>
- </execution>
- </executions>
- </plugin>
-
- <!-- Run CheckStyle pass on transforms, as they are released in
- source form. -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-checkstyle-plugin</artifactId>
- <version>2.12</version>
- <dependencies>
- <dependency>
- <groupId>com.puppycrawl.tools</groupId>
- <artifactId>checkstyle</artifactId>
- <version>6.6</version>
- </dependency>
- </dependencies>
- <configuration>
- <configLocation>../checkstyle.xml</configLocation>
- <consoleOutput>true</consoleOutput>
- <failOnViolation>true</failOnViolation>
- <includeResources>false</includeResources>
- <includeTestSourceDirectory>true</includeTestSourceDirectory>
- <excludes>${project.build.directory}/generated-test-sources/**</excludes>
- </configuration>
- <executions>
- <execution>
- <goals>
- <goal>check</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- <executions>
- <execution>
- <id>default-jar</id>
- <goals>
- <goal>jar</goal>
- </goals>
- </execution>
- <execution>
- <id>default-test-jar</id>
- <goals>
- <goal>test-jar</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
-
- <!-- Source plugin for generating source and test-source JARs. -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-source-plugin</artifactId>
- <version>2.4</version>
- <executions>
- <execution>
- <id>attach-sources</id>
- <phase>compile</phase>
- <goals>
- <goal>jar</goal>
- </goals>
- </execution>
- <execution>
- <id>attach-test-sources</id>
- <phase>test-compile</phase>
- <goals>
- <goal>test-jar</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-javadoc-plugin</artifactId>
- <configuration>
- <windowtitle>Google Cloud Dataflow SDK ${project.version} API</windowtitle>
- <doctitle>Google Cloud Dataflow SDK for Java, version ${project.version}</doctitle>
- <overview>../javadoc/overview.html</overview>
-
- <subpackages>com.google.cloud.dataflow.sdk</subpackages>
- <additionalparam>-exclude com.google.cloud.dataflow.sdk.runners.worker:com.google.cloud.dataflow.sdk.runners.dataflow:com.google.cloud.dataflow.sdk.util:com.google.cloud.dataflow.sdk.runners.inprocess ${dataflow.javadoc_opts}</additionalparam>
- <use>false</use>
- <quiet>true</quiet>
- <bottom><![CDATA[<br>]]></bottom>
-
- <offlineLinks>
- <offlineLink>
- <url>https://developers.google.com/api-client-library/java/google-api-java-client/reference/1.20.0/</url>
- <location>${basedir}/../javadoc/apiclient-docs</location>
- </offlineLink>
- <offlineLink>
- <url>http://avro.apache.org/docs/1.7.7/api/java/</url>
- <location>${basedir}/../javadoc/avro-docs</location>
- </offlineLink>
- <offlineLink>
- <url>https://developers.google.com/resources/api-libraries/documentation/bigquery/v2/java/latest/</url>
- <location>${basedir}/../javadoc/bq-docs</location>
- </offlineLink>
- <offlineLink>
- <url>https://cloud.google.com/datastore/docs/apis/javadoc/</url>
- <location>${basedir}/../javadoc/datastore-docs</location>
- </offlineLink>
- <offlineLink>
- <url>http://docs.guava-libraries.googlecode.com/git-history/release19/javadoc/</url>
- <location>${basedir}/../javadoc/guava-docs</location>
- </offlineLink>
- <offlineLink>
- <url>http://hamcrest.org/JavaHamcrest/javadoc/1.3/</url>
- <location>${basedir}/../javadoc/hamcrest-docs</location>
- </offlineLink>
- <offlineLink>
- <url>http://fasterxml.github.io/jackson-annotations/javadoc/2.7/</url>
- <location>${basedir}/../javadoc/jackson-annotations-docs</location>
- </offlineLink>
- <offlineLink>
- <url>http://fasterxml.github.io/jackson-databind/javadoc/2.7/</url>
- <location>${basedir}/../javadoc/jackson-databind-docs</location>
- </offlineLink>
- <offlineLink>
- <url>http://www.joda.org/joda-time/apidocs</url>
- <location>${basedir}/../javadoc/joda-docs</location>
- </offlineLink>
- <offlineLink>
- <url>http://junit.sourceforge.net/javadoc/</url>
- <location>${basedir}/../javadoc/junit-docs</location>
- </offlineLink>
- <offlineLink>
- <url>https://developers.google.com/api-client-library/java/google-oauth-java-client/reference/1.20.0/</url>
- <location>${basedir}/../javadoc/oauth-docs</location>
- </offlineLink>
- </offlineLinks>
- </configuration>
- <executions>
- <execution>
- <goals>
- <goal>jar</goal>
- </goals>
- <phase>package</phase>
- </execution>
- </executions>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-shade-plugin</artifactId>
- <version>2.4.1</version>
- <executions>
- <!-- In the first phase, we pick dependencies and relocate them. -->
- <execution>
- <id>bundle-and-repackage</id>
- <phase>package</phase>
- <goals>
- <goal>shade</goal>
- </goals>
- <configuration>
- <shadeTestJar>true</shadeTestJar>
- <artifactSet>
- <includes>
- <include>com.google.cloud.bigtable:bigtable-client-core</include>
- <include>com.google.guava:guava</include>
- </includes>
- </artifactSet>
- <filters>
- <filter>
- <artifact>*:*</artifact>
- <excludes>
- <exclude>META-INF/*.SF</exclude>
- <exclude>META-INF/*.DSA</exclude>
- <exclude>META-INF/*.RSA</exclude>
- </excludes>
- </filter>
- </filters>
- <relocations>
- <!-- TODO: Once ready, change the following pattern to 'com'
- only, exclude 'com.google.cloud.dataflow.**', and remove
- the second relocation. -->
- <relocation>
- <pattern>com.google.common</pattern>
- <shadedPattern>com.google.cloud.dataflow.sdk.repackaged.com.google.common</shadedPattern>
- </relocation>
- <relocation>
- <pattern>com.google.thirdparty</pattern>
- <shadedPattern>com.google.cloud.dataflow.sdk.repackaged.com.google.thirdparty</shadedPattern>
- </relocation>
- <relocation>
- <pattern>com.google.cloud.bigtable</pattern>
- <shadedPattern>com.google.cloud.dataflow.sdk.repackaged.com.google.cloud.bigtable</shadedPattern>
- <excludes>
- <exclude>com.google.cloud.bigtable.config.BigtableOptions*</exclude>
- <exclude>com.google.cloud.bigtable.config.CredentialOptions*</exclude>
- <exclude>com.google.cloud.bigtable.config.RetryOptions*</exclude>
- <exclude>com.google.cloud.bigtable.grpc.BigtableClusterName</exclude>
- <exclude>com.google.cloud.bigtable.grpc.BigtableTableName</exclude>
- </excludes>
- </relocation>
- </relocations>
- </configuration>
- </execution>
-
- <!-- In the second phase, we pick remaining dependencies and bundle
- them without repackaging. -->
- <execution>
- <id>bundle-rest-without-repackaging</id>
- <phase>package</phase>
- <goals>
- <goal>shade</goal>
- </goals>
- <configuration>
- <shadeTestJar>true</shadeTestJar>
- <finalName>${project.artifactId}-bundled-${project.version}</finalName>
- <artifactSet>
- <excludes>
- <exclude>com.google.cloud.bigtable:bigtable-client-core</exclude>
- <exclude>com.google.guava:guava</exclude>
- </excludes>
- </artifactSet>
- <filters>
- <filter>
- <artifact>*:*</artifact>
- <excludes>
- <exclude>META-INF/*.SF</exclude>
- <exclude>META-INF/*.DSA</exclude>
- <exclude>META-INF/*.RSA</exclude>
- </excludes>
- </filter>
- </filters>
- </configuration>
- </execution>
- </executions>
- </plugin>
-
- <!-- Coverage analysis for unit tests. -->
- <plugin>
- <groupId>org.jacoco</groupId>
- <artifactId>jacoco-maven-plugin</artifactId>
- </plugin>
-
- <!-- Avro plugin for automatic code generation -->
- <plugin>
- <groupId>org.apache.avro</groupId>
- <artifactId>avro-maven-plugin</artifactId>
- <version>${avro.version}</version>
- <executions>
- <execution>
- <id>schemas</id>
- <phase>generate-sources</phase>
- <goals>
- <goal>schema</goal>
- </goals>
- <configuration>
- <testSourceDirectory>${project.basedir}/src/test/</testSourceDirectory>
- <testOutputDirectory>${project.build.directory}/generated-test-sources/java</testOutputDirectory>
- </configuration>
- </execution>
- </executions>
- </plugin>
-
- <!-- This plugin tells Maven about an additional test-source directory to
- build, which contains Avro-generated source files. This is not
- strictly needed for the regular Maven build, but helps certain IDEs
- automatically find and compile generated code. -->
- <plugin>
- <groupId>org.codehaus.mojo</groupId>
- <artifactId>build-helper-maven-plugin</artifactId>
- <version>1.9.1</version>
- <executions>
- <execution>
- <id>add-test-source</id>
- <phase>generate-test-sources</phase>
- <goals>
- <goal>add-test-source</goal>
- </goals>
- <configuration>
- <sources>
- <source>${project.build.directory}/generated-test-sources/java</source>
- </sources>
- </configuration>
- </execution>
- </executions>
- </plugin>
- </plugins>
- </build>
-
- <dependencies>
- <dependency>
- <groupId>com.google.apis</groupId>
- <artifactId>google-api-services-dataflow</artifactId>
- <version>${dataflow.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>io.grpc</groupId>
- <artifactId>grpc-all</artifactId>
- <version>0.12.0</version>
- </dependency>
-
- <dependency>
- <groupId>com.google.cloud.bigtable</groupId>
- <artifactId>bigtable-protos</artifactId>
- <version>${bigtable.version}</version>
- </dependency>
-
- <dependency>
- <groupId>com.google.cloud.bigtable</groupId>
- <artifactId>bigtable-client-core</artifactId>
- <version>${bigtable.version}</version>
- </dependency>
-
- <dependency>
- <groupId>com.google.api-client</groupId>
- <artifactId>google-api-client</artifactId>
- <version>${google-clients.version}</version>
- <exclusions>
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.apis</groupId>
- <artifactId>google-api-services-bigquery</artifactId>
- <version>${bigquery.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.apis</groupId>
- <artifactId>google-api-services-clouddebugger</artifactId>
- <version>${clouddebugger.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.apis</groupId>
- <artifactId>google-api-services-pubsub</artifactId>
- <version>${pubsub.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.apis</groupId>
- <artifactId>google-api-services-storage</artifactId>
- <version>${storage.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.http-client</groupId>
- <artifactId>google-http-client</artifactId>
- <version>${google-clients.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <!-- Required by com.google.apis:google-api-services-datastore-protobuf,
- but the version they depend on differs from our api-client versions -->
- <dependency>
- <groupId>com.google.http-client</groupId>
- <artifactId>google-http-client-jackson</artifactId>
- <version>${google-clients.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- <!-- Exclude an old version of jackson-core-asl -->
- <exclusion>
- <groupId>org.codehaus.jackson</groupId>
- <artifactId>jackson-core-asl</artifactId>
- </exclusion>
- </exclusions>
- <scope>runtime</scope>
- </dependency>
-
- <dependency>
- <groupId>com.google.http-client</groupId>
- <artifactId>google-http-client-jackson2</artifactId>
- <version>${google-clients.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.http-client</groupId>
- <artifactId>google-http-client-protobuf</artifactId>
- <version>${google-clients.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- <scope>runtime</scope>
- </dependency>
-
- <dependency>
- <groupId>com.google.oauth-client</groupId>
- <artifactId>google-oauth-client-java6</artifactId>
- <version>${google-clients.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.oauth-client</groupId>
- <artifactId>google-oauth-client</artifactId>
- <version>${google-clients.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.apis</groupId>
- <artifactId>google-api-services-datastore-protobuf</artifactId>
- <version>${datastore.version}</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- <!-- Exclude old version of api client dependencies. -->
- <exclusion>
- <groupId>com.google.http-client</groupId>
- <artifactId>google-http-client</artifactId>
- </exclusion>
- <exclusion>
- <groupId>com.google.api-client</groupId>
- <artifactId>google-api-client</artifactId>
- </exclusion>
- <exclusion>
- <groupId>com.google.oauth-client</groupId>
- <artifactId>google-oauth-client</artifactId>
- </exclusion>
- <exclusion>
- <groupId>com.google.http-client</groupId>
- <artifactId>google-http-client-jackson</artifactId>
- </exclusion>
- <exclusion>
- <groupId>com.google.http-client</groupId>
- <artifactId>google-http-client-protobuf</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.cloud.bigdataoss</groupId>
- <artifactId>gcsio</artifactId>
- <version>1.4.3</version>
- </dependency>
-
- <dependency>
- <groupId>com.google.cloud.bigdataoss</groupId>
- <artifactId>util</artifactId>
- <version>1.4.3</version>
- </dependency>
-
- <dependency>
- <groupId>com.google.guava</groupId>
- <artifactId>guava</artifactId>
- <!-- If updating version, please update the javadoc offlineLink -->
- <version>${guava.version}</version>
- </dependency>
-
- <dependency>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-testlib</artifactId>
- <version>${guava.version}</version>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>com.google.protobuf</groupId>
- <artifactId>protobuf-java</artifactId>
- <version>${protobuf.version}</version>
- </dependency>
-
- <dependency>
- <groupId>com.google.code.findbugs</groupId>
- <artifactId>jsr305</artifactId>
- <version>${jsr305.version}</version>
- </dependency>
-
- <dependency>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-core</artifactId>
- <version>${jackson.version}</version>
- </dependency>
-
- <dependency>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-annotations</artifactId>
- <version>${jackson.version}</version>
- </dependency>
-
- <dependency>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-databind</artifactId>
- <version>${jackson.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- <version>${slf4j.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.avro</groupId>
- <artifactId>avro</artifactId>
- <version>${avro.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.xerial.snappy</groupId>
- <artifactId>snappy-java</artifactId>
- <version>1.1.2.1</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-compress</artifactId>
- <version>1.9</version>
- </dependency>
-
- <dependency>
- <groupId>joda-time</groupId>
- <artifactId>joda-time</artifactId>
- <version>${joda.version}</version>
- </dependency>
-
- <!--
- To use com.google.cloud.dataflow.sdk.io.XmlSource:
-
- 1. Explicitly declare the following dependency for the stax2 API.
- 2. Include a stax2 implementation on the classpath. One example
- is given below as an optional runtime dependency on woodstox-core-asl
- -->
- <dependency>
- <groupId>org.codehaus.woodstox</groupId>
- <artifactId>stax2-api</artifactId>
- <version>${stax2.version}</version>
- <optional>true</optional>
- </dependency>
-
- <dependency>
- <groupId>org.codehaus.woodstox</groupId>
- <artifactId>woodstox-core-asl</artifactId>
- <version>${woodstox.version}</version>
- <scope>runtime</scope>
- <optional>true</optional>
- <exclusions>
- <!-- javax.xml.stream:stax-api is included in JDK 1.6+ -->
- <exclusion>
- <groupId>javax.xml.stream</groupId>
- <artifactId>stax-api</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <!--
- To use com.google.cloud.dataflow.sdk.io.AvroSource with XZ-encoded files,
- please explicitly declare this dependency to include org.tukaani:xz on
- the classpath at runtime.
- -->
- <dependency>
- <groupId>org.tukaani</groupId>
- <artifactId>xz</artifactId>
- <version>1.5</version>
- <scope>runtime</scope>
- <optional>true</optional>
- </dependency>
-
- <!-- build dependencies -->
- <dependency>
- <groupId>com.google.auto.service</groupId>
- <artifactId>auto-service</artifactId>
- <version>1.0-rc2</version>
- <optional>true</optional>
- </dependency>
-
- <!-- test dependencies -->
- <dependency>
- <groupId>org.hamcrest</groupId>
- <artifactId>hamcrest-all</artifactId>
- <version>${hamcrest.version}</version>
- <scope>provided</scope>
- </dependency>
-
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>${junit.version}</version>
- <scope>provided</scope>
- </dependency>
-
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-jdk14</artifactId>
- <version>${slf4j.version}</version>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.mockito</groupId>
- <artifactId>mockito-all</artifactId>
- <version>1.10.19</version>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>com.google.cloud.dataflow</groupId>
- <artifactId>google-cloud-dataflow-java-proto-library-all</artifactId>
- <version>0.5.160304</version>
- <scope>test</scope>
- </dependency>
- </dependencies>
-</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/Pipeline.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/Pipeline.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/Pipeline.java
deleted file mode 100644
index b166673..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/Pipeline.java
+++ /dev/null
@@ -1,502 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk;
-
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.io.Read;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
-import com.google.cloud.dataflow.sdk.runners.TransformHierarchy;
-import com.google.cloud.dataflow.sdk.runners.TransformTreeNode;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.util.UserCodeException;
-import com.google.cloud.dataflow.sdk.values.PBegin;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.POutput;
-import com.google.cloud.dataflow.sdk.values.PValue;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Multimap;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-/**
- * A {@link Pipeline} manages a directed acyclic graph of {@link PTransform PTransforms}, and the
- * {@link PCollection PCollections} that the {@link PTransform}s consume and produce.
- *
- * <p>A {@link Pipeline} is initialized with a {@link PipelineRunner} that will later
- * execute the {@link Pipeline}.
- *
- * <p>{@link Pipeline Pipelines} are independent, so they can be constructed and executed
- * concurrently.
- *
- * <p>Each {@link Pipeline} is self-contained and isolated from any other
- * {@link Pipeline}. The {@link PValue PValues} that are inputs and outputs of each of a
- * {@link Pipeline Pipeline's} {@link PTransform PTransforms} are also owned by that
- * {@link Pipeline}. A {@link PValue} owned by one {@link Pipeline} can be read only by
- * {@link PTransform PTransforms} also owned by that {@link Pipeline}.
- *
- * <p>Here is a typical example of use:
- * <pre> {@code
- * // Start by defining the options for the pipeline.
- * PipelineOptions options = PipelineOptionsFactory.create();
- * // Then create the pipeline. The runner is determined by the options.
- * Pipeline p = Pipeline.create(options);
- *
- * // A root PTransform, like TextIO.Read or Create, gets added
- * // to the Pipeline by being applied:
- * PCollection<String> lines =
- * p.apply(TextIO.Read.from("gs://bucket/dir/file*.txt"));
- *
- * // A Pipeline can have multiple root transforms:
- * PCollection<String> moreLines =
- * p.apply(TextIO.Read.from("gs://bucket/other/dir/file*.txt"));
- * PCollection<String> yetMoreLines =
- * p.apply(Create.of("yet", "more", "lines").withCoder(StringUtf8Coder.of()));
- *
- * // Further PTransforms can be applied, in an arbitrary (acyclic) graph.
- * // Subsequent PTransforms (and intermediate PCollections etc.) are
- * // implicitly part of the same Pipeline.
- * PCollection<String> allLines =
- * PCollectionList.of(lines).and(moreLines).and(yetMoreLines)
- * .apply(new Flatten<String>());
- * PCollection<KV<String, Integer>> wordCounts =
- * allLines
- * .apply(ParDo.of(new ExtractWords()))
- * .apply(new Count<String>());
- * PCollection<String> formattedWordCounts =
- * wordCounts.apply(ParDo.of(new FormatCounts()));
- * formattedWordCounts.apply(TextIO.Write.to("gs://bucket/dir/counts.txt"));
- *
- * // PTransforms aren't executed when they're applied, rather they're
- * // just added to the Pipeline. Once the whole Pipeline of PTransforms
- * // is constructed, the Pipeline's PTransforms can be run using a
- * // PipelineRunner. The default PipelineRunner executes the Pipeline
- * // directly, sequentially, in this one process, which is useful for
- * // unit tests and simple experiments:
- * p.run();
- *
- * } </pre>
- */
-public class Pipeline {
- private static final Logger LOG = LoggerFactory.getLogger(Pipeline.class);
-
- /**
- * Thrown during execution of a {@link Pipeline}, whenever user code within that
- * {@link Pipeline} throws an exception.
- *
- * <p>The original exception thrown by user code may be retrieved via {@link #getCause}.
- *
- * <p>Within this class, {@link Pipeline#run} throws it when the runner reports a
- * {@link UserCodeException}, using that exception's cause as the cause of this one.
- */
- public static class PipelineExecutionException extends RuntimeException {
- /**
- * Wraps {@code cause} into a {@link PipelineExecutionException}.
- *
- * @param cause the exception to record as the cause of this one
- */
- public PipelineExecutionException(Throwable cause) {
- super(cause);
- }
- }
-
- /////////////////////////////////////////////////////////////////////////////
- // Public operations.
-
- /**
- * Constructs a pipeline from the provided options.
- *
- * @return The newly created pipeline.
- */
- public static Pipeline create(PipelineOptions options) {
- Pipeline pipeline = new Pipeline(PipelineRunner.fromOptions(options), options);
- LOG.debug("Creating {}", pipeline);
- return pipeline;
- }
-
- /**
- * Returns a {@link PBegin} owned by this {@link Pipeline}, obtained via {@link PBegin#in}.
- *
- * <p>This is useful as the input of a root {@link PTransform} such as {@link Read} or
- * {@link Create}.
- *
- * @return a {@link PBegin} created for this pipeline
- */
- public PBegin begin() {
- return PBegin.in(this);
- }
-
- /**
- * Like {@link #apply(String, PTransform)} but the transform node in the {@link Pipeline}
- * graph will be named according to {@link PTransform#getName}.
- *
- * <p>Alias for {@code begin().apply(root)}.
- *
- * @param root the root transform to apply to this pipeline's {@link PBegin}
- * @return the output produced by applying {@code root}
- * @see #apply(String, PTransform)
- */
- public <OutputT extends POutput> OutputT apply(
- PTransform<? super PBegin, OutputT> root) {
- return begin().apply(root);
- }
-
- /**
- * Adds a root {@link PTransform}, such as {@link Read} or {@link Create},
- * to this {@link Pipeline}.
- *
- * <p>The node in the {@link Pipeline} graph will use the provided {@code name}.
- * This name is used in various places, including the monitoring UI, logging,
- * and to stably identify this node in the {@link Pipeline} graph upon update.
- *
- * <p>Alias for {@code begin().apply(name, root)}.
- *
- * @param name the name to give this application of {@code root}
- * @param root the root transform to apply to this pipeline's {@link PBegin}
- * @return the output produced by applying {@code root}
- */
- public <OutputT extends POutput> OutputT apply(
- String name, PTransform<? super PBegin, OutputT> root) {
- return begin().apply(name, root);
- }
-
- /**
- * Runs the {@link Pipeline} using its {@link PipelineRunner}.
- *
- * <p>The run is logged at debug level before it is handed to the runner.
- *
- * @return the {@link PipelineResult} returned by the runner
- * @throws PipelineExecutionException if the runner throws a {@link UserCodeException}; the
- * cause of the thrown exception is the cause of that {@link UserCodeException}, not the
- * {@link UserCodeException} itself
- */
- public PipelineResult run() {
- LOG.debug("Running {} via {}", this, runner);
- try {
- return runner.run(this);
- } catch (UserCodeException e) {
- // This serves to replace the stack with one that ends here and
- // is caused by the caught UserCodeException, thereby splicing
- // out all the stack frames in between the PipelineRunner itself
- // and where the worker calls into the user's code.
- throw new PipelineExecutionException(e.getCause());
- }
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
- // Below here are operations that aren't normally called by users.
-
- /**
- * Returns the {@link CoderRegistry} that this {@link Pipeline} uses.
- *
- * <p>If no registry is currently set, a new {@link CoderRegistry} is created on this call,
- * populated through {@link CoderRegistry#registerStandardCoders}, stored in this
- * {@link Pipeline}, and returned. The creation is not synchronized.
- *
- * @return the registry held by this pipeline; never {@code null}
- */
- public CoderRegistry getCoderRegistry() {
- if (coderRegistry == null) {
- coderRegistry = new CoderRegistry();
- coderRegistry.registerStandardCoders();
- }
- return coderRegistry;
- }
-
- /**
- * Sets the {@link CoderRegistry} that this {@link Pipeline} uses.
- *
- * <p>The argument is stored as-is, without validation. Storing {@code null} means the next
- * call to {@link #getCoderRegistry} creates a fresh registry with the standard coders.
- *
- * @param coderRegistry the registry to use from now on
- */
- public void setCoderRegistry(CoderRegistry coderRegistry) {
- this.coderRegistry = coderRegistry;
- }
-
- /**
-  * Callback interface handed to {@link Pipeline#traverseTopologically}; its methods are
-  * called for each of the transforms and values in the {@link Pipeline}.
-  */
- public interface PipelineVisitor {
-   /**
-    * Invoked for a composite transform once all of its topological predecessors have been
-    * visited, and before any of its component transforms.
-    */
-   void enterCompositeTransform(TransformTreeNode node);
-
-   /**
-    * Invoked for a composite transform once all of its component transforms, and their
-    * outputs, have been visited.
-    */
-   void leaveCompositeTransform(TransformTreeNode node);
-
-   /**
-    * Invoked for a primitive transform once all of its topological predecessors and its
-    * inputs have been visited.
-    */
-   void visitTransform(TransformTreeNode node);
-
-   /**
-    * Invoked for a value once the transform that produced it has been visited.
-    */
-   void visitValue(PValue value, TransformTreeNode producer);
- }
-
- /**
-  * Walks this {@link Pipeline Pipeline's} transform and value nodes in forward topological
-  * order, calling the {@link PipelineVisitor PipelineVisitor's}
-  * {@link PipelineVisitor#visitTransform} and {@link PipelineVisitor#visitValue} operations
-  * on them.
-  *
-  * <p>Traversal of the {@link Pipeline} causes {@link PTransform PTransforms} and
-  * {@link PValue PValues} owned by the {@link Pipeline} to be marked as finished,
-  * at which point they may no longer be modified.
-  *
-  * <p>Typically invoked by {@link PipelineRunner} subclasses.
-  *
-  * @param visitor the visitor to call back during the walk
-  * @throws RuntimeException if some value added to this pipeline was not visited
-  */
- public void traverseTopologically(PipelineVisitor visitor) {
-   Set<PValue> seen = new HashSet<>();
-   // Walking every transform is expected to reach every value along the way.
-   transforms.visit(visitor, seen);
-   if (seen.containsAll(values)) {
-     return;
-   }
-   throw new RuntimeException(
-       "internal error: should have visited all the values "
-       + "after visiting all the transforms");
- }
-
- /**
- * Like {@link #applyTransform(String, PInput, PTransform)} but defaulting to the name
- * provided by the {@link PTransform}.
- *
- * <p>The transform is applied within the {@link Pipeline} returned by
- * {@code input.getPipeline()}.
- *
- * @param input the input the transform is applied to
- * @param transform the transform to apply; its {@link PTransform#getName} is used as the name
- * @return the output of applying {@code transform} to {@code input}
- */
- public static <InputT extends PInput, OutputT extends POutput>
- OutputT applyTransform(InputT input,
- PTransform<? super InputT, OutputT> transform) {
- return input.getPipeline().applyInternal(transform.getName(), input, transform);
- }
-
- /**
- * Applies the given {@code PTransform} to this input {@code InputT} and returns
- * its {@code OutputT}. This uses {@code name} to identify this specific application
- * of the transform. This name is used in various places, including the monitoring UI,
- * logging, and to stably identify this application node in the {@link Pipeline} graph during
- * update.
- *
- * <p>Each {@link PInput} subclass that provides an {@code apply} method should delegate to
- * this method to ensure proper registration with the {@link PipelineRunner}.
- *
- * <p>The transform is applied within the {@link Pipeline} returned by
- * {@code input.getPipeline()}.
- *
- * @param name the name identifying this application of {@code transform}
- * @param input the input the transform is applied to
- * @param transform the transform to apply
- * @return the output of applying {@code transform} to {@code input}
- */
- public static <InputT extends PInput, OutputT extends POutput>
- OutputT applyTransform(String name, InputT input,
- PTransform<? super InputT, OutputT> transform) {
- return input.getPipeline().applyInternal(name, input, transform);
- }
-
- /////////////////////////////////////////////////////////////////////////////
- // Below here are internal operations, never called by users.
-
- private final PipelineRunner<?> runner;
- private final PipelineOptions options;
- private final TransformHierarchy transforms = new TransformHierarchy();
- private Collection<PValue> values = new ArrayList<>();
- private Set<String> usedFullNames = new HashSet<>();
- private CoderRegistry coderRegistry;
- private Multimap<PTransform<?, ?>, AppliedPTransform<?, ?, ?>> transformApplicationsForTesting =
- HashMultimap.create();
-
- /**
- * Creates a {@link Pipeline} for {@code runner}, using the options returned by
- * {@link PipelineOptionsFactory#create()}.
- *
- * @param runner the runner that will execute this pipeline
- * @deprecated replaced by {@link #Pipeline(PipelineRunner, PipelineOptions)}
- */
- @Deprecated
- protected Pipeline(PipelineRunner<?> runner) {
- this(runner, PipelineOptionsFactory.create());
- }
-
- /**
- * Creates a {@link Pipeline} that keeps the given runner and options; both are stored
- * as-is and later exposed through {@link #getRunner} and {@link #getOptions}.
- *
- * @param runner the runner that will execute this pipeline
- * @param options the options this pipeline is configured with
- */
- protected Pipeline(PipelineRunner<?> runner, PipelineOptions options) {
- this.runner = runner;
- this.options = options;
- }
-
- /**
- * Returns the text {@code "Pipeline#"} followed by this object's {@link #hashCode}.
- * This is the form in which the pipeline appears in this class's debug log messages.
- */
- @Override
- public String toString() {
- return "Pipeline#" + hashCode();
- }
-
- /**
- * Applies a {@link PTransform} to the given {@link PInput}.
- *
- * <p>The steps, in order, are:
- * <ol>
- * <li>{@code input.finishSpecifying()} is called.
- * <li>A full name is derived from the current node of the transform hierarchy and
- * {@code name}, made unique within this {@link Pipeline} by {@link #uniquifyInternal}.
- * If uniquifying had to alter the name, the reaction is chosen by
- * {@code getOptions().getStableUniqueNames()}: nothing, a logged warning, or an
- * {@link IllegalStateException}.
- * <li>A new {@link TransformTreeNode} is added under the current node and {@code input}
- * is registered as its input.
- * <li>With the new node pushed as the current node, the transform is validated against the
- * input, applied through the runner, and its output is recorded and verified by
- * {@link #verifyOutputState}. The node is popped again in a {@code finally} block, so
- * this happens even when one of these steps throws.
- * </ol>
- *
- * @param name the requested name for this application of {@code transform}
- * @param input the input the transform is applied to
- * @param transform the transform to apply
- * @return the output returned by the runner for this application
- * @throws IllegalStateException if the name had to be altered to be unique and the stable
- * unique names option is {@code ERROR}
- * @throws IllegalArgumentException if the stable unique names option has an unrecognized value
- * @see Pipeline#apply
- */
- private <InputT extends PInput, OutputT extends POutput>
- OutputT applyInternal(String name, InputT input,
- PTransform<? super InputT, OutputT> transform) {
- input.finishSpecifying();
-
- TransformTreeNode parent = transforms.getCurrent();
- String namePrefix = parent.getFullName();
- String fullName = uniquifyInternal(namePrefix, name);
-
- // uniquifyInternal returns the plain prefix/name combination unless it was already taken.
- boolean nameIsUnique = fullName.equals(buildName(namePrefix, name));
-
- if (!nameIsUnique) {
- switch (getOptions().getStableUniqueNames()) {
- case OFF:
- break;
- case WARNING:
- LOG.warn("Transform {} does not have a stable unique name. "
- + "This will prevent updating of pipelines.", fullName);
- break;
- case ERROR:
- throw new IllegalStateException(
- "Transform " + fullName + " does not have a stable unique name. "
- + "This will prevent updating of pipelines.");
- default:
- throw new IllegalArgumentException(
- "Unrecognized value for stable unique names: " + getOptions().getStableUniqueNames());
- }
- }
-
- TransformTreeNode child =
- new TransformTreeNode(parent, transform, fullName, input);
- parent.addComposite(child);
-
- transforms.addInput(child, input);
-
- LOG.debug("Adding {} to {}", transform, this);
- try {
- // While the child is the current node, transforms applied from inside runner.apply
- // read it back through transforms.getCurrent() and so are added beneath it.
- transforms.pushNode(child);
- transform.validate(input);
- OutputT output = runner.apply(transform, input);
- transforms.setOutput(child, output);
-
- AppliedPTransform<?, ?, ?> applied = AppliedPTransform.of(
- child.getFullName(), input, output, transform);
- transformApplicationsForTesting.put(transform, applied);
- // recordAsOutput is a NOOP if already called;
- output.recordAsOutput(applied);
- verifyOutputState(output, child);
- return output;
- } finally {
- transforms.popNode();
- }
- }
-
- /**
-  * Collects the producing transform of every {@link PValue} that {@code output} expands to,
-  * leaving out values that have no producing transform recorded.
-  *
-  * @param output the output whose expanded values are inspected
-  * @return the producing transforms found, in expansion order
-  */
- private List<AppliedPTransform<?, ?, ?>> getProducingTransforms(POutput output) {
-   List<AppliedPTransform<?, ?, ?>> producers = new ArrayList<>();
-   for (PValue expanded : output.expand()) {
-     AppliedPTransform<?, ?, ?> producer = expanded.getProducingTransformInternal();
-     if (producer == null) {
-       continue;
-     }
-     producers.add(producer);
-   }
-   return producers;
- }
-
- /**
-  * Checks that the output of a {@link PTransform} is consistently registered for its
-  * {@link TransformTreeNode} in the {@link Pipeline} graph.
-  *
-  * <p>For a non-composite {@link PTransform}, every recorded producer of its outputs must
-  * be that very {@link PTransform}.
-  *
-  * <p>For a composite {@link PTransform}, the outputs must be registered as produced by the
-  * primitive {@link PTransform PTransforms} it contains. Those were already checked when
-  * they were applied, so the only failure left to detect is the composite
-  * {@link PTransform} being recorded as the producer of its own output.
-  *
-  * @throws IllegalArgumentException if an output of a non-composite transform is registered
-  *     as produced by a different transform
-  * @throws IllegalStateException if an output of a composite transform is registered as
-  *     produced by the composite transform itself
-  */
- private void verifyOutputState(POutput output, TransformTreeNode node) {
-   boolean composite = node.isCompositeNode();
-   PTransform<?, ?> thisTransform = node.getTransform();
-   for (AppliedPTransform<?, ?, ?> producingTransform : getProducingTransforms(output)) {
-     // Object identity, not equals(): the same instance means the same node in the pipeline.
-     boolean producedByThis = thisTransform == producingTransform.getTransform();
-     if (composite) {
-       if (producedByThis) {
-         throw new IllegalStateException("Output of composite transform "
-             + thisTransform + " is registered as being produced by it,"
-             + " but the output of every composite transform should be"
-             + " produced by a primitive transform contained therein.");
-       }
-     } else if (!producedByThis) {
-       throw new IllegalArgumentException("Output of non-composite transform "
-           + thisTransform + " is registered as being produced by"
-           + " a different transform: " + producingTransform);
-     }
-   }
- }
-
- /**
- * Returns the configured {@link PipelineRunner}.
- *
- * @return the runner this {@link Pipeline} was constructed with
- */
- public PipelineRunner<?> getRunner() {
- return runner;
- }
-
- /**
- * Returns the configured {@link PipelineOptions}.
- *
- * @return the options this {@link Pipeline} was constructed with
- */
- public PipelineOptions getOptions() {
- return options;
- }
-
- /**
-  * Returns the full name under which {@code transform} was applied in this {@link Pipeline}.
-  *
-  * @param transform a transform that has been applied exactly once in this pipeline
-  * @return the full name recorded for that single application
-  * @throws IllegalStateException if {@code transform} has no recorded application in this
-  *     pipeline, or has more than one
-  * @deprecated this method is no longer compatible with the design of {@link Pipeline},
-  *     as {@link PTransform PTransforms} can be applied multiple times, with different names
-  *     each time.
-  */
- @Deprecated
- public String getFullNameForTesting(PTransform<?, ?> transform) {
-   Collection<AppliedPTransform<?, ?, ?>> uses =
-       transformApplicationsForTesting.get(transform);
-   // Message templates are formatted only when a check fails, so the transform is not
-   // converted to a string on the successful path.
-   Preconditions.checkState(!uses.isEmpty(), "Unknown transform: %s", transform);
-   Preconditions.checkState(uses.size() <= 1, "Transform used multiple times: %s", transform);
-   return Iterables.getOnlyElement(uses).getFullName();
- }
-
- /**
-  * Reserves and returns a full name, not yet used in this {@link Pipeline}, for a transform
-  * with the given prefix (from enclosing transforms) and initial name.
-  *
-  * <p>The plain combination of prefix and name is tried first; while the candidate is
-  * already taken, {@code origName} followed by 2, 3, 4, ... is tried instead. The returned
-  * name is recorded as used.
-  *
-  * <p>For internal use only.
-  */
- private String uniquifyInternal(String namePrefix, String origName) {
-   String candidate = buildName(namePrefix, origName);
-   for (int suffixNum = 2; !usedFullNames.add(candidate); suffixNum++) {
-     // The candidate was a duplicate: retry with the next numeric suffix.
-     candidate = buildName(namePrefix, origName + suffixNum);
-   }
-   return candidate;
- }
-
- /**
-  * Joins a "/"-delimited prefix and a name into a full name.
-  *
-  * @return {@code name} alone when {@code namePrefix} is empty, otherwise the prefix and
-  *     the name separated by "/"
-  */
- private String buildName(String namePrefix, String name) {
-   if (namePrefix.isEmpty()) {
-     return name;
-   }
-   return namePrefix + "/" + name;
- }
-
- /**
- * Adds the given {@link PValue} to this {@link Pipeline}.
- *
- * <p>The value joins the collection that {@link #traverseTopologically} later checks to
- * confirm that every value was visited. The addition is logged at debug level.
- *
- * <p>For internal use only.
- *
- * @param value the value to record as belonging to this pipeline
- */
- public void addValueInternal(PValue value) {
- this.values.add(value);
- LOG.debug("Adding {} to {}", value, this);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/PipelineResult.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/PipelineResult.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/PipelineResult.java
deleted file mode 100644
index 6b9a36b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/PipelineResult.java
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk;
-
-import com.google.cloud.dataflow.sdk.runners.AggregatorRetrievalException;
-import com.google.cloud.dataflow.sdk.runners.AggregatorValues;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-
-/**
- * The outcome handle returned by {@link Pipeline#run()}.
- */
-public interface PipelineResult {
-
-  /**
-   * Reports the state the pipeline execution is currently in.
-   *
-   * @return the {@link State} representing the state of this pipeline.
-   */
-  State getState();
-
-  /**
-   * Looks up the current value of the given {@link Aggregator}.
-   *
-   * @param aggregator the {@link Aggregator} to retrieve values for.
-   * @return the current values of the {@link Aggregator},
-   *     which may be empty if there are no values yet.
-   * @throws AggregatorRetrievalException if the {@link Aggregator} values could not be retrieved.
-   */
-  <T> AggregatorValues<T> getAggregatorValues(Aggregator<?, T> aggregator)
-      throws AggregatorRetrievalException;
-
-  // TODO: method to retrieve error messages.
-
-  /**
-   * Named constants for common values for the job state.
-   *
-   * <p>Each constant carries two flags: whether the state is terminal, and whether it
-   * indicates that a replacement job exists.
-   */
-  enum State {
-
-    /** The job state could not be obtained or was not specified. */
-    UNKNOWN(false, false),
-
-    /** The job has been paused, or has not yet started. */
-    STOPPED(false, false),
-
-    /** The job is currently running. */
-    RUNNING(false, false),
-
-    /** The job has successfully completed. */
-    DONE(true, false),
-
-    /** The job has failed. */
-    FAILED(true, false),
-
-    /** The job has been explicitly cancelled. */
-    CANCELLED(true, false),
-
-    /** The job has been updated. */
-    UPDATED(true, true);
-
-    /** Whether a job in this state can no longer complete work. */
-    private final boolean terminal;
-
-    /** Whether this state indicates that a replacement job exists. */
-    private final boolean hasReplacement;
-
-    State(boolean terminal, boolean hasReplacement) {
-      this.terminal = terminal;
-      this.hasReplacement = hasReplacement;
-    }
-
-    /**
-     * @return {@code true} if the job state can no longer complete work.
-     */
-    public final boolean isTerminal() {
-      return terminal;
-    }
-
-    /**
-     * @return {@code true} if this job state indicates that a replacement job exists.
-     */
-    public final boolean hasReplacementJob() {
-      return hasReplacement;
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/annotations/Experimental.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/annotations/Experimental.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/annotations/Experimental.java
deleted file mode 100644
index cac2aa8..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/annotations/Experimental.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.annotations;
-
-import java.lang.annotation.Documented;
-import java.lang.annotation.ElementType;
-import java.lang.annotation.Retention;
-import java.lang.annotation.RetentionPolicy;
-import java.lang.annotation.Target;
-
-/**
- * Marks a public API (public class, method or field) as subject to incompatible changes,
- * or even removal, in a future release. An API bearing this annotation is exempt from any
- * compatibility guarantees made by its containing library. Note that the presence of this
- * annotation implies nothing about the quality or performance of the API in question, only
- * the fact that it is not "API-frozen."
- *
- * <p>It is generally safe for <i>applications</i> to depend on experimental
- * APIs, at the cost of some extra work during upgrades. However, it is
- * generally inadvisable for <i>libraries</i> (which get included on users'
- * class paths, outside the library developers' control) to do so.
- */
-@Retention(RetentionPolicy.CLASS)
-@Target({
-    ElementType.ANNOTATION_TYPE,
-    ElementType.CONSTRUCTOR,
-    ElementType.FIELD,
-    ElementType.METHOD,
-    ElementType.TYPE})
-@Documented
-public @interface Experimental {
-  /** The kind of experimental API being marked; {@link Kind#UNSPECIFIED} when omitted. */
-  Kind value() default Kind.UNSPECIFIED;
-
-  /**
-   * The various kinds of experimental APIs.
-   */
-  enum Kind {
-    /** Generic group of experimental APIs. This is the default value. */
-    UNSPECIFIED,
-
-    /** Sources and sinks related experimental APIs. */
-    SOURCE_SINK,
-
-    /** Auto-scaling related experimental APIs. */
-    AUTOSCALING,
-
-    /** Trigger-related experimental APIs. */
-    TRIGGER,
-
-    /** Aggregator-related experimental APIs. */
-    AGGREGATOR,
-
-    /** Experimental APIs for Coder binary format identifiers. */
-    CODER_ENCODING_ID,
-
-    /** State-related experimental APIs. */
-    STATE,
-
-    /** Timer-related experimental APIs. */
-    TIMERS,
-
-    /** Experimental APIs related to customizing the output time for computed values. */
-    OUTPUT_TIME
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/annotations/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/annotations/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/annotations/package-info.java
deleted file mode 100644
index 6c224a6..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/annotations/package-info.java
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/**
- * Defines annotations used across the SDK.
- */
-package com.google.cloud.dataflow.sdk.annotations;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/AtomicCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/AtomicCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/AtomicCoder.java
deleted file mode 100644
index c4951b4..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/AtomicCoder.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import java.util.Collections;
-import java.util.List;
-
-/**
- * A {@link Coder} that has no component {@link Coder Coders} or other state.
- *
- * <p>Note that, unless the behavior is overridden, atomic coders are presumed to be deterministic
- * and all instances are considered equal.
- *
- * @param <T> the type of the values being transcoded
- */
-public abstract class AtomicCoder<T> extends DeterministicStandardCoder<T> {
- /** Does nothing; callable only from subclasses and from within this package. */
- protected AtomicCoder() { }
-
- /**
- * Always returns {@code null}; callers must be prepared for that rather than a list.
- *
- * <p>NOTE(review): presumably {@code null} is how the overridden method signals "no coder
- * arguments", in contrast to the empty list returned by {@link #getInstanceComponents} —
- * confirm against the contract of the overridden method.
- */
- @Override
- public List<Coder<?>> getCoderArguments() {
- return null;
- }
-
- /**
- * Returns a list of values contained in the provided example
- * value, one per type parameter. If there are no type parameters,
- * returns an empty list.
- *
- * <p>Because {@link AtomicCoder} has no components, always returns an empty list.
- *
- * @param exampleValue unused, but part of the latent interface expected by
- * {@link CoderFactories#fromStaticMethods}
- * @return the list returned by {@link Collections#emptyList()}
- */
- public static <T> List<Object> getInstanceComponents(T exampleValue) {
- return Collections.emptyList();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/AvroCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/AvroCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/AvroCoder.java
deleted file mode 100644
index 91efb43..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/AvroCoder.java
+++ /dev/null
@@ -1,714 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import static com.google.cloud.dataflow.sdk.util.Structs.addString;
-
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import org.apache.avro.Schema;
-import org.apache.avro.generic.GenericDatumReader;
-import org.apache.avro.generic.GenericDatumWriter;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.avro.generic.IndexedRecord;
-import org.apache.avro.io.BinaryDecoder;
-import org.apache.avro.io.BinaryEncoder;
-import org.apache.avro.io.DatumReader;
-import org.apache.avro.io.DatumWriter;
-import org.apache.avro.io.DecoderFactory;
-import org.apache.avro.io.EncoderFactory;
-import org.apache.avro.reflect.AvroEncode;
-import org.apache.avro.reflect.AvroName;
-import org.apache.avro.reflect.AvroSchema;
-import org.apache.avro.reflect.ReflectData;
-import org.apache.avro.reflect.ReflectDatumReader;
-import org.apache.avro.reflect.ReflectDatumWriter;
-import org.apache.avro.reflect.Union;
-import org.apache.avro.specific.SpecificData;
-import org.apache.avro.util.ClassUtils;
-import org.apache.avro.util.Utf8;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.Serializable;
-import java.lang.reflect.Field;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.SortedMap;
-import java.util.SortedSet;
-
-import javax.annotation.Nullable;
-
-/**
- * A {@link Coder} using Avro binary format.
- *
- * <p>Each instance of {@code AvroCoder<T>} encapsulates an Avro schema for objects of type
- * {@code T}.
- *
- * <p>The Avro schema may be provided explicitly via {@link AvroCoder#of(Class, Schema)} or
- * omitted via {@link AvroCoder#of(Class)}, in which case it will be inferred
- * using Avro's {@link org.apache.avro.reflect.ReflectData}.
- *
- * <p>For complete details about schema generation and how it can be controlled please see
- * the {@link org.apache.avro.reflect} package.
- * Only concrete classes with a no-argument constructor can be mapped to Avro records.
- * All inherited fields that are not static or transient are included. Fields are not permitted to
- * be null unless annotated by {@link Nullable} or a {@link Union} schema
- * containing {@code "null"}.
- *
- * <p>To use, specify the {@code Coder} type on a PCollection:
- * <pre>
- * {@code
- * PCollection<MyCustomElement> records =
- * input.apply(...)
- * .setCoder(AvroCoder.of(MyCustomElement.class));
- * }
- * </pre>
- *
- * <p>or annotate the element class using {@code @DefaultCoder}.
- * <pre><code>
- * {@literal @}DefaultCoder(AvroCoder.class)
- * public class MyCustomElement {
- * ...
- * }
- * </code></pre>
- *
- * <p>The implementation attempts to determine if the Avro encoding of the given type will satisfy
- * the criteria of {@link Coder#verifyDeterministic} by inspecting both the type and the
- * Schema provided or generated by Avro. Only coders that are deterministic can be used in
- * {@link com.google.cloud.dataflow.sdk.transforms.GroupByKey} operations.
- *
- * @param <T> the type of elements handled by this coder
- */
-public class AvroCoder<T> extends StandardCoder<T> {
-
- /**
- * Returns an {@code AvroCoder} instance for the provided element type.
- * @param <T> the element type
- */
- public static <T> AvroCoder<T> of(TypeDescriptor<T> type) {
- @SuppressWarnings("unchecked")
- Class<T> clazz = (Class<T>) type.getRawType();
- return of(clazz);
- }
-
- /**
- * Returns an {@code AvroCoder} instance for the provided element class.
- * @param <T> the element type
- */
- public static <T> AvroCoder<T> of(Class<T> clazz) {
- return new AvroCoder<>(clazz, ReflectData.get().getSchema(clazz));
- }
-
- /**
- * Returns an {@code AvroCoder} instance for the Avro schema. The implicit
- * type is GenericRecord.
- */
- public static AvroCoder<GenericRecord> of(Schema schema) {
- return new AvroCoder<>(GenericRecord.class, schema);
- }
-
- /**
- * Returns an {@code AvroCoder} instance for the provided element type
- * using the provided Avro schema.
- *
- * <p>If the type argument is GenericRecord, the schema may be arbitrary.
- * Otherwise, the schema must correspond to the type provided.
- *
- * @param <T> the element type
- */
- public static <T> AvroCoder<T> of(Class<T> type, Schema schema) {
- return new AvroCoder<>(type, schema);
- }
-
- /**
- * Returns an {@code AvroCoder} built from a class name and the textual form of an Avro schema,
- * bound to the {@code "type"} and {@code "schema"} JSON properties.
- *
- * @param classType name of the element class, loaded with {@link Class#forName(String)}
- * @param schema schema text, parsed with a fresh {@link Schema.Parser}
- * @throws ClassNotFoundException if {@code classType} cannot be loaded
- */
- @SuppressWarnings({"unchecked", "rawtypes"})
- @JsonCreator
- public static AvroCoder<?> of(
- @JsonProperty("type") String classType,
- @JsonProperty("schema") String schema) throws ClassNotFoundException {
- Schema.Parser parser = new Schema.Parser();
- return new AvroCoder(Class.forName(classType), parser.parse(schema));
- }
-
- public static final CoderProvider PROVIDER = new CoderProvider() {
- @Override
- public <T> Coder<T> getCoder(TypeDescriptor<T> typeDescriptor) {
- // This is a downcast from `? super T` to T. However, because
- // it comes from a TypeDescriptor<T>, the class object itself
- // is the same so the supertype in question shares the same
- // generated AvroCoder schema.
- @SuppressWarnings("unchecked")
- Class<T> rawType = (Class<T>) typeDescriptor.getRawType();
- return AvroCoder.of(rawType);
- }
- };
-
- private final Class<T> type;
- private final Schema schema;
-
- private final List<String> nonDeterministicReasons;
-
- // Factories allocated by .get() are thread-safe and immutable.
- private static final EncoderFactory ENCODER_FACTORY = EncoderFactory.get();
- private static final DecoderFactory DECODER_FACTORY = DecoderFactory.get();
- // Cache the old encoder/decoder and let the factories reuse them when possible. To be threadsafe,
- // these are ThreadLocal. This code does not need to be re-entrant as AvroCoder does not use
- // an inner coder.
- private final ThreadLocal<BinaryDecoder> decoder;
- private final ThreadLocal<BinaryEncoder> encoder;
- private final ThreadLocal<DatumWriter<T>> writer;
- private final ThreadLocal<DatumReader<T>> reader;
-
- protected AvroCoder(Class<T> type, Schema schema) {
- this.type = type;
- this.schema = schema;
-
- nonDeterministicReasons = new AvroDeterminismChecker().check(TypeDescriptor.of(type), schema);
-
- // Decoder and Encoder start off null for each thread. They are allocated and potentially
- // reused inside encode/decode.
- this.decoder = new ThreadLocal<>();
- this.encoder = new ThreadLocal<>();
-
- // Reader and writer are allocated once per thread and are "final" for thread-local Coder
- // instance.
- this.reader = new ThreadLocal<DatumReader<T>>() {
- @Override
- public DatumReader<T> initialValue() {
- return createDatumReader();
- }
- };
- this.writer = new ThreadLocal<DatumWriter<T>>() {
- @Override
- public DatumWriter<T> initialValue() {
- return createDatumWriter();
- }
- };
- }
-
- /**
- * The encoding identifier is designed to support evolution as per the design of Avro.
- * In order to use this class effectively, carefully read the Avro
- * documentation at
- * <a href="https://avro.apache.org/docs/1.7.7/spec.html#Schema+Resolution">Schema Resolution</a>
- * to ensure that the old and new schema <i>match</i>.
- *
- * <p>In particular, this encoding identifier is guaranteed to be the same for {@code AvroCoder}
- * instances of the same principal class, and otherwise distinct. The schema is not included
- * in the identifier.
- *
- * <p>When modifying a class to be encoded as Avro, here are some guidelines; see the above link
- * for greater detail.
- *
- * <ul>
- * <li>Avoid changing field names.
- * <li>Never remove a <code>required</code> field.
- * <li>Only add <code>optional</code> fields, with sensible defaults.
- * <li>When changing the type of a field, consult the Avro documentation to ensure the new and
- * old types are interchangeable.
- * </ul>
- *
- * <p>Code consuming this message class should be prepared to support <i>all</i> versions of
- * the class until it is certain that no remaining serialized instances exist.
- *
- * <p>If backwards incompatible changes must be made, the best recourse is to change the name
- * of your class.
- */
- @Override
- public String getEncodingId() {
- return type.getName();
- }
-
- /**
- * Returns the type this coder encodes/decodes.
- */
- public Class<T> getType() {
- return type;
- }
-
- private Object writeReplace() {
- // When serialized by Java, instances of AvroCoder should be replaced by
- // a SerializedAvroCoderProxy.
- return new SerializedAvroCoderProxy<>(type, schema.toString());
- }
-
- @Override
- public void encode(T value, OutputStream outStream, Context context) throws IOException {
- // Get a BinaryEncoder instance from the ThreadLocal cache and attempt to reuse it.
- BinaryEncoder encoderInstance = ENCODER_FACTORY.directBinaryEncoder(outStream, encoder.get());
- // Save the potentially-new instance for reuse later.
- encoder.set(encoderInstance);
- writer.get().write(value, encoderInstance);
- // Direct binary encoder does not buffer any data and need not be flushed.
- }
-
- @Override
- public T decode(InputStream inStream, Context context) throws IOException {
- // Get a BinaryDecoder instance from the ThreadLocal cache and attempt to reuse it.
- BinaryDecoder decoderInstance = DECODER_FACTORY.directBinaryDecoder(inStream, decoder.get());
- // Save the potentially-new instance for later.
- decoder.set(decoderInstance);
- return reader.get().read(null, decoderInstance);
- }
-
- @Override
- public List<? extends Coder<?>> getCoderArguments() {
- return null;
- }
-
- @Override
- public CloudObject asCloudObject() {
- CloudObject result = super.asCloudObject();
- addString(result, "type", type.getName());
- addString(result, "schema", schema.toString());
- return result;
- }
-
- /**
- * Succeeds silently when the determinism check performed at construction time found no
- * problems.
- *
- * @throws NonDeterministicException when the type may not be deterministically
- * encoded using the given {@link Schema}, the {@code directBinaryEncoder}, and the
- * {@link ReflectDatumWriter} or {@link GenericDatumWriter}; the exception carries every
- * reason that was collected.
- */
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- if (nonDeterministicReasons.isEmpty()) {
- // Nothing was reported for this type/schema pair.
- return;
- }
- throw new NonDeterministicException(this, nonDeterministicReasons);
- }
-
- /**
- * Returns a new {@link DatumReader} that can be used to read from an Avro file directly. Assumes
- * the schema used to read is the same as the schema that was used when writing.
- *
- * @deprecated For {@code AvroCoder} internal use only.
- */
- // TODO: once we can remove this deprecated function, inline in constructor.
- @Deprecated
- public DatumReader<T> createDatumReader() {
- if (type.equals(GenericRecord.class)) {
- return new GenericDatumReader<>(schema);
- } else {
- return new ReflectDatumReader<>(schema);
- }
- }
-
- /**
- * Returns a new {@link DatumWriter} that can be used to write to an Avro file directly.
- *
- * @deprecated For {@code AvroCoder} internal use only.
- */
- // TODO: once we can remove this deprecated function, inline in constructor.
- @Deprecated
- public DatumWriter<T> createDatumWriter() {
- if (type.equals(GenericRecord.class)) {
- return new GenericDatumWriter<>(schema);
- } else {
- return new ReflectDatumWriter<>(schema);
- }
- }
-
- /**
- * Returns the schema used by this coder.
- */
- public Schema getSchema() {
- return schema;
- }
-
- /**
- * Proxy to use in place of serializing the {@link AvroCoder}. This allows the fields
- * to remain final.
- */
- private static class SerializedAvroCoderProxy<T> implements Serializable {
- private final Class<T> type;
- private final String schemaStr;
-
- public SerializedAvroCoderProxy(Class<T> type, String schemaStr) {
- this.type = type;
- this.schemaStr = schemaStr;
- }
-
- private Object readResolve() {
- // When deserialized, instances of this object should be replaced by
- // constructing an AvroCoder.
- Schema.Parser parser = new Schema.Parser();
- return new AvroCoder<T>(type, parser.parse(schemaStr));
- }
- }
-
- /**
- * Helper class encapsulating the various pieces of state maintained by the
- * recursive walk used for checking if the encoding will be deterministic.
- */
- private static class AvroDeterminismChecker {
-
- // Reasons that the original type are not deterministic. This accumulates
- // the actual output.
- private List<String> reasons = new ArrayList<>();
-
- // Types that are currently "open". Used to make sure we don't have any
- // recursive types. Note that we assume that all occurrences of a given type
- // are equal, rather than tracking pairs of type + schema.
- private Set<TypeDescriptor<?>> activeTypes = new HashSet<>();
-
- // Similarly to how we record active types, we record the schemas we visit
- // to make sure we don't encounter recursive fields.
- private Set<Schema> activeSchemas = new HashSet<>();
-
- /**
- * Report an error in the current context.
- */
- private void reportError(String context, String fmt, Object... args) {
- String message = String.format(fmt, args);
- reasons.add(context + ": " + message);
- }
-
- /**
- * Classes that are serialized by Avro as a String include
- * <ul>
- * <li>Subtypes of CharSequence (including String, Avro's mutable Utf8, etc.)
- * <li>Several predefined classes (BigDecimal, BigInteger, URI, URL)
- * <li>Classes annotated with @Stringable (uses their #toString() and a String constructor)
- * </ul>
- *
- * <p>Rather than determine which of these cases are deterministic, we list some classes
- * that definitely are, and treat any others as non-deterministic.
- */
- private static final Set<Class<?>> DETERMINISTIC_STRINGABLE_CLASSES = new HashSet<>();
- static {
- // CharSequences:
- DETERMINISTIC_STRINGABLE_CLASSES.add(String.class);
- DETERMINISTIC_STRINGABLE_CLASSES.add(Utf8.class);
-
- // Explicitly Stringable:
- DETERMINISTIC_STRINGABLE_CLASSES.add(java.math.BigDecimal.class);
- DETERMINISTIC_STRINGABLE_CLASSES.add(java.math.BigInteger.class);
- DETERMINISTIC_STRINGABLE_CLASSES.add(java.net.URI.class);
- DETERMINISTIC_STRINGABLE_CLASSES.add(java.net.URL.class);
-
- // Classes annotated with @Stringable:
- }
-
- /**
- * Tells whether the given type token is a subtype of at least one of the listed parents.
- * Parents are tried in order and the search stops at the first match.
- */
- private static boolean isSubtypeOf(TypeDescriptor<?> type, Class<?>... parents) {
- boolean matched = false;
- for (int i = 0; i < parents.length && !matched; i++) {
- matched = type.isSubtypeOf(TypeDescriptor.of(parents[i]));
- }
- return matched;
- }
-
- protected AvroDeterminismChecker() {}
-
- // The entry point for the check. Should not be recursively called.
- public List<String> check(TypeDescriptor<?> type, Schema schema) {
- recurse(type.getRawType().getName(), type, schema);
- return reasons;
- }
-
- // This is the method that should be recursively called. It sets up the path
- // and visited types correctly.
- //
- // A type whose raw class carries @AvroSchema is reported and not inspected further. A type
- // that is already in activeTypes is reported as recursive. Otherwise the type is marked
- // active for the duration of the check and unmarked afterwards.
- private void recurse(String context, TypeDescriptor<?> type, Schema schema) {
- if (type.getRawType().isAnnotationPresent(AvroSchema.class)) {
- reportError(context, "Custom schemas are not supported -- remove @AvroSchema.");
- return;
- }
-
- // Set#add returns false when the type is already being checked further up the call stack.
- if (!activeTypes.add(type)) {
- reportError(context, "%s appears recursively", type);
- return;
- }
-
- // If the record isn't a true class, but rather a GenericRecord, SpecificRecord, etc.
- // with a specified schema, then we need to make the decision based on the generated
- // implementations.
- if (isSubtypeOf(type, IndexedRecord.class)) {
- checkIndexedRecord(context, schema, null);
- } else {
- doCheck(context, type, schema);
- }
-
- activeTypes.remove(type);
- }
-
- private void doCheck(String context, TypeDescriptor<?> type, Schema schema) {
- switch (schema.getType()) {
- case ARRAY:
- checkArray(context, type, schema);
- break;
- case ENUM:
- // Enums should be deterministic, since they depend only on the ordinal.
- break;
- case FIXED:
- // Depending on the implementation of GenericFixed, we don't know how
- // the given field will be encoded. So, we assume that it isn't
- // deterministic.
- reportError(context, "FIXED encodings are not guaranteed to be deterministic");
- break;
- case MAP:
- checkMap(context, type, schema);
- break;
- case RECORD:
- checkRecord(type, schema);
- break;
- case UNION:
- checkUnion(context, type, schema);
- break;
- case STRING:
- checkString(context, type);
- break;
- case BOOLEAN:
- case BYTES:
- case DOUBLE:
- case INT:
- case FLOAT:
- case LONG:
- case NULL:
- // For types that Avro encodes using one of the above primitives, we assume they are
- // deterministic.
- break;
- default:
- // In any other case (eg., new types added to Avro) we cautiously return
- // false.
- reportError(context, "Unknown schema type %s may be non-deterministic", schema.getType());
- break;
- }
- }
-
- /**
- * Reports an error unless the raw class of {@code type} is one of the string-encoded classes
- * listed in {@code DETERMINISTIC_STRINGABLE_CLASSES}.
- */
- private void checkString(String context, TypeDescriptor<?> type) {
- // Only classes on the approved list are accepted for string-encoded types. Anything else
- // annotated @Stringable is written by Avro via #toString(), which has no guarantees of
- // determinism.
- boolean knownDeterministic = DETERMINISTIC_STRINGABLE_CLASSES.contains(type.getRawType());
- if (knownDeterministic) {
- return;
- }
- reportError(context, "%s may not have deterministic #toString()", type);
- }
-
- private static final Schema AVRO_NULL_SCHEMA = Schema.create(Schema.Type.NULL);
-
- private void checkUnion(String context, TypeDescriptor<?> type, Schema schema) {
- final List<Schema> unionTypes = schema.getTypes();
-
- if (!type.getRawType().isAnnotationPresent(Union.class)) {
- // First check for @Nullable field, which shows up as a union of field type and null.
- if (unionTypes.size() == 2 && unionTypes.contains(AVRO_NULL_SCHEMA)) {
- // Find the Schema that is not NULL and recursively check that it is deterministic.
- Schema nullableFieldSchema = unionTypes.get(0).equals(AVRO_NULL_SCHEMA)
- ? unionTypes.get(1) : unionTypes.get(0);
- doCheck(context, type, nullableFieldSchema);
- return;
- }
-
- // Otherwise report a schema error.
- reportError(context, "Expected type %s to have @Union annotation", type);
- return;
- }
-
- // Errors associated with this union will use the base class as their context.
- String baseClassContext = type.getRawType().getName();
-
- // For a union, we need to make sure that each possible instantiation is deterministic.
- for (Schema concrete : unionTypes) {
- @SuppressWarnings("unchecked")
- TypeDescriptor<?> unionType = TypeDescriptor.of(ReflectData.get().getClass(concrete));
-
- recurse(baseClassContext, unionType, concrete);
- }
- }
-
- private void checkRecord(TypeDescriptor<?> type, Schema schema) {
- // For a record, we want to make sure that all the fields are deterministic.
- Class<?> clazz = type.getRawType();
- for (org.apache.avro.Schema.Field fieldSchema : schema.getFields()) {
- Field field = getField(clazz, fieldSchema.name());
- String fieldContext = field.getDeclaringClass().getName() + "#" + field.getName();
-
- if (field.isAnnotationPresent(AvroEncode.class)) {
- reportError(fieldContext,
- "Custom encoders may be non-deterministic -- remove @AvroEncode");
- continue;
- }
-
- if (!IndexedRecord.class.isAssignableFrom(field.getType())
- && field.isAnnotationPresent(AvroSchema.class)) {
- // TODO: We should be able to support custom schemas on POJO fields, but we shouldn't
- // need to, so we just allow it in the case of IndexedRecords.
- reportError(fieldContext,
- "Custom schemas are only supported for subtypes of IndexedRecord.");
- continue;
- }
-
- TypeDescriptor<?> fieldType = type.resolveType(field.getGenericType());
- recurse(fieldContext, fieldType, fieldSchema.schema());
- }
- }
-
- /**
- * Checks whether a value described only by its Avro {@code schema} is deterministic, walking
- * nested schemas recursively. {@code recurse} calls this for subtypes of {@link IndexedRecord}.
- *
- * <p>A schema that is reached again while it is still being checked is reported as recursive
- * and is not descended into.
- *
- * @param context prefix for any error reported at this level
- * @param schema the schema to check
- * @param specificClassStr the value of the field's {@link SpecificData#CLASS_PROP} property when
- * called for a record field, otherwise {@code null}; only consulted for STRING schemas
- */
- private void checkIndexedRecord(String context, Schema schema,
- @Nullable String specificClassStr) {
-
- if (!activeSchemas.add(schema)) {
- reportError(context, "%s appears recursively", schema.getName());
- return;
- }
-
- switch (schema.getType()) {
- case ARRAY:
- // Generic Records use GenericData.Array to implement arrays, which is
- // essentially an ArrayList, and therefore ordering is deterministic.
- // The array is thus deterministic if the elements are deterministic.
- checkIndexedRecord(context, schema.getElementType(), null);
- break;
- case ENUM:
- // Enums are deterministic because they encode as a single integer.
- break;
- case FIXED:
- // In the case of GenericRecords, FIXED is deterministic because it
- // encodes/decodes as a Byte[].
- break;
- case MAP:
- reportError(context,
- "GenericRecord and SpecificRecords use a HashMap to represent MAPs,"
- + " so it is non-deterministic");
- break;
- case RECORD:
- for (org.apache.avro.Schema.Field field : schema.getFields()) {
- checkIndexedRecord(
- schema.getName() + "." + field.name(),
- field.schema(),
- field.getProp(SpecificData.CLASS_PROP));
- }
- break;
- case STRING:
- // GenericDatumWriter#findStringClass will use a CharSequence or a String
- // for each string, so it is deterministic.
-
- // SpecificCompiler#getStringType will use java.lang.String, org.apache.avro.util.Utf8,
- // or java.lang.CharSequence, unless SpecificData.CLASS_PROP overrides that.
- if (specificClassStr != null) {
- Class<?> specificClass;
- try {
- specificClass = ClassUtils.forName(specificClassStr);
- if (!DETERMINISTIC_STRINGABLE_CLASSES.contains(specificClass)) {
- reportError(context, "Specific class %s is not known to be deterministic",
- specificClassStr);
- }
- } catch (ClassNotFoundException e) {
- // A class that cannot be loaded is treated the same as one that is not on the list.
- reportError(context, "Specific class %s is not known to be deterministic",
- specificClassStr);
- }
- }
- break;
- case UNION:
- for (org.apache.avro.Schema subschema : schema.getTypes()) {
- checkIndexedRecord(subschema.getName(), subschema, null);
- }
- break;
- case BOOLEAN:
- case BYTES:
- case DOUBLE:
- case INT:
- case FLOAT:
- case LONG:
- case NULL:
- // For types that Avro encodes using one of the above primitives, we assume they are
- // deterministic.
- break;
- default:
- reportError(context, "Unknown schema type %s may be non-deterministic", schema.getType());
- break;
- }
-
- // The schema is only "active" while its own subtree is being walked.
- activeSchemas.remove(schema);
- }
-
- private void checkMap(String context, TypeDescriptor<?> type, Schema schema) {
- if (!isSubtypeOf(type, SortedMap.class)) {
- reportError(context, "%s may not be deterministically ordered", type);
- }
-
- // Avro (currently) asserts that all keys are strings.
- // In case that changes, we double check that the key was a string:
- Class<?> keyType = type.resolveType(Map.class.getTypeParameters()[0]).getRawType();
- if (!String.class.equals(keyType)) {
- reportError(context, "map keys should be Strings, but was %s", keyType);
- }
-
- recurse(context,
- type.resolveType(Map.class.getTypeParameters()[1]),
- schema.getValueType());
- }
-
- private void checkArray(String context, TypeDescriptor<?> type, Schema schema) {
- TypeDescriptor<?> elementType = null;
- if (type.isArray()) {
- // The type is an array (with ordering)-> deterministic iff the element is deterministic.
- elementType = type.getComponentType();
- } else if (isSubtypeOf(type, Collection.class)) {
- if (isSubtypeOf(type, List.class, SortedSet.class)) {
- // Ordered collection -> deterministic iff the element is deterministic
- elementType = type.resolveType(Collection.class.getTypeParameters()[0]);
- } else {
- // Not an ordered collection -> not deterministic
- reportError(context, "%s may not be deterministically ordered", type);
- return;
- }
- } else {
- // If it was an unknown type encoded as an array, be conservative and assume
- // that we don't know anything about the order.
- reportError(context, "encoding %s as an ARRAY was unexpected");
- return;
- }
-
- // If we get here, it's either a deterministically-ordered Collection, or
- // an array. Either way, the type is deterministic iff the element type is
- // deterministic.
- recurse(context, elementType, schema.getElementType());
- }
-
- /**
- * Finds the Java field of {@code clazz} that backs the Avro field called {@code name}.
- *
- * <p>Declared fields are inspected so that private fields are visible, and the superclass
- * chain is walked so that inherited fields are found. A field annotated with {@link AvroName}
- * is matched by the annotation value only; any other field is matched by its Java name.
- *
- * @param clazz the class at which the search starts
- * @param name the Avro field name to look for
- * @return the first matching field, searching {@code clazz} before its superclasses
- * @throws IllegalArgumentException if neither the class nor any superclass has such a field
- */
- private static Field getField(Class<?> clazz, String name) {
- // Walk with a separate variable so the starting class is still available for the message.
- for (Class<?> current = clazz; current != null; current = current.getSuperclass()) {
- for (Field field : current.getDeclaredFields()) {
- AvroName avroName = field.getAnnotation(AvroName.class);
- String avroFieldName = (avroName != null) ? avroName.value() : field.getName();
- if (name.equals(avroFieldName)) {
- return field;
- }
- }
- }
-
- // Previously the loop variable itself was printed here, and it is always null once the
- // hierarchy has been exhausted, so the message never named the class.
- String className = (clazz == null) ? "null" : clazz.getName();
- throw new IllegalArgumentException(
- "Unable to get field " + name + " from class " + className);
- }
- }
-}
[09/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReduceFnRunner.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReduceFnRunner.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReduceFnRunner.java
deleted file mode 100644
index 2e2d1f6..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReduceFnRunner.java
+++ /dev/null
@@ -1,843 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey.GroupByKeyOnly;
-import com.google.cloud.dataflow.sdk.transforms.windowing.AfterWatermark;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.OutputTimeFn;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo.Timing;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window.ClosingBehavior;
-import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
-import com.google.cloud.dataflow.sdk.util.ReduceFnContextFactory.OnTriggerCallbacks;
-import com.google.cloud.dataflow.sdk.util.ReduceFnContextFactory.StateStyle;
-import com.google.cloud.dataflow.sdk.util.TimerInternals.TimerData;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy.AccumulationMode;
-import com.google.cloud.dataflow.sdk.util.state.ReadableState;
-import com.google.cloud.dataflow.sdk.util.state.StateInternals;
-import com.google.cloud.dataflow.sdk.util.state.StateNamespaces.WindowNamespace;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Preconditions;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import javax.annotation.Nullable;
-
-/**
- * Manages the execution of a {@link ReduceFn} after a {@link GroupByKeyOnly} has partitioned the
- * {@link PCollection} by key.
- *
- * <p>The {@link #onTrigger} relies on a {@link TriggerRunner} to manage the execution of
- * the triggering logic. The {@code ReduceFnRunner}'s responsibilities are:
- *
- * <ul>
- * <li>Tracking the windows that are active (have buffered data) as elements arrive and
- * triggers are fired.
- * <li>Holding the watermark based on the timestamps of elements in a pane and releasing it
- * when the trigger fires.
- * <li>Calling the appropriate callbacks on {@link ReduceFn} based on trigger execution, timer
- * firings, etc, and providing appropriate contexts to the {@link ReduceFn} for actions
- * such as output.
- * <li>Scheduling garbage collection of state associated with a specific window, and making that
- * happen when the appropriate timer fires.
- * </ul>
- *
- * @param <K> The type of key being processed.
- * @param <InputT> The type of values associated with the key.
- * @param <OutputT> The output type that will be produced for each key.
- * @param <W> The type of windows this operates on.
- */
-public class ReduceFnRunner<K, InputT, OutputT, W extends BoundedWindow> {
-
- /**
- * The {@link ReduceFnRunner} depends on most aspects of the {@link WindowingStrategy}.
- *
- * <ul>
- * <li>It runs the trigger from the {@link WindowingStrategy}.</li>
- * <li>It merges windows according to the {@link WindowingStrategy}.</li>
- * <li>It chooses how to track active windows and clear out expired windows
- * according to the {@link WindowingStrategy}, based on the allowed lateness and
- * whether windows can merge.</li>
- * <li>It decides whether to emit empty final panes according to whether the
- * {@link WindowingStrategy} requires it.</li>
- * <li>It uses discarding or accumulation mode according to the {@link WindowingStrategy}.</li>
- * </ul>
- */
- private final WindowingStrategy<Object, W> windowingStrategy;
-
- private final OutputWindowedValue<KV<K, OutputT>> outputter;
-
- private final StateInternals<K> stateInternals;
-
- private final Aggregator<Long, Long> droppedDueToClosedWindow;
-
- private final K key;
-
- private final OnMergeCallback onMergeCallback = new OnMergeCallback();
-
- /**
- * Track which windows are still active and which 'state address' windows contain state
- * for a merged window.
- *
- * <ul>
- * <li>State: Global map for all active windows for this computation and key.
- * <li>Lifetime: Cleared when no active windows need to be tracked. A window lives within
- * the active window set until its trigger is closed or the window is garbage collected.
- * </ul>
- */
- private final ActiveWindowSet<W> activeWindows;
-
- /**
- * Always a {@link SystemReduceFn}.
- *
- * <ul>
- * <li>State: A bag of accumulated values, or the intermediate result of a combiner.
- * <li>State style: RENAMED
- * <li>Merging: Concatenate or otherwise combine the state from each merged window.
- * <li>Lifetime: Cleared when a pane fires if DISCARDING_FIRED_PANES. Otherwise cleared
- * when trigger is finished or when the window is garbage collected.
- * </ul>
- */
- private final ReduceFn<K, InputT, OutputT, W> reduceFn;
-
- /**
- * Manage the setting and firing of timer events.
- *
- * <ul>
- * <li>Merging: End-of-window and garbage collection timers are cancelled when windows are
- * merged away. Timers created by triggers are never garbage collected and are left to
- * fire and be ignored.
- * <li>Lifetime: Timers automatically disappear after they fire.
- * </ul>
- */
- private final TimerInternals timerInternals;
-
- /**
- * Manage the execution and state for triggers.
- *
- * <ul>
- * <li>State: Tracks which sub-triggers have finished, and any additional state needed to
- * determine when the trigger should fire.
- * <li>State style: DIRECT
- * <li>Merging: Finished bits are explicitly managed. Other state is eagerly merged as
- * needed.
- * <li>Lifetime: Most trigger state is cleared when the final pane is emitted. However
- * the finished bits are left behind and must be cleared when the window is
- * garbage collected.
- * </ul>
- */
- private final TriggerRunner<W> triggerRunner;
-
- /**
- * Store the output watermark holds for each window.
- *
- * <ul>
- * <li>State: Bag of hold timestamps.
- * <li>State style: RENAMED
- * <li>Merging: Depending on {@link OutputTimeFn}, may need to be recalculated on merging.
- * When a pane fires it may be necessary to add (back) an end-of-window or garbage collection
- * hold.
- * <li>Lifetime: Cleared when a pane fires or when the window is garbage collected.
- * </ul>
- */
- private final WatermarkHold<W> watermarkHold;
-
- private final ReduceFnContextFactory<K, InputT, OutputT, W> contextFactory;
-
- /**
- * Store the previously emitted pane (if any) for each window.
- *
- * <ul>
- * <li>State: The previous {@link PaneInfo} passed to the user's {@link DoFn#processElement},
- * if any.
- * <li>State style: DIRECT
- * <li>Merging: Always keyed by actual window, so does not depend on {@link #activeWindows}.
- * Cleared when window is merged away.
- * <li>Lifetime: Cleared when trigger is closed or window is garbage collected.
- * </ul>
- */
- private final PaneInfoTracker paneInfoTracker;
-
- /**
- * Store whether we've seen any elements for a window since the last pane was emitted.
- *
- * <ul>
- * <li>State: Unless DISCARDING_FIRED_PANES, a count of number of elements added so far.
- * <li>State style: RENAMED.
- * <li>Merging: Counts are summed when windows are merged.
- * <li>Lifetime: Cleared when pane fires or window is garbage collected.
- * </ul>
- */
- private final NonEmptyPanes<K, W> nonEmptyPanes;
-
- /**
- * Creates a runner for a single {@code key}.
- *
- * @param key the key recorded by this runner and passed to the context factory
- * @param windowingStrategy supplies the window function, trigger, allowed lateness, closing
- * behavior and accumulation mode consulted by this runner
- * @param stateInternals state storage handed to the active window set, the context factory and
- * the trigger context factory
- * @param timerInternals handed to the pane info tracker, the watermark hold and the context
- * factory; also used by this class to read the current watermarks
- * @param windowingInternals wrapped in an {@link OutputViaWindowingInternals} for emitting
- * panes, and passed to the context factory
- * @param droppedDueToClosedWindow counter incremented for each element dropped because the
- * trigger of its window is already closed
- * @param reduceFn the function invoked to process values, merge state and produce pane contents
- * @param options pipeline options passed through to the context factory
- */
- public ReduceFnRunner(
- K key,
- WindowingStrategy<?, W> windowingStrategy,
- StateInternals<K> stateInternals,
- TimerInternals timerInternals,
- WindowingInternals<?, KV<K, OutputT>> windowingInternals,
- Aggregator<Long, Long> droppedDueToClosedWindow,
- ReduceFn<K, InputT, OutputT, W> reduceFn,
- PipelineOptions options) {
- this.key = key;
- this.timerInternals = timerInternals;
- this.paneInfoTracker = new PaneInfoTracker(timerInternals);
- this.stateInternals = stateInternals;
- this.outputter = new OutputViaWindowingInternals<>(windowingInternals);
- this.droppedDueToClosedWindow = droppedDueToClosedWindow;
- this.reduceFn = reduceFn;
-
- // The element type parameter of the strategy is not used by this class, so it is widened to
- // Object once here and the widened reference is stored in the field.
- @SuppressWarnings("unchecked")
- WindowingStrategy<Object, W> objectWindowingStrategy =
- (WindowingStrategy<Object, W>) windowingStrategy;
- this.windowingStrategy = objectWindowingStrategy;
-
- this.nonEmptyPanes = NonEmptyPanes.create(this.windowingStrategy, this.reduceFn);
-
- // Note this may incur I/O to load persisted window set data.
- // Must run after this.windowingStrategy and this.stateInternals are assigned, since
- // createActiveWindowSet() reads both fields.
- this.activeWindows = createActiveWindowSet();
-
- this.contextFactory =
- new ReduceFnContextFactory<K, InputT, OutputT, W>(key, reduceFn, this.windowingStrategy,
- stateInternals, this.activeWindows, timerInternals, windowingInternals, options);
-
- this.watermarkHold = new WatermarkHold<>(timerInternals, windowingStrategy);
- this.triggerRunner =
- new TriggerRunner<>(
- windowingStrategy.getTrigger(),
- new TriggerContextFactory<>(windowingStrategy, stateInternals, activeWindows));
- }
-
- /**
- * Builds the active window set that matches the window function: a
- * {@link NonMergingActiveWindowSet} when the window function never merges, otherwise a
- * {@link MergingActiveWindowSet} backed by {@link #stateInternals}.
- */
- private ActiveWindowSet<W> createActiveWindowSet() {
- if (windowingStrategy.getWindowFn().isNonMerging()) {
- return new NonMergingActiveWindowSet<W>();
- }
- return new MergingActiveWindowSet<W>(windowingStrategy.getWindowFn(), stateInternals);
- }
-
- /**
- * Returns whether the trigger runner reports the trigger for {@code window} as closed, using
- * the window's DIRECT state.
- */
- @VisibleForTesting
- boolean isFinished(W window) {
- ReduceFn<K, InputT, OutputT, W>.Context directContext =
- contextFactory.base(window, StateStyle.DIRECT);
- return triggerRunner.isClosed(directContext.state());
- }
-
- /**
- * Incorporate {@code values} into the underlying reduce function, and manage holds, timers,
- * triggers, and window merging.
- *
- * <p>The general strategy is:
- * <ol>
- * <li>Use {@link WindowedValue#getWindows} (itself determined using
- * {@link WindowFn#assignWindows}) to determine which windows each element belongs to. Some
- * of those windows will already have state associated with them. The rest are considered
- * NEW.
- * <li>Use {@link WindowFn#mergeWindows} to attempt to merge currently ACTIVE and NEW windows.
- * Each NEW window will become either ACTIVE, MERGED, or EPHEMERAL. (See {@link
- * ActiveWindowSet} for definitions of these terms.)
- * <li>If at all possible, eagerly substitute EPHEMERAL windows with their ACTIVE state address
- * windows before any state is associated with the EPHEMERAL window. In the common case that
- * windows for new elements are merged into existing ACTIVE windows then no additional
- * storage or merging overhead will be incurred.
- * <li>Otherwise, keep track of the state address windows for ACTIVE windows so that their
- * states can be merged on-demand when a pane fires.
- * <li>Process the element for each of the windows its windows have been merged into, according
- * to {@link ActiveWindowSet}. Processing may require running triggers, setting timers,
- * setting holds, and invoking {@link ReduceFn#onTrigger}.
- * </ol>
- *
- * @param values the elements to process; the iterable is traversed twice, once to collect and
- * merge windows and once to process each element
- */
- public void processElements(Iterable<WindowedValue<InputT>> values) throws Exception {
- // If an incoming element introduces a new window, attempt to merge it into an existing
- // window eagerly. The outcome is stored in the ActiveWindowSet.
- collectAndMergeWindows(values);
-
- // A set, so that a window touched by several elements is considered for firing only once.
- Set<W> windowsToConsider = new HashSet<>();
-
- // Process each element, using the updated activeWindows determined by collectAndMergeWindows.
- for (WindowedValue<InputT> value : values) {
- windowsToConsider.addAll(processElement(value));
- }
-
- // Trigger output from any window for which the trigger is ready
- for (W mergedWindow : windowsToConsider) {
- ReduceFn<K, InputT, OutputT, W>.Context directContext =
- contextFactory.base(mergedWindow, StateStyle.DIRECT);
- ReduceFn<K, InputT, OutputT, W>.Context renamedContext =
- contextFactory.base(mergedWindow, StateStyle.RENAMED);
- triggerRunner.prefetchShouldFire(mergedWindow, directContext.state());
- emitIfAppropriate(directContext, renamedContext);
- }
-
- // We're all done with merging and emitting elements so can compress the activeWindow state.
- activeWindows.removeEphemeralWindows();
- }
-
- /**
- * Persists the active window set by delegating to {@link ActiveWindowSet#persist}.
- */
- public void persist() {
- activeWindows.persist();
- }
-
- /**
- * Extract the windows associated with the values, and invoke merge.
- *
- * <p>Does nothing when the window function is non-merging.
- *
- * @param values the elements whose windows should be registered with the active window set
- */
- private void collectAndMergeWindows(Iterable<WindowedValue<InputT>> values) throws Exception {
- // No-op if no merging can take place
- if (windowingStrategy.getWindowFn().isNonMerging()) {
- return;
- }
-
- // Collect the windows from all elements (except those whose trigger is already closed) and
- // make sure they are already in the active window set or are added as NEW windows.
- for (WindowedValue<?> value : values) {
- for (BoundedWindow untypedWindow : value.getWindows()) {
- @SuppressWarnings("unchecked")
- W window = (W) untypedWindow;
-
- ReduceFn<K, InputT, OutputT, W>.Context directContext =
- contextFactory.base(window, StateStyle.DIRECT);
- if (triggerRunner.isClosed(directContext.state())) {
- // This window has already been closed.
- // We will update the counter for this in the corresponding processElement call.
- continue;
- }
-
- if (activeWindows.isActive(window)) {
- Set<W> stateAddressWindows = activeWindows.readStateAddresses(window);
- if (stateAddressWindows.size() > 1) {
- // This is a legacy window whose state has not been eagerly merged.
- // Do that now.
- ReduceFn<K, InputT, OutputT, W>.OnMergeContext premergeContext =
- contextFactory.forPremerge(window);
- reduceFn.onMerge(premergeContext);
- watermarkHold.onMerge(premergeContext);
- activeWindows.merged(window);
- }
- }
-
- // Add this window as NEW if we've not yet seen it.
- activeWindows.addNew(window);
- }
- }
-
- // Merge all of the active windows and retain a mapping from source windows to result windows.
- mergeActiveWindows();
- }
-
- /**
- * Callback handed to {@link ActiveWindowSet#merge}. It prefetches and then merges the reduce
- * function state, watermark holds, non-empty pane state and trigger state of the windows being
- * merged, and cleans up the timers and pane info of windows that are merged away.
- */
- private class OnMergeCallback implements ActiveWindowSet.MergeCallback<W> {
- /**
- * Called from the active window set to indicate {@code toBeMerged} (of which only
- * {@code activeToBeMerged} are ACTIVE and thus have state associated with them) will later
- * be merged into {@code mergeResult}.
- */
- @Override
- public void prefetchOnMerge(
- Collection<W> toBeMerged, Collection<W> activeToBeMerged, W mergeResult) throws Exception {
- ReduceFn<K, InputT, OutputT, W>.OnMergeContext directMergeContext =
- contextFactory.forMerge(activeToBeMerged, mergeResult, StateStyle.DIRECT);
- ReduceFn<K, InputT, OutputT, W>.OnMergeContext renamedMergeContext =
- contextFactory.forMerge(activeToBeMerged, mergeResult, StateStyle.RENAMED);
-
- // Prefetch various state.
- triggerRunner.prefetchForMerge(mergeResult, activeToBeMerged, directMergeContext.state());
- reduceFn.prefetchOnMerge(renamedMergeContext.state());
- watermarkHold.prefetchOnMerge(renamedMergeContext.state());
- nonEmptyPanes.prefetchOnMerge(renamedMergeContext.state());
- }
-
- /**
- * Called from the active window set to indicate {@code toBeMerged} (of which only
- * {@code activeToBeMerged} are ACTIVE and thus have state associated with them) are about
- * to be merged into {@code mergeResult}.
- */
- @Override
- public void onMerge(Collection<W> toBeMerged, Collection<W> activeToBeMerged, W mergeResult)
- throws Exception {
- // At this point activeWindows has NOT incorporated the results of the merge.
- ReduceFn<K, InputT, OutputT, W>.OnMergeContext directMergeContext =
- contextFactory.forMerge(activeToBeMerged, mergeResult, StateStyle.DIRECT);
- ReduceFn<K, InputT, OutputT, W>.OnMergeContext renamedMergeContext =
- contextFactory.forMerge(activeToBeMerged, mergeResult, StateStyle.RENAMED);
-
- // Run the reduceFn to perform any needed merging.
- reduceFn.onMerge(renamedMergeContext);
-
- // Merge the watermark holds.
- watermarkHold.onMerge(renamedMergeContext);
-
- // Merge non-empty pane state.
- nonEmptyPanes.onMerge(renamedMergeContext.state());
-
- // Have the trigger merge state as needed
- triggerRunner.onMerge(
- directMergeContext.window(), directMergeContext.timers(), directMergeContext.state());
-
- for (W active : activeToBeMerged) {
- if (active.equals(mergeResult)) {
- // Not merged away.
- continue;
- }
- // Cleanup flavor A: Currently ACTIVE window is about to become MERGED.
- // Clear any state not already cleared by the onMerge calls above.
- WindowTracing.debug("ReduceFnRunner.onMerge: Merging {} into {}", active, mergeResult);
- ReduceFn<K, InputT, OutputT, W>.Context directClearContext =
- contextFactory.base(active, StateStyle.DIRECT);
- // No need for the end-of-window or garbage collection timers.
- // We will establish a new end-of-window or garbage collection timer for the mergeResult
- // window in processElement below. There must be at least one element for the mergeResult
- // window since a new element with a new window must have triggered this onMerge.
- cancelEndOfWindowAndGarbageCollectionTimers(directClearContext);
- // We no longer care about any previous panes of merged away windows. The
- // merge result window gets to start fresh if it is new.
- paneInfoTracker.clear(directClearContext.state());
- }
- }
- }
-
- /**
- * Asks the active window set to merge its windows, reporting each merge to
- * {@link #onMergeCallback}.
- */
- private void mergeActiveWindows() throws Exception {
- activeWindows.merge(onMergeCallback);
- }
-
- /**
- * Process an element.
- *
- * <p>Each window of {@code value} is first redirected to its representative in
- * {@link #activeWindows}. For every representative window whose trigger is not closed the
- * element is recorded as pane content, an end-of-window or garbage collection timer is
- * scheduled, watermark holds are added, the value is handed to the {@link ReduceFn} and the
- * trigger state is updated. For a window whose trigger is already closed the element is dropped
- * and {@link #droppedDueToClosedWindow} is incremented.
- *
- * @param value the value being processed
- *
- * @return the windows in which the element was actually processed; windows for which the
- * element was dropped because the trigger is closed are not included, so that the caller
- * does not go on to evaluate triggers for closed windows
- */
- private Collection<W> processElement(WindowedValue<InputT> value) throws Exception {
- // Redirect element windows to the ACTIVE windows they have been merged into.
- // The compressed representation (value, {window1, window2, ...}) actually represents
- // distinct elements (value, window1), (value, window2), ...
- // so if window1 and window2 merge, the resulting window will contain both copies
- // of the value.
- Collection<W> windows = new ArrayList<>();
- for (BoundedWindow untypedWindow : value.getWindows()) {
- @SuppressWarnings("unchecked")
- W window = (W) untypedWindow;
- W active = activeWindows.representative(window);
- Preconditions.checkState(active != null, "Window %s should have been added", window);
- windows.add(active);
- }
-
- // Prefetch in each of the windows if we're going to need to process triggers
- for (W window : windows) {
- ReduceFn<K, InputT, OutputT, W>.ProcessValueContext directContext = contextFactory.forValue(
- window, value.getValue(), value.getTimestamp(), StateStyle.DIRECT);
- triggerRunner.prefetchForValue(window, directContext.state());
- }
-
- // Only the windows that accept the element are reported back to the caller.
- Collection<W> processedWindows = new ArrayList<>();
-
- // Process the element for each (representative) window it belongs to.
- for (W window : windows) {
- ReduceFn<K, InputT, OutputT, W>.ProcessValueContext directContext = contextFactory.forValue(
- window, value.getValue(), value.getTimestamp(), StateStyle.DIRECT);
- ReduceFn<K, InputT, OutputT, W>.ProcessValueContext renamedContext = contextFactory.forValue(
- window, value.getValue(), value.getTimestamp(), StateStyle.RENAMED);
-
- // Check to see if the triggerRunner thinks the window is closed. If so, drop that window.
- if (triggerRunner.isClosed(directContext.state())) {
- droppedDueToClosedWindow.addValue(1L);
- WindowTracing.debug(
- "ReduceFnRunner.processElement: Dropping element at {} for key:{}; window:{} "
- + "since window is no longer active at inputWatermark:{}; outputWatermark:{}",
- value.getTimestamp(), key, window, timerInternals.currentInputWatermarkTime(),
- timerInternals.currentOutputWatermarkTime());
- continue;
- }
-
- // The element is accepted in this window, so the window is a candidate for firing.
- processedWindows.add(window);
-
- nonEmptyPanes.recordContent(renamedContext.state());
-
- // Make sure we've scheduled the end-of-window or garbage collection timer for this window.
- Instant timer = scheduleEndOfWindowOrGarbageCollectionTimer(directContext);
-
- // Hold back progress of the output watermark until we have processed the pane this
- // element will be included within. If the element is too late for that, place a hold at
- // the end-of-window or garbage collection time to allow empty panes to contribute elements
- // which won't be dropped due to lateness by a following computation (assuming the following
- // computation uses the same allowed lateness value...)
- @Nullable Instant hold = watermarkHold.addHolds(renamedContext);
-
- if (hold != null) {
- // Assert that holds have a proximate timer.
- boolean holdInWindow = !hold.isAfter(window.maxTimestamp());
- boolean timerInWindow = !timer.isAfter(window.maxTimestamp());
- Preconditions.checkState(
- holdInWindow == timerInWindow,
- "set a hold at %s, a timer at %s, which disagree as to whether they are in window %s",
- hold,
- timer,
- directContext.window());
- }
-
- // Execute the reduceFn, which will buffer the value as appropriate
- reduceFn.processValue(renamedContext);
-
- // Run the trigger to update its state
- triggerRunner.processValue(
- directContext.window(),
- directContext.timestamp(),
- directContext.timers(),
- directContext.state());
- }
-
- return processedWindows;
- }
-
- /**
- * Called when an end-of-window, garbage collection, or trigger-specific timer fires.
- *
- * <p>A timer is treated as a garbage collection timer when it is in the event time domain and
- * its timestamp equals the window's maximum timestamp plus the allowed lateness. In that case
- * the final pane is emitted (if the window is still active) and all state for the window is
- * cleared. Any other timer may fire the trigger, and an end-of-window timer additionally
- * schedules the garbage collection timer.
- *
- * @param timer the timer that fired; its namespace must be a {@link WindowNamespace}
- * @throws IllegalArgumentException if the timer's namespace is not a {@link WindowNamespace}
- */
- public void onTimer(TimerData timer) throws Exception {
- // Which window is the timer for?
- Preconditions.checkArgument(timer.getNamespace() instanceof WindowNamespace,
- "Expected timer to be in WindowNamespace, but was in %s", timer.getNamespace());
- @SuppressWarnings("unchecked")
- WindowNamespace<W> windowNamespace = (WindowNamespace<W>) timer.getNamespace();
- W window = windowNamespace.getWindow();
- ReduceFn<K, InputT, OutputT, W>.Context directContext =
- contextFactory.base(window, StateStyle.DIRECT);
- ReduceFn<K, InputT, OutputT, W>.Context renamedContext =
- contextFactory.base(window, StateStyle.RENAMED);
-
- // Has this window had its trigger finish?
- // - The trigger may implement isClosed as constant false.
- // - If the window function does not support windowing then all windows will be considered
- // active.
- // NOTE(review): "windowing" above presumably means "merging" -- confirm.
- // So we must take conjunction of activeWindows and triggerRunner state.
- boolean windowIsActive =
- activeWindows.isActive(window) && !triggerRunner.isClosed(directContext.state());
-
- if (!windowIsActive) {
- WindowTracing.debug(
- "ReduceFnRunner.onTimer: Note that timer {} is for non-ACTIVE window {}", timer, window);
- }
-
- // If this is a garbage collection timer then we should trigger and garbage collect the window.
- Instant cleanupTime = window.maxTimestamp().plus(windowingStrategy.getAllowedLateness());
- boolean isGarbageCollection =
- TimeDomain.EVENT_TIME == timer.getDomain() && timer.getTimestamp().equals(cleanupTime);
-
- if (isGarbageCollection) {
- WindowTracing.debug(
- "ReduceFnRunner.onTimer: Cleaning up for key:{}; window:{} at {} with "
- + "inputWatermark:{}; outputWatermark:{}",
- key, window, timer.getTimestamp(), timerInternals.currentInputWatermarkTime(),
- timerInternals.currentOutputWatermarkTime());
-
- if (windowIsActive) {
- // We need to call onTrigger to emit the final pane if required.
- // The final pane *may* be ON_TIME if no prior ON_TIME pane has been emitted,
- // and the watermark has passed the end of the window.
- onTrigger(directContext, renamedContext, true/* isFinished */);
- }
-
- // Cleanup flavor B: Clear all the remaining state for this window since we'll never
- // see elements for it again.
- clearAllState(directContext, renamedContext, windowIsActive);
- } else {
- WindowTracing.debug(
- "ReduceFnRunner.onTimer: Triggering for key:{}; window:{} at {} with "
- + "inputWatermark:{}; outputWatermark:{}",
- key, window, timer.getTimestamp(), timerInternals.currentInputWatermarkTime(),
- timerInternals.currentOutputWatermarkTime());
- if (windowIsActive) {
- emitIfAppropriate(directContext, renamedContext);
- }
-
- // If this is an end-of-window timer then, we need to set a GC timer
- boolean isEndOfWindow = TimeDomain.EVENT_TIME == timer.getDomain()
- && timer.getTimestamp().equals(window.maxTimestamp());
- if (isEndOfWindow) {
- // Since we are processing an on-time firing we should schedule the garbage collection
- // timer. (If getAllowedLateness is zero then the timer event will be considered a
- // cleanup event and handled by the above).
- // Note we must do this even if the trigger is finished so that we are sure to cleanup
- // any final trigger tombstones.
- Preconditions.checkState(
- windowingStrategy.getAllowedLateness().isLongerThan(Duration.ZERO),
- "Unexpected zero getAllowedLateness");
- WindowTracing.debug(
- "ReduceFnRunner.onTimer: Scheduling cleanup timer for key:{}; window:{} at {} with "
- + "inputWatermark:{}; outputWatermark:{}",
- key, directContext.window(), cleanupTime, timerInternals.currentInputWatermarkTime(),
- timerInternals.currentOutputWatermarkTime());
- directContext.timers().setTimer(cleanupTime, TimeDomain.EVENT_TIME);
- }
- }
- }
-
- /**
- * Clear all the state associated with {@code context}'s window.
- * Should only be invoked if we know all future elements for this window will be considered
- * beyond allowed lateness.
- * This is a superset of the clearing done by {@link #emitIfAppropriate} below since:
- * <ol>
- * <li>We can clear the trigger state tombstone since we'll never need to ask about it again.
- * <li>We can clear any remaining garbage collection hold.
- * </ol>
- *
- * @param directContext context for the window using DIRECT state
- * @param renamedContext context for the same window using RENAMED state
- * @param windowIsActive whether the window is in the active window set and its trigger is not
- * yet closed, as computed by the caller
- */
- private void clearAllState(
- ReduceFn<K, InputT, OutputT, W>.Context directContext,
- ReduceFn<K, InputT, OutputT, W>.Context renamedContext,
- boolean windowIsActive)
- throws Exception {
- if (windowIsActive) {
- // Since both the window is in the active window set AND the trigger was not yet closed,
- // it is possible we still have state.
- reduceFn.clearState(renamedContext);
- watermarkHold.clearHolds(renamedContext);
- nonEmptyPanes.clearPane(renamedContext.state());
- triggerRunner.clearState(
- directContext.window(), directContext.timers(), directContext.state());
- } else {
- // Needed only for backwards compatibility over UPDATE.
- // Clear any end-of-window or garbage collection holds keyed by the current window.
- // Only needed if:
- // - We have merging windows.
- // - We are DISCARDING_FIRED_PANES.
- // - A pane has fired.
- // - But the trigger is not (yet) closed.
- if (windowingStrategy.getMode() == AccumulationMode.DISCARDING_FIRED_PANES
- && !windowingStrategy.getWindowFn().isNonMerging()) {
- watermarkHold.clearHolds(directContext);
- }
- }
- // The steps below run regardless of whether the window was still active.
- paneInfoTracker.clear(directContext.state());
- if (activeWindows.isActive(directContext.window())) {
- // Don't need to track address state windows anymore.
- activeWindows.remove(directContext.window());
- }
- // We'll never need to test for the trigger being closed again.
- triggerRunner.clearFinished(directContext.state());
- }
-
- /**
- * Decides whether the reduce function state should be cleared after a pane fires.
- *
- * <p>State is discarded when this was the last firing of the trigger, or when the windowing
- * strategy is DISCARDING_FIRED_PANES so that nothing accumulates between panes.
- *
- * @param isFinished whether the trigger is closed after this firing
- * @return {@code true} if the buffered state should be cleared
- */
- private boolean shouldDiscardAfterFiring(boolean isFinished) {
- return isFinished
- || windowingStrategy.getMode() == AccumulationMode.DISCARDING_FIRED_PANES;
- }
-
- /**
- * Possibly emit a pane if a trigger is ready to fire or timers require it, and cleanup state.
- *
- * <p>Returns without doing anything when the trigger runner reports that the trigger should
- * not fire.
- *
- * @param directContext context for the window using DIRECT state
- * @param renamedContext context for the same window using RENAMED state
- */
- private void emitIfAppropriate(ReduceFn<K, InputT, OutputT, W>.Context directContext,
- ReduceFn<K, InputT, OutputT, W>.Context renamedContext)
- throws Exception {
- if (!triggerRunner.shouldFire(
- directContext.window(), directContext.timers(), directContext.state())) {
- // Ignore unless trigger is ready to fire
- return;
- }
-
- // Inform the trigger of the transition to see if it is finished
- triggerRunner.onFire(directContext.window(), directContext.timers(), directContext.state());
- boolean isFinished = triggerRunner.isClosed(directContext.state());
-
- // Will be able to clear all element state after triggering?
- boolean shouldDiscard = shouldDiscardAfterFiring(isFinished);
-
- // Run onTrigger to produce the actual pane contents.
- // As a side effect it will clear all element holds, but not necessarily any
- // end-of-window or garbage collection holds.
- onTrigger(directContext, renamedContext, isFinished);
-
- // Now that we've triggered, the pane is empty.
- nonEmptyPanes.clearPane(renamedContext.state());
-
- // Cleanup buffered data if appropriate
- if (shouldDiscard) {
- // Cleanup flavor C: The user does not want any buffered data to persist between panes.
- reduceFn.clearState(renamedContext);
- }
-
- if (isFinished) {
- // Cleanup flavor D: If trigger is closed we will ignore all new incoming elements.
- // Clear state not otherwise cleared by onTrigger and clearPane above.
- // Remember the trigger is, indeed, closed until the window is garbage collected.
- triggerRunner.clearState(
- directContext.window(), directContext.timers(), directContext.state());
- paneInfoTracker.clear(directContext.state());
- activeWindows.remove(directContext.window());
- }
- }
-
- /**
- * Decides whether a pane has to be emitted.
- *
- * <p>A pane is emitted when any of the following holds:
- * <ul>
- * <li>the pane has elements;
- * <li>the pane is the unique ON_TIME pane;
- * <li>the pane is known to be the final one and the windowing strategy asks for it even when
- * empty ({@link ClosingBehavior#FIRE_ALWAYS}).
- * </ul>
- *
- * @param isEmpty whether the pane has no elements
- * @param isFinished whether this is the final pane
- * @param timing the timing of the pane
- * @return {@code true} if the pane should be emitted
- */
- private boolean needToEmit(boolean isEmpty, boolean isFinished, PaneInfo.Timing timing) {
- return !isEmpty
- || timing == Timing.ON_TIME
- || (isFinished && windowingStrategy.getClosingBehavior() == ClosingBehavior.FIRE_ALWAYS);
- }
-
- /**
- * Run the {@link ReduceFn#onTrigger} method and produce any necessary output.
- *
- * <p>The watermark hold for the window is extracted and released, and the next pane info is
- * computed, whether or not a pane ends up being emitted. {@link ReduceFn#onTrigger} is invoked
- * only when {@link #needToEmit} says a pane is required.
- *
- * @param directContext context for the window using DIRECT state
- * @param renamedContext context for the same window using RENAMED state
- * @param isFinished whether this is the final pane for the window
- */
- private void onTrigger(
- final ReduceFn<K, InputT, OutputT, W>.Context directContext,
- ReduceFn<K, InputT, OutputT, W>.Context renamedContext,
- boolean isFinished)
- throws Exception {
- // Prefetch necessary states
- ReadableState<Instant> outputTimestampFuture =
- watermarkHold.extractAndRelease(renamedContext, isFinished).readLater();
- ReadableState<PaneInfo> paneFuture =
- paneInfoTracker.getNextPaneInfo(directContext, isFinished).readLater();
- ReadableState<Boolean> isEmptyFuture =
- nonEmptyPanes.isEmpty(renamedContext.state()).readLater();
-
- // NOTE(review): every other reduceFn call in this class is given RENAMED state, but this
- // prefetch is given DIRECT state -- confirm that this is intended.
- reduceFn.prefetchOnTrigger(directContext.state());
- triggerRunner.prefetchOnFire(directContext.window(), directContext.state());
-
- // Calculate the pane info.
- final PaneInfo pane = paneFuture.read();
- // Extract the window hold, and as a side effect clear it.
- final Instant outputTimestamp = outputTimestampFuture.read();
-
- // Only emit a pane if it has data or empty panes are observable.
- if (needToEmit(isEmptyFuture.read(), isFinished, pane.getTiming())) {
- // Run reduceFn.onTrigger method.
- final List<W> windows = Collections.singletonList(directContext.window());
- ReduceFn<K, InputT, OutputT, W>.OnTriggerContext renamedTriggerContext =
- contextFactory.forTrigger(directContext.window(), paneFuture, StateStyle.RENAMED,
- new OnTriggerCallbacks<OutputT>() {
- @Override
- public void output(OutputT toOutput) {
- // We're going to output panes, so commit the (now used) PaneInfo.
- // TODO: This is unnecessary if the trigger isFinished since the saved
- // state will be immediately deleted.
- paneInfoTracker.storeCurrentPaneInfo(directContext, pane);
-
- // Output the actual value.
- outputter.outputWindowedValue(
- KV.of(key, toOutput), outputTimestamp, windows, pane);
- }
- });
-
- reduceFn.onTrigger(renamedTriggerContext);
- }
- }
-
- /**
- * Ensures a timer will eventually fire that leads to the window state being garbage collected,
- * and returns the time at which that timer will fire. For efficiency this may be done in two
- * steps rather than one.
- *
- * <ul>
- * <li>If allowedLateness is zero the end of the window is also the garbage collection time.
- * For simplicity a timer is set here even though an {@link AfterWatermark} trigger may have
- * set an end-of-window timer as well. ({@code setTimer} is idempotent.)
- * <li>If allowedLateness is non-zero a garbage collection timer could always be set directly.
- * However with large windows (eg hourly) and a small allowedLateness (eg seconds) that would
- * nearly double the number of timers in flight. So an end-of-window timer is set instead and
- * rolled forward to a garbage collection timer when it fires. The input watermark is used to
- * distinguish those cases.
- * </ul>
- *
- * @param directContext context for the window the timer is scheduled for
- * @return the time at which the scheduled timer will fire
- */
- private Instant scheduleEndOfWindowOrGarbageCollectionTimer(
- ReduceFn<?, ?, ?, W>.Context directContext) {
- Instant inputWatermark = timerInternals.currentInputWatermarkTime();
- Instant windowEnd = directContext.window().maxTimestamp();
-
- // Once the input watermark is past the end of the window, go straight to the garbage
- // collection time; otherwise wait for the end of the window first.
- boolean pastEndOfWindow = inputWatermark != null && windowEnd.isBefore(inputWatermark);
- Instant fireTime =
- pastEndOfWindow ? windowEnd.plus(windowingStrategy.getAllowedLateness()) : windowEnd;
- String which = pastEndOfWindow ? "garbage collection" : "end-of-window";
-
- WindowTracing.trace(
- "ReduceFnRunner.scheduleEndOfWindowOrGarbageCollectionTimer: Scheduling {} timer at {} for "
- + "key:{}; window:{} where inputWatermark:{}; outputWatermark:{}",
- which,
- fireTime,
- key,
- directContext.window(),
- inputWatermark,
- timerInternals.currentOutputWatermarkTime());
- directContext.timers().setTimer(fireTime, TimeDomain.EVENT_TIME);
- return fireTime;
- }
-
- /**
- * Deletes the event-time timers this runner may have set for {@code context}'s window: the
- * end-of-window timer and, when the allowed lateness is longer than zero, the garbage
- * collection timer at end-of-window plus allowed lateness.
- *
- * @param context context for the window whose timers are deleted
- */
- private void cancelEndOfWindowAndGarbageCollectionTimers(ReduceFn<?, ?, ?, W>.Context context) {
- WindowTracing.debug(
- "ReduceFnRunner.cancelEndOfWindowAndGarbageCollectionTimers: Deleting timers for "
- + "key:{}; window:{} where inputWatermark:{}; outputWatermark:{}",
- key, context.window(), timerInternals.currentInputWatermarkTime(),
- timerInternals.currentOutputWatermarkTime());
-
- Instant endOfWindow = context.window().maxTimestamp();
- context.timers().deleteTimer(endOfWindow, TimeDomain.EVENT_TIME);
-
- if (!windowingStrategy.getAllowedLateness().isLongerThan(Duration.ZERO)) {
- // With zero allowed lateness the timer deleted above is the only one.
- return;
- }
- Instant garbageCollection = endOfWindow.plus(windowingStrategy.getAllowedLateness());
- context.timers().deleteTimer(garbageCollection, TimeDomain.EVENT_TIME);
- }
-
- /**
- * An object that can output a value with all of its windowing information. This is a deliberately
- * restricted subinterface of {@link WindowingInternals} to express how it is used here.
- *
- * @param <OutputT> the type of value being output
- */
- private interface OutputWindowedValue<OutputT> {
- /**
- * Outputs {@code output} with the given timestamp, windows and pane.
- */
- void outputWindowedValue(OutputT output, Instant timestamp,
- Collection<? extends BoundedWindow> windows, PaneInfo pane);
- }
-
- /**
- * An {@link OutputWindowedValue} that forwards every output, unchanged, to a
- * {@link WindowingInternals}.
- *
- * @param <OutputT> the type of value being output
- */
- private static class OutputViaWindowingInternals<OutputT>
- implements OutputWindowedValue<OutputT> {
-
- // Destination for every forwarded output.
- private final WindowingInternals<?, OutputT> windowingInternals;
-
- public OutputViaWindowingInternals(WindowingInternals<?, OutputT> windowingInternals) {
- this.windowingInternals = windowingInternals;
- }
-
- @Override
- public void outputWindowedValue(
- OutputT output,
- Instant timestamp,
- Collection<? extends BoundedWindow> windows,
- PaneInfo pane) {
- // Pure delegation: all four arguments are passed through as received.
- windowingInternals.outputWindowedValue(output, timestamp, windows, pane);
- }
-
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReifyTimestampAndWindowsDoFn.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReifyTimestampAndWindowsDoFn.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReifyTimestampAndWindowsDoFn.java
deleted file mode 100644
index 88a1c15..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReifyTimestampAndWindowsDoFn.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.values.KV;
-
-/**
- * DoFn that makes timestamps and window assignments explicit in the value part of each key/value
- * pair.
- *
- * @param <K> the type of the keys of the input and output {@code PCollection}s
- * @param <V> the type of the values of the input {@code PCollection}
- */
-@SystemDoFnInternal
-public class ReifyTimestampAndWindowsDoFn<K, V>
- extends DoFn<KV<K, V>, KV<K, WindowedValue<V>>> {
- @Override
- public void processElement(ProcessContext c)
- throws Exception {
- KV<K, V> kv = c.element();
- K key = kv.getKey();
- V value = kv.getValue();
- c.output(KV.of(
- key,
- WindowedValue.of(
- value,
- c.timestamp(),
- c.windowingInternals().windows(),
- c.pane())));
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Reshuffle.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Reshuffle.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Reshuffle.java
deleted file mode 100644
index 367db2d..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Reshuffle.java
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.NonMergingWindowFn;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-
-import java.util.Collection;
-
-/**
- * A {@link PTransform} that returns a {@link PCollection} equivalent to its input but operationally
- * provides some of the side effects of a {@link GroupByKey}, in particular preventing fusion of
- * the surrounding transforms, checkpointing and deduplication by id (see
- * {@link ValueWithRecordId}).
- *
- * <p>Performs a {@link GroupByKey} so that the data is key-partitioned. Configures the
- * {@link WindowingStrategy} so that no data is dropped, but doesn't affect the need for
- * the user to specify allowed lateness and accumulation mode before a user-inserted GroupByKey.
- *
- * @param <K> The type of key being reshuffled on.
- * @param <V> The type of value being reshuffled.
- */
-public class Reshuffle<K, V> extends PTransform<PCollection<KV<K, V>>, PCollection<KV<K, V>>> {
-
- private Reshuffle() {
- }
-
- public static <K, V> Reshuffle<K, V> of() {
- return new Reshuffle<K, V>();
- }
-
- @Override
- public PCollection<KV<K, V>> apply(PCollection<KV<K, V>> input) {
- WindowingStrategy<?, ?> originalStrategy = input.getWindowingStrategy();
- // If the input has already had its windows merged, then the GBK that performed the merge
- // will have set originalStrategy.getWindowFn() to InvalidWindows, causing the GBK contained
- // here to fail. Instead, we install a valid WindowFn that leaves all windows unchanged.
- Window.Bound<KV<K, V>> rewindow = Window
- .<KV<K, V>>into(new PassThroughWindowFn<>(originalStrategy.getWindowFn()))
- .triggering(new ReshuffleTrigger<>())
- .discardingFiredPanes()
- .withAllowedLateness(Duration.millis(BoundedWindow.TIMESTAMP_MAX_VALUE.getMillis()));
-
- return input.apply(rewindow)
- .apply(GroupByKey.<K, V>create())
- // Set the windowing strategy directly, so that it doesn't get counted as the user having
- // set allowed lateness.
- .setWindowingStrategyInternal(originalStrategy)
- .apply(ParDo.named("ExpandIterable").of(
- new DoFn<KV<K, Iterable<V>>, KV<K, V>>() {
- @Override
- public void processElement(ProcessContext c) {
- K key = c.element().getKey();
- for (V value : c.element().getValue()) {
- c.output(KV.of(key, value));
- }
- }
- }));
- }
-
- /**
- * A {@link WindowFn} that leaves all associations between elements and windows unchanged.
- *
- * <p>In order to implement all the abstract methods of {@link WindowFn}, this requires the
- * prior {@link WindowFn}, to which all auxiliary functionality is delegated.
- */
- private static class PassThroughWindowFn<T> extends NonMergingWindowFn<T, BoundedWindow> {
-
- /** The WindowFn prior to this. Used for its windowCoder, etc. */
- private final WindowFn<?, BoundedWindow> priorWindowFn;
-
- public PassThroughWindowFn(WindowFn<?, ?> priorWindowFn) {
- // Safe because it is only used privately here.
- // At every point where a window is returned or accepted, it has been provided
- // by priorWindowFn, so it is of the type expected.
- @SuppressWarnings("unchecked")
- WindowFn<?, BoundedWindow> internalWindowFn = (WindowFn<?, BoundedWindow>) priorWindowFn;
- this.priorWindowFn = internalWindowFn;
- }
-
- @Override
- public Collection<BoundedWindow> assignWindows(WindowFn<T, BoundedWindow>.AssignContext c)
- throws Exception {
- // The windows are provided by priorWindowFn, which also provides the coder for them
- @SuppressWarnings("unchecked")
- Collection<BoundedWindow> priorWindows = (Collection<BoundedWindow>) c.windows();
- return priorWindows;
- }
-
- @Override
- public boolean isCompatible(WindowFn<?, ?> other) {
- throw new UnsupportedOperationException(
- String.format("%s.isCompatible() should never be called."
- + " It is a private implementation detail of Reshuffle."
- + " This message indicates a bug in the Dataflow SDK.",
- getClass().getCanonicalName()));
- }
-
- @Override
- public Coder<BoundedWindow> windowCoder() {
- // Safe because priorWindowFn provides the windows also.
- // The Coder is _not_ actually a coder for an arbitrary BoundedWindow.
- return priorWindowFn.windowCoder();
- }
-
- @Override
- public BoundedWindow getSideInputWindow(BoundedWindow window) {
- throw new UnsupportedOperationException(
- String.format("%s.getSideInputWindow() should never be called."
- + " It is a private implementation detail of Reshuffle."
- + " This message indicates a bug in the Dataflow SDK.",
- getClass().getCanonicalName()));
- }
-
- @Override
- public Instant getOutputTime(Instant inputTimestamp, BoundedWindow window) {
- return inputTimestamp;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReshuffleTrigger.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReshuffleTrigger.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReshuffleTrigger.java
deleted file mode 100644
index 248f005..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ReshuffleTrigger.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Trigger;
-
-import org.joda.time.Instant;
-
-import java.util.List;
-
-/**
- * The trigger used with {@link Reshuffle} which triggers on every element
- * and never buffers state.
- *
- * @param <W> The kind of window that is being reshuffled.
- */
-public class ReshuffleTrigger<W extends BoundedWindow> extends Trigger<W> {
-
- ReshuffleTrigger() {
- super(null);
- }
-
- @Override
- public void onElement(Trigger<W>.OnElementContext c) { }
-
- @Override
- public void onMerge(Trigger<W>.OnMergeContext c) { }
-
- @Override
- protected Trigger<W> getContinuationTrigger(List<Trigger<W>> continuationTriggers) {
- return this;
- }
-
- @Override
- public Instant getWatermarkThatGuaranteesFiring(W window) {
- throw new UnsupportedOperationException(
- "ReshuffleTrigger should not be used outside of Reshuffle");
- }
-
- @Override
- public boolean shouldFire(Trigger<W>.TriggerContext context) throws Exception {
- return true;
- }
-
- @Override
- public void onFire(Trigger<W>.TriggerContext context) throws Exception { }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/RetryHttpRequestInitializer.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/RetryHttpRequestInitializer.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/RetryHttpRequestInitializer.java
deleted file mode 100644
index 756dce0..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/RetryHttpRequestInitializer.java
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.client.http.HttpBackOffIOExceptionHandler;
-import com.google.api.client.http.HttpBackOffUnsuccessfulResponseHandler;
-import com.google.api.client.http.HttpRequest;
-import com.google.api.client.http.HttpRequestInitializer;
-import com.google.api.client.http.HttpResponse;
-import com.google.api.client.http.HttpResponseInterceptor;
-import com.google.api.client.http.HttpUnsuccessfulResponseHandler;
-import com.google.api.client.util.BackOff;
-import com.google.api.client.util.ExponentialBackOff;
-import com.google.api.client.util.NanoClock;
-import com.google.api.client.util.Sleeper;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import javax.annotation.Nullable;
-
-/**
- * Implements a request initializer that adds retry handlers to all
- * HttpRequests.
- *
- * <p>This allows chaining through to another HttpRequestInitializer, since
- * clients have exactly one HttpRequestInitializer, and Credential is also
- * a required HttpRequestInitializer.
- *
- * <p>Also can take a HttpResponseInterceptor to be applied to the responses.
- */
-public class RetryHttpRequestInitializer implements HttpRequestInitializer {
-
- private static final Logger LOG = LoggerFactory.getLogger(RetryHttpRequestInitializer.class);
-
- /**
- * Http response codes that should be silently ignored.
- */
- private static final Set<Integer> DEFAULT_IGNORED_RESPONSE_CODES = new HashSet<>(
- Arrays.asList(307 /* Redirect, handled by the client library */,
- 308 /* Resume Incomplete, handled by the client library */));
-
- /**
- * Http response timeout to use for hanging gets.
- */
- private static final int HANGING_GET_TIMEOUT_SEC = 80;
-
- private static class LoggingHttpBackOffIOExceptionHandler
- extends HttpBackOffIOExceptionHandler {
- public LoggingHttpBackOffIOExceptionHandler(BackOff backOff) {
- super(backOff);
- }
-
- @Override
- public boolean handleIOException(HttpRequest request, boolean supportsRetry)
- throws IOException {
- boolean willRetry = super.handleIOException(request, supportsRetry);
- if (willRetry) {
- LOG.debug("Request failed with IOException, will retry: {}", request.getUrl());
- } else {
- LOG.warn("Request failed with IOException, will NOT retry: {}", request.getUrl());
- }
- return willRetry;
- }
- }
-
- private static class LoggingHttpBackoffUnsuccessfulResponseHandler
- implements HttpUnsuccessfulResponseHandler {
- private final HttpBackOffUnsuccessfulResponseHandler handler;
- private final Set<Integer> ignoredResponseCodes;
-
- public LoggingHttpBackoffUnsuccessfulResponseHandler(BackOff backoff,
- Sleeper sleeper, Set<Integer> ignoredResponseCodes) {
- this.ignoredResponseCodes = ignoredResponseCodes;
- handler = new HttpBackOffUnsuccessfulResponseHandler(backoff);
- handler.setSleeper(sleeper);
- handler.setBackOffRequired(
- new HttpBackOffUnsuccessfulResponseHandler.BackOffRequired() {
- @Override
- public boolean isRequired(HttpResponse response) {
- int statusCode = response.getStatusCode();
- return (statusCode / 100 == 5) || // 5xx: server error
- statusCode == 429; // 429: Too many requests
- }
- });
- }
-
- @Override
- public boolean handleResponse(HttpRequest request, HttpResponse response,
- boolean supportsRetry) throws IOException {
- boolean retry = handler.handleResponse(request, response, supportsRetry);
- if (retry) {
- LOG.debug("Request failed with code {} will retry: {}",
- response.getStatusCode(), request.getUrl());
-
- } else if (!ignoredResponseCodes.contains(response.getStatusCode())) {
- LOG.warn("Request failed with code {}, will NOT retry: {}",
- response.getStatusCode(), request.getUrl());
- }
-
- return retry;
- }
- }
-
- @Deprecated
- private final HttpRequestInitializer chained;
-
- private final HttpResponseInterceptor responseInterceptor; // response Interceptor to use
-
- private final NanoClock nanoClock; // used for testing
-
- private final Sleeper sleeper; // used for testing
-
- private Set<Integer> ignoredResponseCodes = new HashSet<>(DEFAULT_IGNORED_RESPONSE_CODES);
-
- public RetryHttpRequestInitializer() {
- this(Collections.<Integer>emptyList());
- }
-
- /**
- * @param chained a downstream HttpRequestInitializer, which will also be
- * applied to HttpRequest initialization. May be null.
- *
- * @deprecated use {@link #RetryHttpRequestInitializer}.
- */
- @Deprecated
- public RetryHttpRequestInitializer(@Nullable HttpRequestInitializer chained) {
- this(chained, Collections.<Integer>emptyList());
- }
-
- /**
- * @param additionalIgnoredResponseCodes a list of HTTP status codes that should not be logged.
- */
- public RetryHttpRequestInitializer(Collection<Integer> additionalIgnoredResponseCodes) {
- this(additionalIgnoredResponseCodes, null);
- }
-
-
- /**
- * @param chained a downstream HttpRequestInitializer, which will also be
- * applied to HttpRequest initialization. May be null.
- * @param additionalIgnoredResponseCodes a list of HTTP status codes that should not be logged.
- *
- * @deprecated use {@link #RetryHttpRequestInitializer(Collection)}.
- */
- @Deprecated
- public RetryHttpRequestInitializer(@Nullable HttpRequestInitializer chained,
- Collection<Integer> additionalIgnoredResponseCodes) {
- this(chained, additionalIgnoredResponseCodes, null);
- }
-
- /**
- * @param additionalIgnoredResponseCodes a list of HTTP status codes that should not be logged.
- * @param responseInterceptor HttpResponseInterceptor to be applied on all requests. May be null.
- */
- public RetryHttpRequestInitializer(
- Collection<Integer> additionalIgnoredResponseCodes,
- @Nullable HttpResponseInterceptor responseInterceptor) {
- this(null, NanoClock.SYSTEM, Sleeper.DEFAULT, additionalIgnoredResponseCodes,
- responseInterceptor);
- }
-
- /**
- * @param chained a downstream HttpRequestInitializer, which will also be applied to HttpRequest
- * initialization. May be null.
- * @param additionalIgnoredResponseCodes a list of HTTP status codes that should not be logged.
- * @param responseInterceptor HttpResponseInterceptor to be applied on all requests. May be null.
- *
- * @deprecated use {@link #RetryHttpRequestInitializer(Collection, HttpResponseInterceptor)}.
- */
- @Deprecated
- public RetryHttpRequestInitializer(
- @Nullable HttpRequestInitializer chained,
- Collection<Integer> additionalIgnoredResponseCodes,
- @Nullable HttpResponseInterceptor responseInterceptor) {
- this(chained, NanoClock.SYSTEM, Sleeper.DEFAULT, additionalIgnoredResponseCodes,
- responseInterceptor);
- }
-
- /**
- * Visible for testing.
- *
- * @param chained a downstream HttpRequestInitializer, which will also be
- * applied to HttpRequest initialization. May be null.
- * @param nanoClock used as a timing source for knowing how much time has elapsed.
- * @param sleeper used to sleep between retries.
- * @param additionalIgnoredResponseCodes a list of HTTP status codes that should not be logged.
- */
- RetryHttpRequestInitializer(@Nullable HttpRequestInitializer chained,
- NanoClock nanoClock, Sleeper sleeper, Collection<Integer> additionalIgnoredResponseCodes,
- HttpResponseInterceptor responseInterceptor) {
- this.chained = chained;
- this.nanoClock = nanoClock;
- this.sleeper = sleeper;
- this.ignoredResponseCodes.addAll(additionalIgnoredResponseCodes);
- this.responseInterceptor = responseInterceptor;
- }
-
- @Override
- public void initialize(HttpRequest request) throws IOException {
- if (chained != null) {
- chained.initialize(request);
- }
-
- // Set a timeout for hanging-gets.
- // TODO: Do this exclusively for work requests.
- request.setReadTimeout(HANGING_GET_TIMEOUT_SEC * 1000);
-
- // Back off on retryable http errors.
- request.setUnsuccessfulResponseHandler(
- // A back-off multiplier of 2 raises the maximum request retrying time
- // to approximately 5 minutes (keeping other back-off parameters to
- // their default values).
- new LoggingHttpBackoffUnsuccessfulResponseHandler(
- new ExponentialBackOff.Builder().setNanoClock(nanoClock)
- .setMultiplier(2).build(),
- sleeper, ignoredResponseCodes));
-
- // Retry immediately on IOExceptions.
- LoggingHttpBackOffIOExceptionHandler loggingBackoffHandler =
- new LoggingHttpBackOffIOExceptionHandler(BackOff.ZERO_BACKOFF);
- request.setIOExceptionHandler(loggingBackoffHandler);
-
- // Set response initializer
- if (responseInterceptor != null) {
- request.setResponseInterceptor(responseInterceptor);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SerializableUtils.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SerializableUtils.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SerializableUtils.java
deleted file mode 100644
index 501b430..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SerializableUtils.java
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import static com.google.cloud.dataflow.sdk.util.CoderUtils.decodeFromByteArray;
-import static com.google.cloud.dataflow.sdk.util.CoderUtils.encodeToByteArray;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.common.base.Preconditions;
-
-import org.xerial.snappy.SnappyInputStream;
-import org.xerial.snappy.SnappyOutputStream;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.ObjectInputStream;
-import java.io.ObjectOutputStream;
-import java.io.Serializable;
-import java.util.Arrays;
-
-/**
- * Utilities for working with Serializables.
- */
-public class SerializableUtils {
- /**
- * Serializes the argument into an array of bytes, and returns it.
- *
- * @throws IllegalArgumentException if there are errors when serializing
- */
- public static byte[] serializeToByteArray(Serializable value) {
- try {
- ByteArrayOutputStream buffer = new ByteArrayOutputStream();
- try (ObjectOutputStream oos = new ObjectOutputStream(new SnappyOutputStream(buffer))) {
- oos.writeObject(value);
- }
- return buffer.toByteArray();
- } catch (IOException exn) {
- throw new IllegalArgumentException(
- "unable to serialize " + value,
- exn);
- }
- }
-
- /**
- * Deserializes an object from the given array of bytes, e.g., as
- * serialized using {@link #serializeToByteArray}, and returns it.
- *
- * @throws IllegalArgumentException if there are errors when
- * deserializing, using the provided description to identify what
- * was being deserialized
- */
- public static Object deserializeFromByteArray(byte[] encodedValue,
- String description) {
- try {
- try (ObjectInputStream ois = new ObjectInputStream(
- new SnappyInputStream(new ByteArrayInputStream(encodedValue)))) {
- return ois.readObject();
- }
- } catch (IOException | ClassNotFoundException exn) {
- throw new IllegalArgumentException(
- "unable to deserialize " + description,
- exn);
- }
- }
-
- public static <T extends Serializable> T ensureSerializable(T value) {
- @SuppressWarnings("unchecked")
- T copy = (T) deserializeFromByteArray(serializeToByteArray(value),
- value.toString());
- return copy;
- }
-
- public static <T extends Serializable> T clone(T value) {
- @SuppressWarnings("unchecked")
- T copy = (T) deserializeFromByteArray(serializeToByteArray(value),
- value.toString());
- return copy;
- }
-
- /**
- * Serializes a Coder and verifies that it can be correctly deserialized.
- *
- * <p>Throws a RuntimeException if serialized Coder cannot be deserialized, or
- * if the deserialized instance is not equal to the original.
- *
- * @return the serialized Coder, as a {@link CloudObject}
- */
- public static CloudObject ensureSerializable(Coder<?> coder) {
- // Make sure that Coders are java serializable as well since
- // they are regularly captured within DoFn's.
- Coder<?> copy = (Coder<?>) ensureSerializable((Serializable) coder);
-
- CloudObject cloudObject = copy.asCloudObject();
-
- Coder<?> decoded;
- try {
- decoded = Serializer.deserialize(cloudObject, Coder.class);
- } catch (RuntimeException e) {
- throw new RuntimeException(
- String.format("Unable to deserialize Coder: %s. "
- + "Check that a suitable constructor is defined. "
- + "See Coder for details.", coder), e
- );
- }
- Preconditions.checkState(coder.equals(decoded),
- String.format("Coder not equal to original after serialization, "
- + "indicating that the Coder may not implement serialization "
- + "correctly. Before: %s, after: %s, cloud encoding: %s",
- coder, decoded, cloudObject));
-
- return cloudObject;
- }
-
- /**
- * Serializes an arbitrary T with the given {@code Coder<T>} and verifies
- * that it can be correctly deserialized.
- */
- public static <T> T ensureSerializableByCoder(
- Coder<T> coder, T value, String errorContext) {
- byte[] encodedValue;
- try {
- encodedValue = encodeToByteArray(coder, value);
- } catch (CoderException exn) {
- // TODO: Put in better element printing:
- // truncate if too long.
- throw new IllegalArgumentException(
- errorContext + ": unable to encode value "
- + value + " using " + coder,
- exn);
- }
- try {
- return decodeFromByteArray(coder, encodedValue);
- } catch (CoderException exn) {
- // TODO: Put in better encoded byte array printing:
- // use printable chars with escapes instead of codes, and
- // truncate if too long.
- throw new IllegalArgumentException(
- errorContext + ": unable to decode " + Arrays.toString(encodedValue)
- + ", encoding of value " + value + ", using " + coder,
- exn);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Serializer.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Serializer.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Serializer.java
deleted file mode 100644
index 6a8a337..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Serializer.java
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.fasterxml.jackson.core.JsonProcessingException;
-import com.fasterxml.jackson.databind.DeserializationFeature;
-import com.fasterxml.jackson.databind.ObjectMapper;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import javax.annotation.Nullable;
-
-/**
- * Utility for converting objects between Java and Cloud representations.
- */
-public final class Serializer {
- // Delay initialization of statics until the first call to Serializer.
- private static class SingletonHelper {
- static final ObjectMapper OBJECT_MAPPER = createObjectMapper();
- static final ObjectMapper TREE_MAPPER = createTreeMapper();
-
- /**
- * Creates the object mapper that will be used for serializing Google API
- * client maps into Jackson trees.
- */
- private static ObjectMapper createTreeMapper() {
- return new ObjectMapper();
- }
-
- /**
- * Creates the object mapper that will be used for deserializing Jackson
- * trees into objects.
- */
- private static ObjectMapper createObjectMapper() {
- ObjectMapper m = new ObjectMapper();
- // Ignore properties that are not used by the object.
- m.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES);
-
- // For parameters of type Object, use the @type property to determine the
- // class to instantiate.
- //
- // TODO: It would be ideal to do this for all non-final classes. The
- // problem with using DefaultTyping.NON_FINAL is that it insists on having
- // type information in the JSON for classes with useful default
- // implementations, such as List. Ideally, we'd combine these defaults
- // with available type information if that information's present.
- m.enableDefaultTypingAsProperty(
- ObjectMapper.DefaultTyping.JAVA_LANG_OBJECT,
- PropertyNames.OBJECT_TYPE_NAME);
-
- m.registerModule(new CoderUtils.Jackson2Module());
-
- return m;
- }
- }
-
- /**
- * Deserializes an object from a Dataflow structured encoding (represented in
- * Java as a map).
- *
- * <p>The standard Dataflow SDK object serialization protocol is based on JSON.
- * Data is typically encoded as a JSON object whose fields represent the
- * object's data.
- *
- * <p>The actual deserialization is performed by Jackson, which can deserialize
- * public fields, use JavaBean setters, or use injection annotations to
- * indicate how to construct the object. The {@link ObjectMapper} used is
- * configured to use the "@type" field as the name of the class to instantiate
- * (supporting polymorphic types), and may be further configured by
- * annotations or via {@link ObjectMapper#registerModule}.
- *
- * @see <a href="http://wiki.fasterxml.com/JacksonFAQ#Data_Binding.2C_general">
- * Jackson Data-Binding</a>
- * @see <a href="https://github.com/FasterXML/jackson-annotations/wiki/Jackson-Annotations">
- * Jackson-Annotations</a>
- * @param serialized the object in untyped decoded form (i.e. a nested {@link Map})
- * @param clazz the expected object class
- */
- public static <T> T deserialize(Map<String, Object> serialized, Class<T> clazz) {
- try {
- return SingletonHelper.OBJECT_MAPPER.treeToValue(
- SingletonHelper.TREE_MAPPER.valueToTree(
- deserializeCloudKnownTypes(serialized)),
- clazz);
- } catch (JsonProcessingException e) {
- throw new RuntimeException(
- "Unable to deserialize class " + clazz, e);
- }
- }
-
- /**
- * Recursively walks the supplied map, looking for well-known cloud type
- * information (keyed as {@link PropertyNames#OBJECT_TYPE_NAME}, matching a
- * URI value from the {@link CloudKnownType} enum. Upon finding this type
- * information, it converts it into the correspondingly typed Java value.
- */
- @SuppressWarnings("unchecked")
- private static Object deserializeCloudKnownTypes(Object src) {
- if (src instanceof Map) {
- Map<String, Object> srcMap = (Map<String, Object>) src;
- @Nullable Object value = srcMap.get(PropertyNames.SCALAR_FIELD_NAME);
- @Nullable CloudKnownType type =
- CloudKnownType.forUri((String) srcMap.get(PropertyNames.OBJECT_TYPE_NAME));
- if (type != null && value != null) {
- // It's a value of a well-known cloud type; let the known type handler
- // handle the translation.
- Object result = type.parse(value, type.defaultClass());
- return result;
- }
- // Otherwise, it's just an ordinary map.
- Map<String, Object> dest = new HashMap<>(srcMap.size());
- for (Map.Entry<String, Object> entry : srcMap.entrySet()) {
- dest.put(entry.getKey(), deserializeCloudKnownTypes(entry.getValue()));
- }
- return dest;
- }
- if (src instanceof List) {
- List<Object> srcList = (List<Object>) src;
- List<Object> dest = new ArrayList<>(srcList.size());
- for (Object obj : srcList) {
- dest.add(deserializeCloudKnownTypes(obj));
- }
- return dest;
- }
- // Neither a Map nor a List; no translation needed.
- return src;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ShardingWritableByteChannel.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ShardingWritableByteChannel.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ShardingWritableByteChannel.java
deleted file mode 100644
index 54794ef..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ShardingWritableByteChannel.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.channels.WritableByteChannel;
-import java.util.ArrayList;
-
-/**
- * Implements a {@link WritableByteChannel} that fans writes out to multiple output shards.
- *
- * <p>This provides {@link #writeToShard}, which takes a shard number for
- * writing to a particular shard, or {@link #ALL_SHARDS} for writing the same
- * bytes to every shard.
- *
- * <p>The channel is considered open if all downstream channels are open, and
- * closes all downstream channels when closed.
- *
- * <p>This class performs no synchronization of its own.
- */
-public class ShardingWritableByteChannel implements WritableByteChannel {
-
-  /**
-   * Special shard number that causes a write to all shards.
-   */
-  public static final int ALL_SHARDS = -2;
-
-  /** Downstream channels; a shard's number is its index in this list. */
-  private final ArrayList<WritableByteChannel> writers = new ArrayList<>();
-
-  /**
-   * Returns the number of output shards.
-   */
-  public int getNumShards() {
-    return writers.size();
-  }
-
-  /**
-   * Adds another shard output channel. The new shard's number is the value
-   * {@link #getNumShards} returned before this call.
-   */
-  public void addChannel(WritableByteChannel writer) {
-    writers.add(writer);
-  }
-
-  /**
-   * Returns the WritableByteChannel associated with the given shard number.
-   *
-   * @throws IndexOutOfBoundsException if no shard has that number
-   */
-  public WritableByteChannel getChannel(int shardNum) {
-    return writers.get(shardNum);
-  }
-
-  /**
-   * Writes the buffer to the given shard.
-   *
-   * <p>This does not change the current output shard.
-   *
-   * <p>When {@code shardNum} is {@link #ALL_SHARDS}, every shard is offered the
-   * same bytes (the remaining content of {@code src} at the time of the call),
-   * and {@code src} is advanced by the largest number of bytes any single shard
-   * accepted.
-   *
-   * @param shardNum a non-negative shard number, or {@link #ALL_SHARDS}
-   * @param src the bytes to write
-   * @return The total number of bytes written. If the shard number is
-   *     {@link #ALL_SHARDS}, then the total is the sum of each individual shard
-   *     write.
-   * @throws IllegalArgumentException if {@code shardNum} is negative and not
-   *     {@link #ALL_SHARDS}
-   * @throws IOException if a downstream channel fails
-   */
-  public int writeToShard(int shardNum, ByteBuffer src) throws IOException {
-    if (shardNum >= 0) {
-      return writers.get(shardNum).write(src);
-    }
-    if (shardNum != ALL_SHARDS) {
-      throw new IllegalArgumentException("Illegal shard number: " + shardNum);
-    }
-
-    // Handing src itself to each shard would let the first shard drain it and
-    // leave nothing for the rest. Each shard therefore gets its own duplicate,
-    // which shares the content but has an independent position.
-    int start = src.position();
-    int total = 0;
-    int maxConsumed = 0;
-    for (WritableByteChannel writer : writers) {
-      int written = writer.write(src.duplicate());
-      total += written;
-      maxConsumed = Math.max(maxConsumed, written);
-    }
-    // Consume the caller's buffer once, as a single-channel write would have.
-    src.position(start + maxConsumed);
-    return total;
-  }
-
-  /**
-   * Writes a buffer to all shards.
-   *
-   * <p>Same as calling {@code writeToShard(ALL_SHARDS, buf)}.
-   */
-  @Override
-  public int write(ByteBuffer src) throws IOException {
-    return writeToShard(ALL_SHARDS, src);
-  }
-
-  /**
-   * Returns true only if every downstream channel is open (and therefore also
-   * when there are no shards at all).
-   */
-  @Override
-  public boolean isOpen() {
-    for (WritableByteChannel writer : writers) {
-      if (!writer.isOpen()) {
-        return false;
-      }
-    }
-
-    return true;
-  }
-
-  /**
-   * Closes every downstream channel, even if some of them fail to close.
-   *
-   * @throws IOException the first failure encountered; later failures are
-   *     attached to it as suppressed exceptions
-   */
-  @Override
-  public void close() throws IOException {
-    IOException failure = null;
-    for (WritableByteChannel writer : writers) {
-      try {
-        writer.close();
-      } catch (IOException e) {
-        // Keep going so one bad shard cannot leak the others.
-        if (failure == null) {
-          failure = e;
-        } else {
-          failure.addSuppressed(e);
-        }
-      }
-    }
-    if (failure != null) {
-      throw failure;
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SideInputReader.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SideInputReader.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SideInputReader.java
deleted file mode 100644
index 37873f3..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SideInputReader.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-
-import javax.annotation.Nullable;
-
-/**
- * The interface to objects that provide side inputs. Particular implementations
- * may read a side input directly or use appropriate sorts of caching, etc.
- */
-public interface SideInputReader {
- /**
- * Returns the value of the given {@link PCollectionView} for the given {@link BoundedWindow}.
- *
- * <p>It is valid for a side input to be {@code null}. It is <i>not</i> valid for this to
- * return {@code null} for any other reason.
- *
- * @param view the side input view whose value is requested
- * @param window the window for which the value is requested
- * @return the side input's value, which may itself be {@code null}
- */
- @Nullable
- <T> T get(PCollectionView<T> view, BoundedWindow window);
-
- /**
- * Returns true if the given {@link PCollectionView} is valid for this reader.
- *
- * @param view the side input view to check
- */
- <T> boolean contains(PCollectionView<T> view);
-
- /**
- * Returns true if there are no side inputs in this reader.
- */
- boolean isEmpty();
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SimpleDoFnRunner.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SimpleDoFnRunner.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SimpleDoFnRunner.java
deleted file mode 100644
index 15a5e51..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/SimpleDoFnRunner.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.util.DoFnRunners.OutputManager;
-import com.google.cloud.dataflow.sdk.util.ExecutionContext.StepContext;
-import com.google.cloud.dataflow.sdk.util.common.CounterSet.AddCounterMutator;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-
-import java.util.List;
-
-/**
- * Runs a {@link DoFn} by constructing the appropriate contexts and passing them in.
- *
- * @param <InputT> the type of the DoFn's (main) input elements
- * @param <OutputT> the type of the DoFn's (main) output elements
- */
-public class SimpleDoFnRunner<InputT, OutputT> extends DoFnRunnerBase<InputT, OutputT> {
-
-  /**
-   * Creates a runner; every argument is handed to the {@link DoFnRunnerBase}
-   * constructor unchanged.
-   */
-  protected SimpleDoFnRunner(
-      PipelineOptions options,
-      DoFn<InputT, OutputT> fn,
-      SideInputReader sideInputReader,
-      OutputManager outputManager,
-      TupleTag<OutputT> mainOutputTag,
-      List<TupleTag<?>> sideOutputTags,
-      StepContext stepContext,
-      AddCounterMutator addCounterMutator,
-      WindowingStrategy<?, ?> windowingStrategy) {
-    super(
-        options, fn, sideInputReader, outputManager, mainOutputTag, sideOutputTags,
-        stepContext, addCounterMutator, windowingStrategy);
-  }
-
-  /**
-   * Builds a process context for {@code elem} and passes it to the wrapped {@link DoFn}.
-   */
-  @Override
-  protected void invokeProcessElement(WindowedValue<InputT> elem) {
-    final DoFn<InputT, OutputT>.ProcessContext context = createProcessContext(elem);
-    try {
-      // User-supplied code runs here, so anything it throws is wrapped before propagating.
-      fn.processElement(context);
-    } catch (Exception userFailure) {
-      throw wrapUserCodeException(userFailure);
-    }
-  }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Stager.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Stager.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Stager.java
deleted file mode 100644
index 04fd599..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Stager.java
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.services.dataflow.model.DataflowPackage;
-
-import java.util.List;
-
-/**
- * Interface for staging files needed for running a Dataflow pipeline.
- */
-public interface Stager {
-  /**
-   * Stages files and returns a list of packages.
-   */
-  List<DataflowPackage> stageFiles();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/StreamUtils.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/StreamUtils.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/StreamUtils.java
deleted file mode 100644
index 268eb7f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/StreamUtils.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.lang.ref.SoftReference;
-
-/**
- * Utility functions for stream operations.
- */
-public class StreamUtils {
-
-  private StreamUtils() {
-  }
-
-  private static final int BUF_SIZE = 8192;
-
-  private static ThreadLocal<SoftReference<byte[]>> threadLocalBuffer = new ThreadLocal<>();
-
-  /**
-   * Reads the remaining contents of {@code stream} into a byte array.
-   *
-   * <p>{@link ExposedByteArrayInputStream} and {@link ByteArrayInputStream} take
-   * dedicated fast paths; any other stream is copied through a per-thread scratch
-   * buffer.
-   */
-  public static byte[] getBytes(InputStream stream) throws IOException {
-    if (stream instanceof ExposedByteArrayInputStream) {
-      // The exposed variant can hand over its contents directly.
-      return ((ExposedByteArrayInputStream) stream).readAll();
-    }
-    if (stream instanceof ByteArrayInputStream) {
-      // Size the result from available() and fill it with a single read.
-      byte[] contents = new byte[stream.available()];
-      stream.read(contents);
-      return contents;
-    }
-
-    // General case: pump the stream through the scratch buffer until end of stream.
-    byte[] scratch = scratchBuffer();
-    ByteArrayOutputStream collected = new ByteArrayOutputStream();
-    for (int count = stream.read(scratch); count != -1; count = stream.read(scratch)) {
-      collected.write(scratch, 0, count);
-    }
-    return collected.toByteArray();
-  }
-
-  /**
-   * Returns the calling thread's scratch buffer, allocating and registering a new
-   * one when none exists or the softly-referenced one has been cleared.
-   */
-  private static byte[] scratchBuffer() {
-    SoftReference<byte[]> ref = threadLocalBuffer.get();
-    byte[] scratch = (ref != null) ? ref.get() : null;
-    if (scratch == null) {
-      scratch = new byte[BUF_SIZE];
-      threadLocalBuffer.set(new SoftReference<byte[]>(scratch));
-    }
-    return scratch;
-  }
-
-}
[67/67] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
Directory reorganization
Move Java-specific archetypes from "maven-archetypes/" into "sdks/java/maven-archetypes/".
Project: http://git-wip-us.apache.org/repos/asf/incubator-beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-beam/commit/257a7a6b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-beam/tree/257a7a6b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-beam/diff/257a7a6b
Branch: refs/heads/master
Commit: 257a7a6be0cce4d08be749af159ec8a6adb7ceb9
Parents: d4233aa
Author: Davor Bonaci <da...@google.com>
Authored: Wed Mar 23 17:47:11 2016 -0700
Committer: Davor Bonaci <da...@google.com>
Committed: Wed Mar 23 18:33:33 2016 -0700
----------------------------------------------------------------------
maven-archetypes/examples/pom.xml | 56 ---
.../META-INF/maven/archetype-metadata.xml | 29 --
.../main/resources/archetype-resources/pom.xml | 204 ----------
.../src/main/java/DebuggingWordCount.java | 182 ---------
.../src/main/java/MinimalWordCount.java | 115 ------
.../src/main/java/WindowedWordCount.java | 262 ------------
.../src/main/java/WordCount.java | 204 ----------
.../java/common/DataflowExampleOptions.java | 29 --
.../main/java/common/DataflowExampleUtils.java | 398 -------------------
.../common/ExampleBigQueryTableOptions.java | 53 ---
.../java/common/ExamplePubsubTopicOptions.java | 49 ---
.../main/java/common/PubsubFileInjector.java | 153 -------
.../src/test/java/DebuggingWordCountTest.java | 44 --
.../src/test/java/WordCountTest.java | 85 ----
.../projects/basic/archetype.properties | 5 -
.../src/test/resources/projects/basic/goal.txt | 1 -
maven-archetypes/pom.xml | 41 --
maven-archetypes/starter/pom.xml | 57 ---
.../META-INF/maven/archetype-metadata.xml | 21 -
.../main/resources/archetype-resources/pom.xml | 43 --
.../src/main/java/StarterPipeline.java | 67 ----
.../projects/basic/archetype.properties | 5 -
.../src/test/resources/projects/basic/goal.txt | 1 -
.../resources/projects/basic/reference/pom.xml | 43 --
.../src/main/java/it/pkg/StarterPipeline.java | 67 ----
pom.xml | 2 +-
sdks/java/maven-archetypes/examples/pom.xml | 56 +++
.../META-INF/maven/archetype-metadata.xml | 29 ++
.../main/resources/archetype-resources/pom.xml | 204 ++++++++++
.../src/main/java/DebuggingWordCount.java | 182 +++++++++
.../src/main/java/MinimalWordCount.java | 115 ++++++
.../src/main/java/WindowedWordCount.java | 262 ++++++++++++
.../src/main/java/WordCount.java | 204 ++++++++++
.../java/common/DataflowExampleOptions.java | 29 ++
.../main/java/common/DataflowExampleUtils.java | 398 +++++++++++++++++++
.../common/ExampleBigQueryTableOptions.java | 53 +++
.../java/common/ExamplePubsubTopicOptions.java | 49 +++
.../main/java/common/PubsubFileInjector.java | 153 +++++++
.../src/test/java/DebuggingWordCountTest.java | 44 ++
.../src/test/java/WordCountTest.java | 85 ++++
.../projects/basic/archetype.properties | 5 +
.../src/test/resources/projects/basic/goal.txt | 1 +
sdks/java/maven-archetypes/pom.xml | 41 ++
sdks/java/maven-archetypes/starter/pom.xml | 57 +++
.../META-INF/maven/archetype-metadata.xml | 21 +
.../main/resources/archetype-resources/pom.xml | 43 ++
.../src/main/java/StarterPipeline.java | 67 ++++
.../projects/basic/archetype.properties | 5 +
.../src/test/resources/projects/basic/goal.txt | 1 +
.../resources/projects/basic/reference/pom.xml | 43 ++
.../src/main/java/it/pkg/StarterPipeline.java | 67 ++++
51 files changed, 2215 insertions(+), 2215 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/examples/pom.xml
----------------------------------------------------------------------
diff --git a/maven-archetypes/examples/pom.xml b/maven-archetypes/examples/pom.xml
deleted file mode 100644
index 7e74b9d..0000000
--- a/maven-archetypes/examples/pom.xml
+++ /dev/null
@@ -1,56 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.beam</groupId>
- <artifactId>maven-archetypes-parent</artifactId>
- <version>0.1.0-incubating-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <artifactId>maven-archetypes-examples</artifactId>
- <name>Apache Beam :: Maven Archetypes :: Examples</name>
- <description>A Maven Archetype to create a project containing all the
- example pipelines from the Apache Beam Java SDK.</description>
-
- <packaging>maven-archetype</packaging>
-
- <build>
- <extensions>
- <extension>
- <groupId>org.apache.maven.archetype</groupId>
- <artifactId>archetype-packaging</artifactId>
- <version>2.4</version>
- </extension>
- </extensions>
-
- <pluginManagement>
- <plugins>
- <plugin>
- <artifactId>maven-archetype-plugin</artifactId>
- <version>2.4</version>
- </plugin>
- </plugins>
- </pluginManagement>
- </build>
-</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/examples/src/main/resources/META-INF/maven/archetype-metadata.xml
----------------------------------------------------------------------
diff --git a/maven-archetypes/examples/src/main/resources/META-INF/maven/archetype-metadata.xml b/maven-archetypes/examples/src/main/resources/META-INF/maven/archetype-metadata.xml
deleted file mode 100644
index 7742af4..0000000
--- a/maven-archetypes/examples/src/main/resources/META-INF/maven/archetype-metadata.xml
+++ /dev/null
@@ -1,29 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<archetype-descriptor
- xsi:schemaLocation="http://maven.apache.org/plugins/maven-archetype-plugin/archetype-descriptor/1.0.0 http://maven.apache.org/xsd/archetype-descriptor-1.0.0.xsd"
- name="Google Cloud Dataflow Example Pipelines Archetype"
- xmlns="http://maven.apache.org/plugins/maven-archetype-plugin/archetype-descriptor/1.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
-
- <requiredProperties>
- <requiredProperty key="targetPlatform">
- <defaultValue>1.7</defaultValue>
- </requiredProperty>
- </requiredProperties>
-
- <fileSets>
- <fileSet filtered="true" packaged="true" encoding="UTF-8">
- <directory>src/main/java</directory>
- <includes>
- <include>**/*.java</include>
- </includes>
- </fileSet>
-
- <fileSet filtered="true" packaged="true" encoding="UTF-8">
- <directory>src/test/java</directory>
- <includes>
- <include>**/*.java</include>
- </includes>
- </fileSet>
- </fileSets>
-</archetype-descriptor>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml
----------------------------------------------------------------------
diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml
deleted file mode 100644
index d19d0c6..0000000
--- a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml
+++ /dev/null
@@ -1,204 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ~ Copyright (C) 2015 Google Inc.
- ~
- ~ Licensed under the Apache License, Version 2.0 (the "License"); you may not
- ~ use this file except in compliance with the License. You may obtain a copy of
- ~ the License at
- ~
- ~ http://www.apache.org/licenses/LICENSE-2.0
- ~
- ~ Unless required by applicable law or agreed to in writing, software
- ~ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- ~ WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- ~ License for the specific language governing permissions and limitations under
- ~ the License.
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~-->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <groupId>${groupId}</groupId>
- <artifactId>${artifactId}</artifactId>
- <version>${version}</version>
-
- <packaging>jar</packaging>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-compiler-plugin</artifactId>
- <version>3.3</version>
- <configuration>
- <source>${targetPlatform}</source>
- <target>${targetPlatform}</target>
- </configuration>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-shade-plugin</artifactId>
- <version>2.3</version>
- <executions>
- <execution>
- <phase>package</phase>
- <goals>
- <goal>shade</goal>
- </goals>
- <configuration>
- <finalName>${project.artifactId}-bundled-${project.version}</finalName>
- <artifactSet>
- <includes>
- <include>*:*</include>
- </includes>
- </artifactSet>
- <filters>
- <filter>
- <artifact>*:*</artifact>
- <excludes>
- <exclude>META-INF/*.SF</exclude>
- <exclude>META-INF/*.DSA</exclude>
- <exclude>META-INF/*.RSA</exclude>
- </excludes>
- </filter>
- </filters>
- </configuration>
- </execution>
- </executions>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-surefire-plugin</artifactId>
- <version>2.18.1</version>
- <configuration>
- <parallel>all</parallel>
- <threadCount>4</threadCount>
- <redirectTestOutputToFile>true</redirectTestOutputToFile>
- </configuration>
- <dependencies>
- <dependency>
- <groupId>org.apache.maven.surefire</groupId>
- <artifactId>surefire-junit47</artifactId>
- <version>2.18.1</version>
- </dependency>
- </dependencies>
- </plugin>
- </plugins>
- </build>
-
- <dependencies>
- <!-- Adds a dependency on the Beam Java SDK; any version in the range below is accepted. -->
- <dependency>
- <groupId>org.apache.beam</groupId>
- <artifactId>java-sdk-all</artifactId>
- <version>[0-incubating, 2-incubating)</version>
- </dependency>
-
- <dependency>
- <groupId>com.google.api-client</groupId>
- <artifactId>google-api-client</artifactId>
- <version>1.21.0</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <!-- Dependencies below this line are specific dependencies needed by the examples code. -->
- <dependency>
- <groupId>com.google.apis</groupId>
- <artifactId>google-api-services-bigquery</artifactId>
- <version>v2-rev248-1.21.0</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.http-client</groupId>
- <artifactId>google-http-client</artifactId>
- <version>1.21.0</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>com.google.apis</groupId>
- <artifactId>google-api-services-pubsub</artifactId>
- <version>v1-rev7-1.21.0</version>
- <exclusions>
- <!-- Exclude an old version of guava that is being pulled
- in by a transitive dependency of google-api-client -->
- <exclusion>
- <groupId>com.google.guava</groupId>
- <artifactId>guava-jdk5</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>joda-time</groupId>
- <artifactId>joda-time</artifactId>
- <version>2.4</version>
- </dependency>
-
- <dependency>
- <groupId>com.google.guava</groupId>
- <artifactId>guava</artifactId>
- <version>18.0</version>
- </dependency>
-
- <dependency>
- <groupId>javax.servlet</groupId>
- <artifactId>javax.servlet-api</artifactId>
- <version>3.1.0</version>
- </dependency>
-
- <!-- Add slf4j API frontend binding with JUL backend -->
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- <version>1.7.7</version>
- </dependency>
-
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-jdk14</artifactId>
- <version>1.7.7</version>
- <!-- When loaded at runtime this will wire up slf4j to the JUL backend -->
- <scope>runtime</scope>
- </dependency>
-
- <!-- Hamcrest and JUnit are required dependencies of DataflowAssert,
- which is used in the main code of DebuggingWordCount example. -->
- <dependency>
- <groupId>org.hamcrest</groupId>
- <artifactId>hamcrest-all</artifactId>
- <version>1.3</version>
- </dependency>
-
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>4.11</version>
- </dependency>
- </dependencies>
-</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java
----------------------------------------------------------------------
diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java
deleted file mode 100644
index 3cf2bc0..0000000
--- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package ${package};
-
-import ${package}.WordCount.WordCountOptions;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.Sum;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.Arrays;
-import java.util.List;
-import java.util.regex.Pattern;
-
-
-/**
- * An example that verifies word counts in Shakespeare and includes Dataflow best practices.
- *
- * <p>This class, {@link DebuggingWordCount}, is the third in a series of four successively more
- * detailed 'word count' examples. You may first want to take a look at {@link MinimalWordCount}
- * and {@link WordCount}. After you've looked at this example, then see the
- * {@link WindowedWordCount} pipeline, for introduction of additional concepts.
- *
- * <p>Basic concepts, also in the MinimalWordCount and WordCount examples:
- * Reading text files; counting a PCollection; executing a Pipeline both locally
- * and using the Dataflow service; defining DoFns.
- *
- * <p>New Concepts:
- * <pre>
- * 1. Logging to Cloud Logging
- * 2. Controlling Dataflow worker log levels
- * 3. Creating a custom aggregator
- * 4. Testing your Pipeline via DataflowAssert
- * </pre>
- *
- * <p>To execute this pipeline locally, specify general pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * }
- * </pre>
- *
- * <p>To execute this pipeline using the Dataflow service and the additional logging discussed
- * below, specify pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowPipelineRunner
- * --workerLogLevelOverrides={"com.google.cloud.dataflow.examples":"DEBUG"}
- * }
- * </pre>
- *
- * <p>Note that when you run via <code>mvn exec</code>, you may need to escape
- * the quotations as appropriate for your shell. For example, in <code>bash</code>:
- * <pre>
- * mvn compile exec:java ... \
- * -Dexec.args="... \
- * --workerLogLevelOverrides={\\\"com.google.cloud.dataflow.examples\\\":\\\"DEBUG\\\"}"
- * </pre>
- *
- * <p>Concept #2: Dataflow workers which execute user code are configured to log to Cloud
- * Logging by default at "INFO" log level and higher. One may override log levels for specific
- * logging namespaces by specifying:
- * <pre><code>
- * --workerLogLevelOverrides={"Name1":"Level1","Name2":"Level2",...}
- * </code></pre>
- * For example, by specifying:
- * <pre><code>
- * --workerLogLevelOverrides={"com.google.cloud.dataflow.examples":"DEBUG"}
- * </code></pre>
- * when executing this pipeline using the Dataflow service, Cloud Logging would contain only
- * "DEBUG" or higher level logs for the {@code com.google.cloud.dataflow.examples} package in
- * addition to the default "INFO" or higher level logs. In addition, the default Dataflow worker
- * logging configuration can be overridden by specifying
- * {@code --defaultWorkerLogLevel=<one of TRACE, DEBUG, INFO, WARN, ERROR>}. For example,
- * by specifying {@code --defaultWorkerLogLevel=DEBUG} when executing this pipeline with
- * the Dataflow service, Cloud Logging would contain all "DEBUG" or higher level logs. Note
- * that changing the default worker log level to TRACE or DEBUG will significantly increase
- * the amount of logs output.
- *
- * <p>The input file defaults to {@code gs://dataflow-samples/shakespeare/kinglear.txt} and can be
- * overridden with {@code --inputFile}.
- */
-public class DebuggingWordCount {
-  /** A DoFn that filters for a specific key based upon a regular expression. */
-  public static class FilterTextFn extends DoFn<KV<String, Long>, KV<String, Long>> {
-    /**
-     * Concept #1: The logger below uses the fully qualified class name of FilterTextFn
-     * as the logger. All log statements emitted by this logger will be referenced by this name
-     * and will be visible in the Cloud Logging UI. Learn more at https://cloud.google.com/logging
-     * about the Cloud Logging UI.
-     */
-    private static final Logger LOG = LoggerFactory.getLogger(FilterTextFn.class);
-
-    /** Compiled form of the pattern given to the constructor; keys must match it entirely. */
-    private final Pattern filter;
-
-    /**
-     * @param pattern regular expression that an element's key must fully match to be output
-     */
-    public FilterTextFn(String pattern) {
-      filter = Pattern.compile(pattern);
-    }
-
-    /**
-     * Concept #3: A custom aggregator can track values in your pipeline as it runs. Those
-     * values will be displayed in the Dataflow Monitoring UI when this pipeline is run using the
-     * Dataflow service. These aggregators below track the number of matched and unmatched words.
-     * Learn more at https://cloud.google.com/dataflow/pipelines/dataflow-monitoring-intf about
-     * the Dataflow Monitoring UI.
-     */
-    private final Aggregator<Long, Long> matchedWords =
-        createAggregator("matchedWords", new Sum.SumLongFn());
-    private final Aggregator<Long, Long> unmatchedWords =
-        createAggregator("unmatchedWords", new Sum.SumLongFn());
-
-    /**
-     * Outputs the element unchanged when its key matches the filter, and counts the element
-     * as matched or unmatched either way.
-     */
-    @Override
-    public void processElement(ProcessContext c) {
-      if (filter.matcher(c.element().getKey()).matches()) {
-        // Log at the "DEBUG" level each element that we match. When executing this pipeline
-        // using the Dataflow service, these log lines will appear in the Cloud Logging UI
-        // only if the log level is set to "DEBUG" or lower.
-        // The {} placeholder defers message formatting until the level is known to be enabled.
-        LOG.debug("Matched: {}", c.element().getKey());
-        matchedWords.addValue(1L);
-        c.output(c.element());
-      } else {
-        // Log at the "TRACE" level each element that is not matched. Different log levels
-        // can be used to control the verbosity of logging providing an effective mechanism
-        // to filter less important information.
-        LOG.trace("Did not match: {}", c.element().getKey());
-        unmatchedWords.addValue(1L);
-      }
-    }
-  }
-
-  /**
-   * Builds and runs the pipeline: reads the input file, counts words, keeps only the
-   * words matched by {@link FilterTextFn}, and asserts on the resulting counts.
-   */
-  public static void main(String[] args) {
-    WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
-        .as(WordCountOptions.class);
-    Pipeline p = Pipeline.create(options);
-
-    PCollection<KV<String, Long>> filteredWords =
-        p.apply(TextIO.Read.named("ReadLines").from(options.getInputFile()))
-         .apply(new WordCount.CountWords())
-         .apply(ParDo.of(new FilterTextFn("Flourish|stomach")));
-
-    /*
-     * Concept #4: DataflowAssert is a set of convenient PTransforms in the style of
-     * Hamcrest's collection matchers that can be used when writing Pipeline level tests
-     * to validate the contents of PCollections. DataflowAssert is best used in unit tests
-     * with small data sets but is demonstrated here as a teaching tool.
-     *
-     * Below we verify that the set of filtered words matches our expected counts. Note
-     * that DataflowAssert does not provide any output and that successful completion of the
-     * Pipeline implies that the expectations were met. Learn more at
-     * https://cloud.google.com/dataflow/pipelines/testing-your-pipeline on how to test
-     * your Pipeline and see DebuggingWordCountTest for an example unit test.
-     */
-    List<KV<String, Long>> expectedResults = Arrays.asList(
-        KV.of("Flourish", 3L),
-        KV.of("stomach", 1L));
-    DataflowAssert.that(filteredWords).containsInAnyOrder(expectedResults);
-
-    p.run();
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java
----------------------------------------------------------------------
diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java
deleted file mode 100644
index 035db01..0000000
--- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package ${package};
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.runners.BlockingDataflowPipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.Count;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.values.KV;
-
-
-/**
- * An example that counts words in Shakespeare.
- *
- * <p>This class, {@link MinimalWordCount}, is the first in a series of four successively more
- * detailed 'word count' examples. Here, for simplicity, we don't show any error-checking or
- * argument processing, and focus on construction of the pipeline, which chains together the
- * application of core transforms.
- *
- * <p>Next, see the {@link WordCount} pipeline, then the {@link DebuggingWordCount}, and finally
- * the {@link WindowedWordCount} pipeline, for more detailed examples that introduce additional
- * concepts.
- *
- * <p>Concepts:
- * <pre>
- * 1. Reading data from text files
- * 2. Specifying 'inline' transforms
- * 3. Counting a PCollection
- * 4. Writing data to Cloud Storage as text files
- * </pre>
- *
- * <p>To execute this pipeline, first edit the code to set your project ID, the staging
- * location, and the output location. The specified GCS bucket(s) must already exist.
- *
- * <p>Then, run the pipeline as described in the README. It will be deployed and run using the
- * Dataflow service. No args are required to run the pipeline. You can see the results in your
- * output bucket in the GCS browser.
- */
-public class MinimalWordCount {
-
- public static void main(String[] args) {
- // Create a DataflowPipelineOptions object. This object lets us set various execution
- // options for our pipeline, such as the associated Cloud Platform project and the location
- // in Google Cloud Storage to stage files.
- DataflowPipelineOptions options = PipelineOptionsFactory.create()
- .as(DataflowPipelineOptions.class);
- options.setRunner(BlockingDataflowPipelineRunner.class);
- // CHANGE 1/3: Your project ID is required in order to run your pipeline on the Google Cloud.
- options.setProject("SET_YOUR_PROJECT_ID_HERE");
- // CHANGE 2/3: Your Google Cloud Storage path is required for staging local files.
- options.setStagingLocation("gs://SET_YOUR_BUCKET_NAME_HERE/AND_STAGING_DIRECTORY");
-
- // Create the Pipeline object with the options we defined above.
- Pipeline p = Pipeline.create(options);
-
- // Apply the pipeline's transforms.
-
- // Concept #1: Apply a root transform to the pipeline; in this case, TextIO.Read to read a set
- // of input text files. TextIO.Read returns a PCollection where each element is one line from
- // the input text (a set of Shakespeare's texts).
- p.apply(TextIO.Read.from("gs://dataflow-samples/shakespeare/*"))
- // Concept #2: Apply a ParDo transform to our PCollection of text lines. This ParDo invokes a
- // DoFn (defined in-line) on each element that tokenizes the text line into individual words.
- // The ParDo returns a PCollection<String>, where each element is an individual word in
- // Shakespeare's collected texts.
- .apply(ParDo.named("ExtractWords").of(new DoFn<String, String>() {
- @Override
- public void processElement(ProcessContext c) {
- for (String word : c.element().split("[^a-zA-Z']+")) {
- if (!word.isEmpty()) {
- c.output(word);
- }
- }
- }
- }))
- // Concept #3: Apply the Count transform to our PCollection of individual words. The Count
- // transform returns a new PCollection of key/value pairs, where each key represents a unique
- // word in the text. The associated value is the occurrence count for that word.
- .apply(Count.<String>perElement())
- // Apply another ParDo transform that formats our PCollection of word counts into a printable
- // string, suitable for writing to an output file.
- .apply(ParDo.named("FormatResults").of(new DoFn<KV<String, Long>, String>() {
- @Override
- public void processElement(ProcessContext c) {
- c.output(c.element().getKey() + ": " + c.element().getValue());
- }
- }))
- // Concept #4: Apply a write transform, TextIO.Write, at the end of the pipeline.
- // TextIO.Write writes the contents of a PCollection (in this case, our PCollection of
- // formatted strings) to a series of text files in Google Cloud Storage.
- // CHANGE 3/3: The Google Cloud Storage path is required for outputting the results to.
- .apply(TextIO.Write.to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX"));
-
- // Run the pipeline.
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java
----------------------------------------------------------------------
diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java
deleted file mode 100644
index 29921e2..0000000
--- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package ${package};
-
-import com.google.api.services.bigquery.model.TableFieldSchema;
-import com.google.api.services.bigquery.model.TableReference;
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.api.services.bigquery.model.TableSchema;
-import ${package}.common.DataflowExampleUtils;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.PipelineResult;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.io.PubsubIO;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-
-/**
- * An example that counts words in text, and can run over either unbounded or bounded input
- * collections.
- *
- * <p>This class, {@link WindowedWordCount}, is the last in a series of four successively more
- * detailed 'word count' examples. First take a look at {@link MinimalWordCount},
- * {@link WordCount}, and {@link DebuggingWordCount}.
- *
- * <p>Basic concepts, also in the MinimalWordCount, WordCount, and DebuggingWordCount examples:
- * Reading text files; counting a PCollection; writing to GCS; executing a Pipeline both locally
- * and using the Dataflow service; defining DoFns; creating a custom aggregator;
- * user-defined PTransforms; defining PipelineOptions.
- *
- * <p>New Concepts:
- * <pre>
- * 1. Unbounded and bounded pipeline input modes
- * 2. Adding timestamps to data
- * 3. PubSub topics as sources
- * 4. Windowing
- * 5. Re-using PTransforms over windowed PCollections
- * 6. Writing to BigQuery
- * </pre>
- *
- * <p>To execute this pipeline locally, specify general pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * }
- * </pre>
- *
- * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowPipelineRunner
- * }
- * </pre>
- *
- * <p>Optionally specify the input file path via:
- * {@code --inputFile=gs://INPUT_PATH},
- * which defaults to {@code gs://dataflow-samples/shakespeare/kinglear.txt}.
- *
- * <p>Specify an output BigQuery dataset and optionally, a table for the output. If you don't
- * specify the table, one will be created for you using the job name. If you don't specify the
- * dataset, a dataset called {@code dataflow-examples} must already exist in your project.
- * {@code --bigQueryDataset=YOUR-DATASET --bigQueryTable=YOUR-NEW-TABLE-NAME}.
- *
- * <p>Decide whether you want your pipeline to run with 'bounded' (such as files in GCS) or
- * 'unbounded' input (such as a PubSub topic). To run with unbounded input, set
- * {@code --unbounded=true}. Then, optionally specify the Google Cloud PubSub topic to read from
- * via {@code --pubsubTopic=projects/PROJECT_ID/topics/YOUR_TOPIC_NAME}. If the topic does not
- * exist, the pipeline will create one for you. It will delete this topic when it terminates.
- * The pipeline will automatically launch an auxiliary batch pipeline to populate the given PubSub
- * topic with the contents of the {@code --inputFile}, in order to make the example easy to run.
- * If you want to use an independently-populated PubSub topic, indicate this by setting
- * {@code --inputFile=""}. In that case, the auxiliary pipeline will not be started.
- *
- * <p>By default, the pipeline will do fixed windowing, on 1-minute windows. You can
- * change this interval by setting the {@code --windowSize} parameter, e.g. {@code --windowSize=10}
- * for 10-minute windows.
- */
-public class WindowedWordCount {
- private static final Logger LOG = LoggerFactory.getLogger(WindowedWordCount.class);
- static final int WINDOW_SIZE = 1; // Default window duration in minutes
-
- /**
- * Concept #2: A DoFn that sets the data element timestamp. This is a silly method, just for
- * this example, for the bounded data case.
- *
- * <p>Imagine that many ghosts of Shakespeare are all typing madly at the same time to recreate
- * his masterworks. Each line of the corpus will get a random associated timestamp somewhere in a
- * 2-hour period.
- */
- static class AddTimestampFn extends DoFn<String, String> {
- private static final long RAND_RANGE = 7200000; // 2 hours in ms
-
- @Override
- public void processElement(ProcessContext c) {
- // Generate a timestamp that falls somewhere in the past two hours.
- long randomTimestamp = System.currentTimeMillis()
- - (int) (Math.random() * RAND_RANGE);
- /**
- * Concept #2: Set the data element with that timestamp.
- */
- c.outputWithTimestamp(c.element(), new Instant(randomTimestamp));
- }
- }
-
- /** A DoFn that converts a Word and Count into a BigQuery table row. */
- static class FormatAsTableRowFn extends DoFn<KV<String, Long>, TableRow> {
- @Override
- public void processElement(ProcessContext c) {
- TableRow row = new TableRow()
- .set("word", c.element().getKey())
- .set("count", c.element().getValue())
- // include a field for the window timestamp
- .set("window_timestamp", c.timestamp().toString());
- c.output(row);
- }
- }
-
- /**
- * Helper method that defines the BigQuery schema used for the output.
- */
- private static TableSchema getSchema() {
- List<TableFieldSchema> fields = new ArrayList<>();
- fields.add(new TableFieldSchema().setName("word").setType("STRING"));
- fields.add(new TableFieldSchema().setName("count").setType("INTEGER"));
- fields.add(new TableFieldSchema().setName("window_timestamp").setType("TIMESTAMP"));
- TableSchema schema = new TableSchema().setFields(fields);
- return schema;
- }
-
- /**
- * Concept #6: We'll stream the results to a BigQuery table. The BigQuery output source is one
- * that supports both bounded and unbounded data. This is a helper method that creates a
- * TableReference from input options, to tell the pipeline where to write its BigQuery results.
- */
- private static TableReference getTableReference(Options options) {
- TableReference tableRef = new TableReference();
- tableRef.setProjectId(options.getProject());
- tableRef.setDatasetId(options.getBigQueryDataset());
- tableRef.setTableId(options.getBigQueryTable());
- return tableRef;
- }
-
- /**
- * Options supported by {@link WindowedWordCount}.
- *
- * <p>Inherits standard example configuration options, which allow specification of the BigQuery
- * table and the PubSub topic, as well as the {@link WordCount.WordCountOptions} support for
- * specification of the input file.
- */
- public static interface Options
- extends WordCount.WordCountOptions, DataflowExampleUtils.DataflowExampleUtilsOptions {
- @Description("Fixed window duration, in minutes")
- @Default.Integer(WINDOW_SIZE)
- Integer getWindowSize();
- void setWindowSize(Integer value);
-
- @Description("Whether to run the pipeline with unbounded input")
- boolean isUnbounded();
- void setUnbounded(boolean value);
- }
-
- public static void main(String[] args) throws IOException {
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
- options.setBigQuerySchema(getSchema());
- // DataflowExampleUtils creates the necessary input sources to simplify execution of this
- // Pipeline.
- DataflowExampleUtils exampleDataflowUtils = new DataflowExampleUtils(options,
- options.isUnbounded());
-
- Pipeline pipeline = Pipeline.create(options);
-
- /**
- * Concept #1: the Dataflow SDK lets us run the same pipeline with either a bounded or
- * unbounded input source.
- */
- PCollection<String> input;
- if (options.isUnbounded()) {
- LOG.info("Reading from PubSub.");
- /**
- * Concept #3: Read from the PubSub topic. A topic will be created if it wasn't
- * specified as an argument. The data elements' timestamps will come from the pubsub
- * injection.
- */
- input = pipeline
- .apply(PubsubIO.Read.topic(options.getPubsubTopic()));
- } else {
- /** Else, this is a bounded pipeline. Read from the GCS file. */
- input = pipeline
- .apply(TextIO.Read.from(options.getInputFile()))
- // Concept #2: Add an element timestamp, using an artificial time just to show windowing.
- // See AddTimestampFn for more detail on this.
- .apply(ParDo.of(new AddTimestampFn()));
- }
-
- /**
- * Concept #4: Window into fixed windows. The fixed window size for this example defaults to 1
- * minute (you can change this with a command-line option). See the documentation for more
- * information on how fixed windows work, and for information on the other types of windowing
- * available (e.g., sliding windows).
- */
- PCollection<String> windowedWords = input
- .apply(Window.<String>into(
- FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))));
-
- /**
- * Concept #5: Re-use our existing CountWords transform that does not have knowledge of
- * windows over a PCollection containing windowed values.
- */
- PCollection<KV<String, Long>> wordCounts = windowedWords.apply(new WordCount.CountWords());
-
- /**
- * Concept #6: Format the results for a BigQuery table, then write to BigQuery.
- * The BigQuery output source supports both bounded and unbounded data.
- */
- wordCounts.apply(ParDo.of(new FormatAsTableRowFn()))
- .apply(BigQueryIO.Write.to(getTableReference(options)).withSchema(getSchema()));
-
- PipelineResult result = pipeline.run();
-
- /**
- * To mock unbounded input from PubSub, we'll now start an auxiliary 'injector' pipeline that
- * runs for a limited time, and publishes to the input PubSub topic.
- *
- * With an unbounded input source, you will need to explicitly shut down this pipeline when you
- * are done with it, so that you do not continue to be charged for the instances. You can do
- * this via a ctrl-C from the command line, or from the developer's console UI for Dataflow
- * pipelines. The PubSub topic will also be deleted at this time.
- */
- exampleDataflowUtils.mockUnboundedSource(options.getInputFile(), result);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java
----------------------------------------------------------------------
diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java
deleted file mode 100644
index 150b60d..0000000
--- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package ${package};
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.Count;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.Sum;
-import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-
-/**
- * An example that counts words in Shakespeare and includes Dataflow best practices.
- *
- * <p>This class, {@link WordCount}, is the second in a series of four successively more detailed
- * 'word count' examples. You may first want to take a look at {@link MinimalWordCount}.
- * After you've looked at this example, then see the {@link DebuggingWordCount}
- * pipeline, for introduction of additional concepts.
- *
- * <p>For a detailed walkthrough of this example, see
- * <a href="https://cloud.google.com/dataflow/java-sdk/wordcount-example">
- * https://cloud.google.com/dataflow/java-sdk/wordcount-example
- * </a>
- *
- * <p>Basic concepts, also in the MinimalWordCount example:
- * Reading text files; counting a PCollection; writing to GCS.
- *
- * <p>New Concepts:
- * <pre>
- * 1. Executing a Pipeline both locally and using the Dataflow service
- * 2. Using ParDo with static DoFns defined out-of-line
- * 3. Building a composite transform
- * 4. Defining your own pipeline options
- * </pre>
- *
- * <p>Concept #1: you can execute this pipeline either locally or using the Dataflow service.
- * These are now command-line options and not hard-coded as they were in the MinimalWordCount
- * example.
- * To execute this pipeline locally, specify general pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * }
- * </pre>
- * and a local output file or output prefix on GCS:
- * <pre>{@code
- * --output=[YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PREFIX]
- * }</pre>
- *
- * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowPipelineRunner
- * }
- * </pre>
- * and an output prefix on GCS:
- * <pre>{@code
- * --output=gs://YOUR_OUTPUT_PREFIX
- * }</pre>
- *
- * <p>The input file defaults to {@code gs://dataflow-samples/shakespeare/kinglear.txt} and can be
- * overridden with {@code --inputFile}.
- */
-public class WordCount {
-
- /**
- * Concept #2: You can make your pipeline code less verbose by defining your DoFns statically out-
- * of-line. This DoFn tokenizes lines of text into individual words; we pass it to a ParDo in the
- * pipeline.
- */
- static class ExtractWordsFn extends DoFn<String, String> {
- private final Aggregator<Long, Long> emptyLines =
- createAggregator("emptyLines", new Sum.SumLongFn());
-
- @Override
- public void processElement(ProcessContext c) {
- if (c.element().trim().isEmpty()) {
- emptyLines.addValue(1L);
- }
-
- // Split the line into words.
- String[] words = c.element().split("[^a-zA-Z']+");
-
- // Output each word encountered into the output PCollection.
- for (String word : words) {
- if (!word.isEmpty()) {
- c.output(word);
- }
- }
- }
- }
-
- /** A DoFn that converts a Word and Count into a printable string. */
- public static class FormatAsTextFn extends DoFn<KV<String, Long>, String> {
- @Override
- public void processElement(ProcessContext c) {
- c.output(c.element().getKey() + ": " + c.element().getValue());
- }
- }
-
- /**
- * A PTransform that converts a PCollection containing lines of text into a PCollection of
- * formatted word counts.
- *
- * <p>Concept #3: This is a custom composite transform that bundles two transforms (ParDo and
- * Count) as a reusable PTransform subclass. Using composite transforms allows for easy reuse,
- * modular testing, and an improved monitoring experience.
- */
- public static class CountWords extends PTransform<PCollection<String>,
- PCollection<KV<String, Long>>> {
- @Override
- public PCollection<KV<String, Long>> apply(PCollection<String> lines) {
-
- // Convert lines of text into individual words.
- PCollection<String> words = lines.apply(
- ParDo.of(new ExtractWordsFn()));
-
- // Count the number of times each word occurs.
- PCollection<KV<String, Long>> wordCounts =
- words.apply(Count.<String>perElement());
-
- return wordCounts;
- }
- }
-
- /**
- * Options supported by {@link WordCount}.
- *
- * <p>Concept #4: Defining your own configuration options. Here, you can add your own arguments
- * to be processed by the command-line parser, and specify default values for them. You can then
- * access the options values in your pipeline code.
- *
- * <p>Inherits standard configuration options.
- */
- public static interface WordCountOptions extends PipelineOptions {
- @Description("Path of the file to read from")
- @Default.String("gs://dataflow-samples/shakespeare/kinglear.txt")
- String getInputFile();
- void setInputFile(String value);
-
- @Description("Path of the file to write to")
- @Default.InstanceFactory(OutputFactory.class)
- String getOutput();
- void setOutput(String value);
-
- /**
- * Returns "gs://${YOUR_STAGING_DIRECTORY}/counts.txt" as the default destination.
- */
- public static class OutputFactory implements DefaultValueFactory<String> {
- @Override
- public String create(PipelineOptions options) {
- DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
- if (dataflowOptions.getStagingLocation() != null) {
- return GcsPath.fromUri(dataflowOptions.getStagingLocation())
- .resolve("counts.txt").toString();
- } else {
- throw new IllegalArgumentException("Must specify --output or --stagingLocation");
- }
- }
- }
-
- }
-
- public static void main(String[] args) {
- WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
- .as(WordCountOptions.class);
- Pipeline p = Pipeline.create(options);
-
- // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
- // static FormatAsTextFn() to the ParDo transform.
- p.apply(TextIO.Read.named("ReadLines").from(options.getInputFile()))
- .apply(new CountWords())
- .apply(ParDo.of(new FormatAsTextFn()))
- .apply(TextIO.Write.named("WriteCounts").to(options.getOutput()));
-
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/DataflowExampleOptions.java
----------------------------------------------------------------------
diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/DataflowExampleOptions.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/DataflowExampleOptions.java
deleted file mode 100644
index e182f4c..0000000
--- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/DataflowExampleOptions.java
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package ${package}.common;
-
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-
-/**
- * Options that can be used to configure the Dataflow examples.
- */
-public interface DataflowExampleOptions extends DataflowPipelineOptions {
- @Description("Whether to keep jobs running on the Dataflow service after local process exit")
- @Default.Boolean(false)
- boolean getKeepJobsRunning();
- void setKeepJobsRunning(boolean keepJobsRunning);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/DataflowExampleUtils.java
----------------------------------------------------------------------
diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/DataflowExampleUtils.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/DataflowExampleUtils.java
deleted file mode 100644
index 9861769..0000000
--- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/DataflowExampleUtils.java
+++ /dev/null
@@ -1,398 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package ${package}.common;
-
-import com.google.api.client.googleapis.json.GoogleJsonResponseException;
-import com.google.api.client.googleapis.services.AbstractGoogleClientRequest;
-import com.google.api.services.bigquery.Bigquery;
-import com.google.api.services.bigquery.Bigquery.Datasets;
-import com.google.api.services.bigquery.Bigquery.Tables;
-import com.google.api.services.bigquery.model.Dataset;
-import com.google.api.services.bigquery.model.DatasetReference;
-import com.google.api.services.bigquery.model.Table;
-import com.google.api.services.bigquery.model.TableReference;
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.api.services.dataflow.Dataflow;
-import com.google.api.services.pubsub.Pubsub;
-import com.google.api.services.pubsub.model.Topic;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.PipelineResult;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.BigQueryOptions;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineJob;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
-import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.IntraBundleParallelization;
-import com.google.cloud.dataflow.sdk.util.MonitoringUtil;
-import com.google.cloud.dataflow.sdk.util.Transport;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-
-import java.io.IOException;
-import java.util.Collection;
-import java.util.List;
-import java.util.Set;
-import java.util.concurrent.TimeUnit;
-
-import javax.servlet.http.HttpServletResponse;
-
-/**
- * The utility class that sets up and tears down external resources, starts the Google Cloud Pub/Sub
- * injector, and cancels the streaming and the injector pipelines once the program terminates.
- *
- * <p>It is used to run Dataflow examples, such as TrafficMaxLaneFlow and TrafficRoutes.
- */
-public class DataflowExampleUtils {
-
- private final DataflowPipelineOptions options;
- private Bigquery bigQueryClient = null;
- private Pubsub pubsubClient = null;
- private Dataflow dataflowClient = null;
- private Set<DataflowPipelineJob> jobsToCancel = Sets.newHashSet();
- private List<String> pendingMessages = Lists.newArrayList();
-
- /**
- * Define an interface that supports the PubSub and BigQuery example options.
- */
- public static interface DataflowExampleUtilsOptions
- extends DataflowExampleOptions, ExamplePubsubTopicOptions, ExampleBigQueryTableOptions {
- }
-
- public DataflowExampleUtils(DataflowPipelineOptions options) {
- this.options = options;
- }
-
- /**
- * Do resources and runner options setup.
- */
- public DataflowExampleUtils(DataflowPipelineOptions options, boolean isUnbounded)
- throws IOException {
- this.options = options;
- setupResourcesAndRunner(isUnbounded);
- }
-
- /**
- * Sets up external resources that are required by the example,
- * such as Pub/Sub topics and BigQuery tables.
- *
- * @throws IOException if there is a problem setting up the resources
- */
- public void setup() throws IOException {
- setupPubsubTopic();
- setupBigQueryTable();
- }
-
- /**
- * Set up external resources, and configure the runner appropriately.
- */
- public void setupResourcesAndRunner(boolean isUnbounded) throws IOException {
- if (isUnbounded) {
- options.setStreaming(true);
- }
- setup();
- setupRunner();
- }
-
- /**
- * Sets up the Google Cloud Pub/Sub topic.
- *
- * <p>If the topic doesn't exist, a new topic with the given name will be created.
- *
- * @throws IOException if there is a problem setting up the Pub/Sub topic
- */
- public void setupPubsubTopic() throws IOException {
- ExamplePubsubTopicOptions pubsubTopicOptions = options.as(ExamplePubsubTopicOptions.class);
- if (!pubsubTopicOptions.getPubsubTopic().isEmpty()) {
- pendingMessages.add("*******************Set Up Pubsub Topic*********************");
- setupPubsubTopic(pubsubTopicOptions.getPubsubTopic());
- pendingMessages.add("The Pub/Sub topic has been set up for this example: "
- + pubsubTopicOptions.getPubsubTopic());
- }
- }
-
- /**
- * Sets up the BigQuery table with the given schema.
- *
- * <p>If the table already exists, the schema has to match the given one. Otherwise, the example
- * will throw a RuntimeException. If the table doesn't exist, a new table with the given schema
- * will be created.
- *
- * @throws IOException if there is a problem setting up the BigQuery table
- */
- public void setupBigQueryTable() throws IOException {
- ExampleBigQueryTableOptions bigQueryTableOptions =
- options.as(ExampleBigQueryTableOptions.class);
- if (bigQueryTableOptions.getBigQueryDataset() != null
- && bigQueryTableOptions.getBigQueryTable() != null
- && bigQueryTableOptions.getBigQuerySchema() != null) {
- pendingMessages.add("******************Set Up Big Query Table*******************");
- setupBigQueryTable(bigQueryTableOptions.getProject(),
- bigQueryTableOptions.getBigQueryDataset(),
- bigQueryTableOptions.getBigQueryTable(),
- bigQueryTableOptions.getBigQuerySchema());
- pendingMessages.add("The BigQuery table has been set up for this example: "
- + bigQueryTableOptions.getProject()
- + ":" + bigQueryTableOptions.getBigQueryDataset()
- + "." + bigQueryTableOptions.getBigQueryTable());
- }
- }
-
- /**
- * Tears down external resources that can be deleted upon the example's completion.
- */
- private void tearDown() {
- pendingMessages.add("*************************Tear Down*************************");
- ExamplePubsubTopicOptions pubsubTopicOptions = options.as(ExamplePubsubTopicOptions.class);
- if (!pubsubTopicOptions.getPubsubTopic().isEmpty()) {
- try {
- deletePubsubTopic(pubsubTopicOptions.getPubsubTopic());
- pendingMessages.add("The Pub/Sub topic has been deleted: "
- + pubsubTopicOptions.getPubsubTopic());
- } catch (IOException e) {
- pendingMessages.add("Failed to delete the Pub/Sub topic : "
- + pubsubTopicOptions.getPubsubTopic());
- }
- }
-
- ExampleBigQueryTableOptions bigQueryTableOptions =
- options.as(ExampleBigQueryTableOptions.class);
- if (bigQueryTableOptions.getBigQueryDataset() != null
- && bigQueryTableOptions.getBigQueryTable() != null
- && bigQueryTableOptions.getBigQuerySchema() != null) {
- pendingMessages.add("The BigQuery table might contain the example's output, "
- + "and it is not deleted automatically: "
- + bigQueryTableOptions.getProject()
- + ":" + bigQueryTableOptions.getBigQueryDataset()
- + "." + bigQueryTableOptions.getBigQueryTable());
- pendingMessages.add("Please go to the Developers Console to delete it manually."
- + " Otherwise, you may be charged for its usage.");
- }
- }
-
- private void setupBigQueryTable(String projectId, String datasetId, String tableId,
- TableSchema schema) throws IOException {
- if (bigQueryClient == null) {
- bigQueryClient = Transport.newBigQueryClient(options.as(BigQueryOptions.class)).build();
- }
-
- Datasets datasetService = bigQueryClient.datasets();
- if (executeNullIfNotFound(datasetService.get(projectId, datasetId)) == null) {
- Dataset newDataset = new Dataset().setDatasetReference(
- new DatasetReference().setProjectId(projectId).setDatasetId(datasetId));
- datasetService.insert(projectId, newDataset).execute();
- }
-
- Tables tableService = bigQueryClient.tables();
- Table table = executeNullIfNotFound(tableService.get(projectId, datasetId, tableId));
- if (table == null) {
- Table newTable = new Table().setSchema(schema).setTableReference(
- new TableReference().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId));
- tableService.insert(projectId, datasetId, newTable).execute();
- } else if (!table.getSchema().equals(schema)) {
- throw new RuntimeException(
- "Table exists and schemas do not match, expecting: " + schema.toPrettyString()
- + ", actual: " + table.getSchema().toPrettyString());
- }
- }
-
- private void setupPubsubTopic(String topic) throws IOException {
- if (pubsubClient == null) {
- pubsubClient = Transport.newPubsubClient(options).build();
- }
- if (executeNullIfNotFound(pubsubClient.projects().topics().get(topic)) == null) {
- pubsubClient.projects().topics().create(topic, new Topic().setName(topic)).execute();
- }
- }
-
- /**
- * Deletes the Google Cloud Pub/Sub topic.
- *
- * @throws IOException if there is a problem deleting the Pub/Sub topic
- */
- private void deletePubsubTopic(String topic) throws IOException {
- if (pubsubClient == null) {
- pubsubClient = Transport.newPubsubClient(options).build();
- }
- if (executeNullIfNotFound(pubsubClient.projects().topics().get(topic)) != null) {
- pubsubClient.projects().topics().delete(topic).execute();
- }
- }
-
- /**
- * If this is an unbounded (streaming) pipeline, and both inputFile and pubsub topic are defined,
- * start an 'injector' pipeline that publishes the contents of the file to the given topic, first
- * creating the topic if necessary.
- */
- public void startInjectorIfNeeded(String inputFile) {
- ExamplePubsubTopicOptions pubsubTopicOptions = options.as(ExamplePubsubTopicOptions.class);
- if (pubsubTopicOptions.isStreaming()
- && inputFile != null && !inputFile.isEmpty()
- && pubsubTopicOptions.getPubsubTopic() != null
- && !pubsubTopicOptions.getPubsubTopic().isEmpty()) {
- runInjectorPipeline(inputFile, pubsubTopicOptions.getPubsubTopic());
- }
- }
-
- /**
- * Do some runner setup: check that the DirectPipelineRunner is not used in conjunction with
- * streaming, and if streaming is specified, use the DataflowPipelineRunner. Return the streaming
- * flag value.
- */
- public void setupRunner() {
- if (options.isStreaming()) {
- if (options.getRunner() == DirectPipelineRunner.class) {
- throw new IllegalArgumentException(
- "Processing of unbounded input sources is not supported with the DirectPipelineRunner.");
- }
- // In order to cancel the pipelines automatically,
- // {@literal DataflowPipelineRunner} is forced to be used.
- options.setRunner(DataflowPipelineRunner.class);
- }
- }
-
- /**
- * Runs the batch injector for the streaming pipeline.
- *
- * <p>The injector pipeline will read from the given text file, and inject data
- * into the Google Cloud Pub/Sub topic.
- */
- public void runInjectorPipeline(String inputFile, String topic) {
- DataflowPipelineOptions copiedOptions = options.cloneAs(DataflowPipelineOptions.class);
- copiedOptions.setStreaming(false);
- copiedOptions.setNumWorkers(
- options.as(ExamplePubsubTopicOptions.class).getInjectorNumWorkers());
- copiedOptions.setJobName(options.getJobName() + "-injector");
- Pipeline injectorPipeline = Pipeline.create(copiedOptions);
- injectorPipeline.apply(TextIO.Read.from(inputFile))
- .apply(IntraBundleParallelization
- .of(PubsubFileInjector.publish(topic))
- .withMaxParallelism(20));
- DataflowPipelineJob injectorJob = (DataflowPipelineJob) injectorPipeline.run();
- jobsToCancel.add(injectorJob);
- }
-
- /**
- * Runs the provided injector pipeline for the streaming pipeline.
- */
- public void runInjectorPipeline(Pipeline injectorPipeline) {
- DataflowPipelineJob injectorJob = (DataflowPipelineJob) injectorPipeline.run();
- jobsToCancel.add(injectorJob);
- }
-
- /**
- * Start the auxiliary injector pipeline, then wait for this pipeline to finish.
- */
- public void mockUnboundedSource(String inputFile, PipelineResult result) {
- startInjectorIfNeeded(inputFile);
- waitToFinish(result);
- }
-
- /**
- * If {@literal DataflowPipelineRunner} or {@literal BlockingDataflowPipelineRunner} is used,
- * waits for the pipeline to finish and cancels it (and the injector) before the program exists.
- */
- public void waitToFinish(PipelineResult result) {
- if (result instanceof DataflowPipelineJob) {
- final DataflowPipelineJob job = (DataflowPipelineJob) result;
- jobsToCancel.add(job);
- if (!options.as(DataflowExampleOptions.class).getKeepJobsRunning()) {
- addShutdownHook(jobsToCancel);
- }
- try {
- job.waitToFinish(-1, TimeUnit.SECONDS, new MonitoringUtil.PrintHandler(System.out));
- } catch (Exception e) {
- throw new RuntimeException("Failed to wait for job to finish: " + job.getJobId());
- }
- } else {
- // Do nothing if the given PipelineResult doesn't support waitToFinish(),
- // such as EvaluationResults returned by DirectPipelineRunner.
- }
- }
-
- private void addShutdownHook(final Collection<DataflowPipelineJob> jobs) {
- if (dataflowClient == null) {
- dataflowClient = options.getDataflowClient();
- }
-
- Runtime.getRuntime().addShutdownHook(new Thread() {
- @Override
- public void run() {
- tearDown();
- printPendingMessages();
- for (DataflowPipelineJob job : jobs) {
- System.out.println("Canceling example pipeline: " + job.getJobId());
- try {
- job.cancel();
- } catch (IOException e) {
- System.out.println("Failed to cancel the job,"
- + " please go to the Developers Console to cancel it manually");
- System.out.println(
- MonitoringUtil.getJobMonitoringPageURL(job.getProjectId(), job.getJobId()));
- }
- }
-
- for (DataflowPipelineJob job : jobs) {
- boolean cancellationVerified = false;
- for (int retryAttempts = 6; retryAttempts > 0; retryAttempts--) {
- if (job.getState().isTerminal()) {
- cancellationVerified = true;
- System.out.println("Canceled example pipeline: " + job.getJobId());
- break;
- } else {
- System.out.println(
- "The example pipeline is still running. Verifying the cancellation.");
- }
- try {
- Thread.sleep(10000);
- } catch (InterruptedException e) {
- // Ignore
- }
- }
- if (!cancellationVerified) {
- System.out.println("Failed to verify the cancellation for job: " + job.getJobId());
- System.out.println("Please go to the Developers Console to verify manually:");
- System.out.println(
- MonitoringUtil.getJobMonitoringPageURL(job.getProjectId(), job.getJobId()));
- }
- }
- }
- });
- }
-
- private void printPendingMessages() {
- System.out.println();
- System.out.println("***********************************************************");
- System.out.println("***********************************************************");
- for (String message : pendingMessages) {
- System.out.println(message);
- }
- System.out.println("***********************************************************");
- System.out.println("***********************************************************");
- }
-
- private static <T> T executeNullIfNotFound(
- AbstractGoogleClientRequest<T> request) throws IOException {
- try {
- return request.execute();
- } catch (GoogleJsonResponseException e) {
- if (e.getStatusCode() == HttpServletResponse.SC_NOT_FOUND) {
- return null;
- } else {
- throw e;
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java
----------------------------------------------------------------------
diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java
deleted file mode 100644
index bef5bfd..0000000
--- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package ${package}.common;
-
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-
-/**
- * Options that can be used to configure BigQuery tables in Dataflow examples.
- * The project defaults to the project being used to run the example.
- */
-public interface ExampleBigQueryTableOptions extends DataflowPipelineOptions {
- @Description("BigQuery dataset name")
- @Default.String("dataflow_examples")
- String getBigQueryDataset();
- void setBigQueryDataset(String dataset);
-
- @Description("BigQuery table name")
- @Default.InstanceFactory(BigQueryTableFactory.class)
- String getBigQueryTable();
- void setBigQueryTable(String table);
-
- @Description("BigQuery table schema")
- TableSchema getBigQuerySchema();
- void setBigQuerySchema(TableSchema schema);
-
- /**
- * Returns the job name as the default BigQuery table name.
- */
- static class BigQueryTableFactory implements DefaultValueFactory<String> {
- @Override
- public String create(PipelineOptions options) {
- return options.as(DataflowPipelineOptions.class).getJobName()
- .replace('-', '_');
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java
----------------------------------------------------------------------
diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java
deleted file mode 100644
index 525de69..0000000
--- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package ${package}.common;
-
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-
-/**
- * Options that can be used to configure Pub/Sub topic in Dataflow examples.
- */
-public interface ExamplePubsubTopicOptions extends DataflowPipelineOptions {
- @Description("Pub/Sub topic")
- @Default.InstanceFactory(PubsubTopicFactory.class)
- String getPubsubTopic();
- void setPubsubTopic(String topic);
-
- @Description("Number of workers to use when executing the injector pipeline")
- @Default.Integer(1)
- int getInjectorNumWorkers();
- void setInjectorNumWorkers(int numWorkers);
-
- /**
- * Returns a default Pub/Sub topic based on the project and the job names.
- */
- static class PubsubTopicFactory implements DefaultValueFactory<String> {
- @Override
- public String create(PipelineOptions options) {
- DataflowPipelineOptions dataflowPipelineOptions =
- options.as(DataflowPipelineOptions.class);
- return "projects/" + dataflowPipelineOptions.getProject()
- + "/topics/" + dataflowPipelineOptions.getJobName();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/PubsubFileInjector.java
----------------------------------------------------------------------
diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/PubsubFileInjector.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/PubsubFileInjector.java
deleted file mode 100644
index f6f80ae..0000000
--- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/PubsubFileInjector.java
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package ${package}.common;
-
-import com.google.api.services.pubsub.Pubsub;
-import com.google.api.services.pubsub.model.PublishRequest;
-import com.google.api.services.pubsub.model.PubsubMessage;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.options.Validation;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.IntraBundleParallelization;
-import com.google.cloud.dataflow.sdk.util.Transport;
-import com.google.common.collect.ImmutableMap;
-
-import java.io.IOException;
-import java.util.Arrays;
-
-/**
- * A batch Dataflow pipeline for injecting a set of GCS files into
- * a PubSub topic line by line. Empty lines are skipped.
- *
- * <p>This is useful for testing streaming
- * pipelines. Note that since batch pipelines might retry chunks, this
- * does _not_ guarantee exactly-once injection of file data. Some lines may
- * be published multiple times.
- * </p>
- */
-public class PubsubFileInjector {
-
- /**
- * An incomplete {@code PubsubFileInjector} transform with unbound output topic.
- */
- public static class Unbound {
- private final String timestampLabelKey;
-
- Unbound() {
- this.timestampLabelKey = null;
- }
-
- Unbound(String timestampLabelKey) {
- this.timestampLabelKey = timestampLabelKey;
- }
-
- Unbound withTimestampLabelKey(String timestampLabelKey) {
- return new Unbound(timestampLabelKey);
- }
-
- public Bound publish(String outputTopic) {
- return new Bound(outputTopic, timestampLabelKey);
- }
- }
-
- /** A DoFn that publishes non-empty lines to Google Cloud PubSub. */
- public static class Bound extends DoFn<String, Void> {
- private final String outputTopic;
- private final String timestampLabelKey;
- public transient Pubsub pubsub;
-
- public Bound(String outputTopic, String timestampLabelKey) {
- this.outputTopic = outputTopic;
- this.timestampLabelKey = timestampLabelKey;
- }
-
- @Override
- public void startBundle(Context context) {
- this.pubsub =
- Transport.newPubsubClient(context.getPipelineOptions().as(DataflowPipelineOptions.class))
- .build();
- }
-
- @Override
- public void processElement(ProcessContext c) throws IOException {
- if (c.element().isEmpty()) {
- return;
- }
- PubsubMessage pubsubMessage = new PubsubMessage();
- pubsubMessage.encodeData(c.element().getBytes());
- if (timestampLabelKey != null) {
- pubsubMessage.setAttributes(
- ImmutableMap.of(timestampLabelKey, Long.toString(c.timestamp().getMillis())));
- }
- PublishRequest publishRequest = new PublishRequest();
- publishRequest.setMessages(Arrays.asList(pubsubMessage));
- this.pubsub.projects().topics().publish(outputTopic, publishRequest).execute();
- }
- }
-
- /**
- * Creates a {@code PubsubFileInjector} transform with the given timestamp label key.
- */
- public static Unbound withTimestampLabelKey(String timestampLabelKey) {
- return new Unbound(timestampLabelKey);
- }
-
- /**
- * Creates a {@code PubsubFileInjector} transform that publishes to the given output topic.
- */
- public static Bound publish(String outputTopic) {
- return new Unbound().publish(outputTopic);
- }
-
- /**
- * Command line parameter options.
- */
- private interface PubsubFileInjectorOptions extends PipelineOptions {
- @Description("GCS location of files.")
- @Validation.Required
- String getInput();
- void setInput(String value);
-
- @Description("Topic to publish on.")
- @Validation.Required
- String getOutputTopic();
- void setOutputTopic(String value);
- }
-
- /**
- * Sets up and starts streaming pipeline.
- */
- public static void main(String[] args) {
- PubsubFileInjectorOptions options = PipelineOptionsFactory.fromArgs(args)
- .withValidation()
- .as(PubsubFileInjectorOptions.class);
-
- Pipeline pipeline = Pipeline.create(options);
-
- pipeline
- .apply(TextIO.Read.from(options.getInput()))
- .apply(IntraBundleParallelization.of(PubsubFileInjector.publish(options.getOutputTopic()))
- .withMaxParallelism(20));
-
- pipeline.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java
----------------------------------------------------------------------
diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java
deleted file mode 100644
index 7a9aa4c..0000000
--- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package ${package};
-
-import com.google.common.io.Files;
-
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TemporaryFolder;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.io.File;
-import java.nio.charset.StandardCharsets;
-
-/**
- * Tests for {@link DebuggingWordCount}.
- */
-@RunWith(JUnit4.class)
-public class DebuggingWordCountTest {
- @Rule public TemporaryFolder tmpFolder = new TemporaryFolder();
-
- @Test
- public void testDebuggingWordCount() throws Exception {
- File file = tmpFolder.newFile();
- Files.write("stomach secret Flourish message Flourish here Flourish", file,
- StandardCharsets.UTF_8);
- DebuggingWordCount.main(new String[]{"--inputFile=" + file.getAbsolutePath()});
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java
----------------------------------------------------------------------
diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java
deleted file mode 100644
index 45555ce..0000000
--- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package ${package};
-
-import ${package}.WordCount.CountWords;
-import ${package}.WordCount.ExtractWordsFn;
-import ${package}.WordCount.FormatAsTextFn;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.hamcrest.CoreMatchers;
-import org.junit.Assert;
-import org.junit.Test;
-import org.junit.experimental.categories.Category;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * Tests of WordCount.
- */
-@RunWith(JUnit4.class)
-public class WordCountTest {
-
- /** Example test that tests a specific DoFn. */
- @Test
- public void testExtractWordsFn() {
- DoFnTester<String, String> extractWordsFn =
- DoFnTester.of(new ExtractWordsFn());
-
- Assert.assertThat(extractWordsFn.processBatch(" some input words "),
- CoreMatchers.hasItems("some", "input", "words"));
- Assert.assertThat(extractWordsFn.processBatch(" "),
- CoreMatchers.<String>hasItems());
- Assert.assertThat(extractWordsFn.processBatch(" some ", " input", " words"),
- CoreMatchers.hasItems("some", "input", "words"));
- }
-
- static final String[] WORDS_ARRAY = new String[] {
- "hi there", "hi", "hi sue bob",
- "hi sue", "", "bob hi"};
-
- static final List<String> WORDS = Arrays.asList(WORDS_ARRAY);
-
- static final String[] COUNTS_ARRAY = new String[] {
- "hi: 5", "there: 1", "sue: 2", "bob: 2"};
-
- /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */
- @Test
- @Category(RunnableOnService.class)
- public void testCountWords() throws Exception {
- Pipeline p = TestPipeline.create();
-
- PCollection<String> input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));
-
- PCollection<String> output = input.apply(new CountWords())
- .apply(ParDo.of(new FormatAsTextFn()));
-
- DataflowAssert.that(output).containsInAnyOrder(COUNTS_ARRAY);
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties
----------------------------------------------------------------------
diff --git a/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties b/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties
deleted file mode 100644
index c59e77a..0000000
--- a/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties
+++ /dev/null
@@ -1,5 +0,0 @@
-package=it.pkg
-version=0.1-SNAPSHOT
-groupId=archetype.it
-artifactId=basic
-targetPlatform=1.7
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/maven-archetypes/examples/src/test/resources/projects/basic/goal.txt
----------------------------------------------------------------------
diff --git a/maven-archetypes/examples/src/test/resources/projects/basic/goal.txt b/maven-archetypes/examples/src/test/resources/projects/basic/goal.txt
deleted file mode 100644
index 0b59873..0000000
--- a/maven-archetypes/examples/src/test/resources/projects/basic/goal.txt
+++ /dev/null
@@ -1 +0,0 @@
-verify
[45/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java
deleted file mode 100644
index ab7df6f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java
+++ /dev/null
@@ -1,1499 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import com.google.api.client.json.JsonFactory;
-import com.google.api.services.bigquery.Bigquery;
-import com.google.api.services.bigquery.model.QueryRequest;
-import com.google.api.services.bigquery.model.TableReference;
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.cloud.dataflow.sdk.coders.AtomicCoder;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.KvCoder;
-import com.google.cloud.dataflow.sdk.coders.StandardCoder;
-import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
-import com.google.cloud.dataflow.sdk.coders.TableRowJsonCoder;
-import com.google.cloud.dataflow.sdk.coders.VarIntCoder;
-import com.google.cloud.dataflow.sdk.coders.VoidCoder;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.CreateDisposition;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.WriteDisposition;
-import com.google.cloud.dataflow.sdk.options.BigQueryOptions;
-import com.google.cloud.dataflow.sdk.options.GcpOptions;
-import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
-import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
-import com.google.cloud.dataflow.sdk.transforms.Sum;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.BigQueryTableInserter;
-import com.google.cloud.dataflow.sdk.util.BigQueryTableRowIterator;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.util.Reshuffle;
-import com.google.cloud.dataflow.sdk.util.SystemDoFnInternal;
-import com.google.cloud.dataflow.sdk.util.Transport;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded;
-import com.google.cloud.dataflow.sdk.values.PDone;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.hadoop.util.ApiErrorExtractor;
-import com.google.common.base.MoreObjects;
-import com.google.common.base.Preconditions;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.UUID;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ThreadLocalRandom;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import javax.annotation.Nullable;
-
-/**
- * {@link PTransform}s for reading and writing
- * <a href="https://developers.google.com/bigquery/">BigQuery</a> tables.
- *
- * <h3>Table References</h3>
- * <p>A fully-qualified BigQuery table name consists of three components:
- * <ul>
- * <li>{@code projectId}: the Cloud project id (defaults to
- * {@link GcpOptions#getProject()}).
- * <li>{@code datasetId}: the BigQuery dataset id, unique within a project.
- * <li>{@code tableId}: a table id, unique within a dataset.
- * </ul>
- *
- * <p>BigQuery table references are stored as a {@link TableReference}, which comes
- * from the <a href="https://cloud.google.com/bigquery/client-libraries">
- * BigQuery Java Client API</a>.
- * Tables can be referred to as Strings, with or without the {@code projectId}.
- * A helper function is provided ({@link BigQueryIO#parseTableSpec(String)})
- * that parses the following string forms into a {@link TableReference}:
- *
- * <ul>
- * <li>[{@code project_id}]:[{@code dataset_id}].[{@code table_id}]
- * <li>[{@code dataset_id}].[{@code table_id}]
- * </ul>
- *
- * <h3>Reading</h3>
- * <p>To read from a BigQuery table, apply a {@link BigQueryIO.Read} transformation.
- * This produces a {@link PCollection} of {@link TableRow TableRows} as output:
- * <pre>{@code
- * PCollection<TableRow> shakespeare = pipeline.apply(
- * BigQueryIO.Read.named("Read")
- * .from("clouddataflow-readonly:samples.weather_stations"));
- * }</pre>
- *
- * <p>See {@link TableRow} for more information on the {@link TableRow} object.
- *
- * <p>Users may provide a query to read from rather than reading all of a BigQuery table. If
- * specified, the result obtained by executing the specified query will be used as the data of the
- * input transform.
- *
- * <pre>{@code
- * PCollection<TableRow> shakespeare = pipeline.apply(
- * BigQueryIO.Read.named("Read")
- * .fromQuery("SELECT year, mean_temp FROM samples.weather_stations"));
- * }</pre>
- *
- * <p>When creating a BigQuery input transform, users should provide either a query or a table.
- * Pipeline construction will fail with a validation error if neither or both are specified.
- *
- * <h3>Writing</h3>
- * <p>To write to a BigQuery table, apply a {@link BigQueryIO.Write} transformation.
- * This consumes a {@link PCollection} of {@link TableRow TableRows} as input.
- * <pre>{@code
- * PCollection<TableRow> quotes = ...
- *
- * List<TableFieldSchema> fields = new ArrayList<>();
- * fields.add(new TableFieldSchema().setName("source").setType("STRING"));
- * fields.add(new TableFieldSchema().setName("quote").setType("STRING"));
- * TableSchema schema = new TableSchema().setFields(fields);
- *
- * quotes.apply(BigQueryIO.Write
- * .named("Write")
- * .to("my-project:output.output_table")
- * .withSchema(schema)
- * .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
- * }</pre>
- *
- * <p>See {@link BigQueryIO.Write} for details on how to specify if a write should
- * append to an existing table, replace the table, or verify that the table is
- * empty. Note that the dataset being written to must already exist. Write
- * dispositions are not supported in streaming mode.
- *
- * <h3>Sharding BigQuery output tables</h3>
- * <p>A common use case is to dynamically generate BigQuery table names based on
- * the current window. To support this,
- * {@link BigQueryIO.Write#to(SerializableFunction)}
- * accepts a function mapping the current window to a tablespec. For example,
- * here's code that outputs daily tables to BigQuery:
- * <pre>{@code
- * PCollection<TableRow> quotes = ...
- * quotes.apply(Window.<TableRow>into(CalendarWindows.days(1)))
- * .apply(BigQueryIO.Write
- * .named("Write")
- * .withSchema(schema)
- * .to(new SerializableFunction<BoundedWindow, String>() {
- * public String apply(BoundedWindow window) {
- * // The cast below is safe because CalendarWindows.days(1) produces IntervalWindows.
- * String dayString = DateTimeFormat.forPattern("yyyy_MM_dd")
- * .withZone(DateTimeZone.UTC)
- * .print(((IntervalWindow) window).start());
- * return "my-project:output.output_table_" + dayString;
- * }
- * }));
- * }</pre>
- *
- * <p>Per-window tables are not yet supported in batch mode.
- *
- * <h3>Permissions</h3>
- * <p>Permission requirements depend on the {@link PipelineRunner} that is used to execute the
- * Dataflow job. Please refer to the documentation of corresponding {@link PipelineRunner}s for
- * more details.
- *
- * <p>Please see <a href="https://cloud.google.com/bigquery/access-control">BigQuery Access Control
- * </a> for security and permission related information specific to BigQuery.
- */
-public class BigQueryIO {
- private static final Logger LOG = LoggerFactory.getLogger(BigQueryIO.class);
-
- /**
- * Singleton instance of the JSON factory used to read and write JSON
- * formatted rows.
- */
- private static final JsonFactory JSON_FACTORY = Transport.getJsonFactory();
-
- /**
- * Project IDs must contain 6-63 lowercase letters, digits, or dashes.
- * IDs must start with a letter and may not end with a dash.
- * This regex isn't exact - this allows for patterns that would be rejected by
- * the service, but this is sufficient for basic parsing of table references.
- */
- private static final String PROJECT_ID_REGEXP = "[a-z][-a-z0-9:.]{4,61}[a-z0-9]";
-
- /**
- * Regular expression that matches Dataset IDs.
- */
- private static final String DATASET_REGEXP = "[-\\w.]{1,1024}";
-
- /**
- * Regular expression that matches Table IDs.
- */
- private static final String TABLE_REGEXP = "[-\\w$@]{1,1024}";
-
- /**
- * Matches table specifications in the form {@code "[project_id]:[dataset_id].[table_id]"} or
- * {@code "[dataset_id].[table_id]"}.
- */
- private static final String DATASET_TABLE_REGEXP =
- String.format("((?<PROJECT>%s):)?(?<DATASET>%s)\\.(?<TABLE>%s)", PROJECT_ID_REGEXP,
- DATASET_REGEXP, TABLE_REGEXP);
-
- private static final Pattern TABLE_SPEC = Pattern.compile(DATASET_TABLE_REGEXP);
-
- // TODO: make this private and remove improper access from BigQueryIOTranslator.
- public static final String SET_PROJECT_FROM_OPTIONS_WARNING =
- "No project specified for BigQuery table \"%1$s.%2$s\". Assuming it is in \"%3$s\". If the"
- + " table is in a different project please specify it as a part of the BigQuery table"
- + " definition.";
-
- private static final String RESOURCE_NOT_FOUND_ERROR =
- "BigQuery %1$s not found for table \"%2$s\" . Please create the %1$s before pipeline"
- + " execution. If the %1$s is created by an earlier stage of the pipeline, this"
- + " validation can be disabled using #withoutValidation.";
-
- private static final String UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR =
- "Unable to confirm BigQuery %1$s presence for table \"%2$s\". If the %1$s is created by"
- + " an earlier stage of the pipeline, this validation can be disabled using"
- + " #withoutValidation.";
-
- /**
- * Parse a table specification in the form
- * {@code "[project_id]:[dataset_id].[table_id]"} or {@code "[dataset_id].[table_id]"}.
- *
- * <p>If the project id is omitted, the default project id is used.
- */
- public static TableReference parseTableSpec(String tableSpec) {
- Matcher match = TABLE_SPEC.matcher(tableSpec);
- if (!match.matches()) {
- throw new IllegalArgumentException(
- "Table reference is not in [project_id]:[dataset_id].[table_id] "
- + "format: " + tableSpec);
- }
-
- TableReference ref = new TableReference();
- ref.setProjectId(match.group("PROJECT"));
-
- return ref.setDatasetId(match.group("DATASET")).setTableId(match.group("TABLE"));
- }
-
- /**
- * Returns a canonical string representation of the {@link TableReference}.
- */
- public static String toTableSpec(TableReference ref) {
- StringBuilder sb = new StringBuilder();
- if (ref.getProjectId() != null) {
- sb.append(ref.getProjectId());
- sb.append(":");
- }
-
- sb.append(ref.getDatasetId()).append('.').append(ref.getTableId());
- return sb.toString();
- }
-
- /**
- * A {@link PTransform} that reads from a BigQuery table and returns a
- * {@link PCollection} of {@link TableRow TableRows} containing each of the rows of the table.
- *
- * <p>Each {@link TableRow} contains values indexed by column name. Here is a
- * sample processing function that processes a "line" column from rows:
- * <pre>{@code
- * static class ExtractWordsFn extends DoFn<TableRow, String> {
- * public void processElement(ProcessContext c) {
- * // Get the "line" field of the TableRow object, split it into words, and emit them.
- * TableRow row = c.element();
- * String[] words = row.get("line").toString().split("[^a-zA-Z']+");
- * for (String word : words) {
- * if (!word.isEmpty()) {
- * c.output(word);
- * }
- * }
- * }
- * }}</pre>
- */
- public static class Read {
- /**
- * Returns a {@link Read.Bound} with the given name. The BigQuery table or query to be read
- * from has not yet been configured.
- */
- public static Bound named(String name) {
- return new Bound().named(name);
- }
-
- /**
- * Reads a BigQuery table specified as {@code "[project_id]:[dataset_id].[table_id]"} or
- * {@code "[dataset_id].[table_id]"} for tables within the current project.
- */
- public static Bound from(String tableSpec) {
- return new Bound().from(tableSpec);
- }
-
- /**
- * Reads results received after executing the given query.
- */
- public static Bound fromQuery(String query) {
- return new Bound().fromQuery(query);
- }
-
- /**
- * Reads a BigQuery table specified as a {@link TableReference} object.
- */
- public static Bound from(TableReference table) {
- return new Bound().from(table);
- }
-
- /**
- * Disables BigQuery table validation, which is enabled by default.
- */
- public static Bound withoutValidation() {
- return new Bound().withoutValidation();
- }
-
- /**
- * A {@link PTransform} that reads from a BigQuery table and returns a bounded
- * {@link PCollection} of {@link TableRow TableRows}.
- */
- public static class Bound extends PTransform<PInput, PCollection<TableRow>> {
- TableReference table;
- final String query;
- final boolean validate;
- @Nullable
- Boolean flattenResults;
-
- private static final String QUERY_VALIDATION_FAILURE_ERROR =
- "Validation of query \"%1$s\" failed. If the query depends on an earlier stage of the"
- + " pipeline, This validation can be disabled using #withoutValidation.";
-
- private Bound() {
- this(null, null, null, true, null);
- }
-
- private Bound(String name, String query, TableReference reference, boolean validate,
- Boolean flattenResults) {
- super(name);
- this.table = reference;
- this.query = query;
- this.validate = validate;
- this.flattenResults = flattenResults;
- }
-
- /**
- * Returns a copy of this transform with the specified transform name.
- *
- * <p>Does not modify this object.
- */
- public Bound named(String name) {
- return new Bound(name, query, table, validate, flattenResults);
- }
-
- /**
- * Returns a copy of this transform that reads from the specified table. Refer to
- * {@link #parseTableSpec(String)} for the specification format.
- *
- * <p>Does not modify this object.
- */
- public Bound from(String tableSpec) {
- return from(parseTableSpec(tableSpec));
- }
-
- /**
- * Returns a copy of this transform that reads from the specified table.
- *
- * <p>Does not modify this object.
- */
- public Bound from(TableReference table) {
- return new Bound(name, query, table, validate, flattenResults);
- }
-
- /**
- * Returns a copy of this transform that reads the results of the specified query.
- *
- * <p>Does not modify this object.
- *
- * <p>By default, the query results will be flattened -- see
- * "flattenResults" in the <a href="https://cloud.google.com/bigquery/docs/reference/v2/jobs">
- * Jobs documentation</a> for more information. To disable flattening, use
- * {@link BigQueryIO.Read.Bound#withoutResultFlattening}.
- */
- public Bound fromQuery(String query) {
- return new Bound(name, query, table, validate,
- MoreObjects.firstNonNull(flattenResults, Boolean.TRUE));
- }
-
- /**
- * Disable table validation.
- */
- public Bound withoutValidation() {
- return new Bound(name, query, table, false, flattenResults);
- }
-
- /**
- * Disable <a href="https://cloud.google.com/bigquery/docs/reference/v2/jobs">
- * flattening of query results</a>.
- *
- * <p>Only valid when a query is used ({@link #fromQuery}). Setting this option when reading
- * from a table will cause an error during validation.
- */
- public Bound withoutResultFlattening() {
- return new Bound(name, query, table, validate, false);
- }
-
- /**
- * Validates the current {@link PTransform}.
- */
- @Override
- public void validate(PInput input) {
- if (table == null && query == null) {
- throw new IllegalStateException(
- "Invalid BigQuery read operation, either table reference or query has to be set");
- } else if (table != null && query != null) {
- throw new IllegalStateException("Invalid BigQuery read operation. Specifies both a"
- + " query and a table, only one of these should be provided");
- } else if (table != null && flattenResults != null) {
- throw new IllegalStateException("Invalid BigQuery read operation. Specifies a"
- + " table with a result flattening preference, which is not configurable");
- } else if (query != null && flattenResults == null) {
- throw new IllegalStateException("Invalid BigQuery read operation. Specifies a"
- + " query without a result flattening preference");
- }
-
- BigQueryOptions bqOptions = input.getPipeline().getOptions().as(BigQueryOptions.class);
- if (table != null && table.getProjectId() == null) {
- // If user does not specify a project we assume the table to be located in the project
- // that owns the Dataflow job.
- LOG.warn(String.format(SET_PROJECT_FROM_OPTIONS_WARNING, table.getDatasetId(),
- table.getTableId(), bqOptions.getProject()));
- table.setProjectId(bqOptions.getProject());
- }
-
- if (validate) {
- // Check for source table/query presence for early failure notification.
- // Note that a presence check can fail if the table or dataset are created by earlier
- // stages of the pipeline or if a query depends on earlier stages of a pipeline. For these
- // cases the withoutValidation method can be used to disable the check.
- if (table != null) {
- verifyDatasetPresence(bqOptions, table);
- verifyTablePresence(bqOptions, table);
- }
- if (query != null) {
- dryRunQuery(bqOptions, query);
- }
- }
- }
-
- private static void dryRunQuery(BigQueryOptions options, String query) {
- Bigquery client = Transport.newBigQueryClient(options).build();
- QueryRequest request = new QueryRequest();
- request.setQuery(query);
- request.setDryRun(true);
-
- try {
- BigQueryTableRowIterator.executeWithBackOff(
- client.jobs().query(options.getProject(), request), QUERY_VALIDATION_FAILURE_ERROR,
- query);
- } catch (Exception e) {
- throw new IllegalArgumentException(
- String.format(QUERY_VALIDATION_FAILURE_ERROR, query), e);
- }
- }
-
- @Override
- public PCollection<TableRow> apply(PInput input) {
- return PCollection.<TableRow>createPrimitiveOutputInternal(
- input.getPipeline(),
- WindowingStrategy.globalDefault(),
- IsBounded.BOUNDED)
- // Force the output's Coder to be what the read is using, and
- // unchangeable later, to ensure that we read the input in the
- // format specified by the Read transform.
- .setCoder(TableRowJsonCoder.of());
- }
-
- @Override
- protected Coder<TableRow> getDefaultOutputCoder() {
- return TableRowJsonCoder.of();
- }
-
- static {
- DirectPipelineRunner.registerDefaultTransformEvaluator(
- Bound.class, new DirectPipelineRunner.TransformEvaluator<Bound>() {
- @Override
- public void evaluate(
- Bound transform, DirectPipelineRunner.EvaluationContext context) {
- evaluateReadHelper(transform, context);
- }
- });
- }
-
- /**
- * Returns the table to read, or {@code null} if reading from a query instead.
- */
- public TableReference getTable() {
- return table;
- }
-
- /**
- * Returns the query to be read, or {@code null} if reading from a table instead.
- */
- public String getQuery() {
- return query;
- }
-
- /**
- * Returns true if table validation is enabled.
- */
- public boolean getValidate() {
- return validate;
- }
-
- /**
- * Returns true/false if result flattening is enabled/disabled, or null if not applicable.
- */
- public Boolean getFlattenResults() {
- return flattenResults;
- }
- }
-
- /** Disallow construction of utility class. */
- private Read() {}
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A {@link PTransform} that writes a {@link PCollection} containing {@link TableRow TableRows}
- * to a BigQuery table.
- *
- * <p>In BigQuery, each table has an enclosing dataset. The dataset being written must already
- * exist.
- *
- * <p>By default, tables will be created if they do not exist, which corresponds to a
- * {@link CreateDisposition#CREATE_IF_NEEDED} disposition that matches the default of BigQuery's
- * Jobs API. A schema must be provided (via {@link BigQueryIO.Write#withSchema(TableSchema)}),
- * or else the transform may fail at runtime with an {@link IllegalArgumentException}.
- *
- * <p>By default, writes require an empty table, which corresponds to
- * a {@link WriteDisposition#WRITE_EMPTY} disposition that matches the
- * default of BigQuery's Jobs API.
- *
- * <p>Here is a sample transform that produces TableRow values containing
- * "word" and "count" columns:
- * <pre>{@code
- * static class FormatCountsFn extends DoFn<KV<String, Long>, TableRow> {
- * public void processElement(ProcessContext c) {
- * TableRow row = new TableRow()
- * .set("word", c.element().getKey())
- * .set("count", c.element().getValue().intValue());
- * c.output(row);
- * }
- * }}</pre>
- */
- public static class Write {
- /**
- * An enumeration type for the BigQuery create disposition strings.
- *
- * @see <a href="https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.query.createDisposition">
- * <code>configuration.query.createDisposition</code> in the BigQuery Jobs API</a>
- */
- public enum CreateDisposition {
- /**
- * Specifies that tables should not be created.
- *
- * <p>If the output table does not exist, the write fails.
- */
- CREATE_NEVER,
-
- /**
- * Specifies that tables should be created if needed. This is the default
- * behavior.
- *
- * <p>Requires that a table schema is provided via {@link BigQueryIO.Write#withSchema}.
- * This precondition is checked before starting a job. The schema is
- * not required to match an existing table's schema.
- *
- * <p>When this transformation is executed, if the output table does not
- * exist, the table is created from the provided schema. Note that even if
- * the table exists, it may be recreated if necessary when paired with a
- * {@link WriteDisposition#WRITE_TRUNCATE}.
- */
- CREATE_IF_NEEDED
- }
-
- /**
- * An enumeration type for the BigQuery write disposition strings.
- *
- * @see <a href="https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.query.writeDisposition">
- * <code>configuration.query.writeDisposition</code> in the BigQuery Jobs API</a>
- */
- public enum WriteDisposition {
- /**
- * Specifies that write should replace a table.
- *
- * <p>The replacement may occur in multiple steps - for instance by first
- * removing the existing table, then creating a replacement, then filling
- * it in. This is not an atomic operation, and external programs may
- * see the table in any of these intermediate steps.
- */
- WRITE_TRUNCATE,
-
- /**
- * Specifies that rows may be appended to an existing table.
- */
- WRITE_APPEND,
-
- /**
- * Specifies that the output table must be empty. This is the default
- * behavior.
- *
- * <p>If the output table is not empty, the write fails at runtime.
- *
- * <p>This check may occur long before data is written, and does not
- * guarantee exclusive access to the table. If two programs are run
- * concurrently, each specifying the same output table and
- * a {@link WriteDisposition} of {@link WriteDisposition#WRITE_EMPTY}, it is possible
- * for both to succeed.
- */
- WRITE_EMPTY
- }
-
- /**
- * Creates a write transformation with the given transform name. The BigQuery table to be
- * written has not yet been configured.
- */
- public static Bound named(String name) {
- return new Bound().named(name);
- }
-
- /**
- * Creates a write transformation for the given table specification.
- *
- * <p>Refer to {@link #parseTableSpec(String)} for the specification format.
- */
- public static Bound to(String tableSpec) {
- return new Bound().to(tableSpec);
- }
-
- /** Creates a write transformation for the given table. */
- public static Bound to(TableReference table) {
- return new Bound().to(table);
- }
-
- /**
- * Creates a write transformation from a function that maps windows to table specifications.
- * Each time a new window is encountered, this function will be called and the resulting table
- * will be created. Records within that window will be written to the associated table.
- *
- * <p>See {@link #parseTableSpec(String)} for the format that {@code tableSpecFunction} should
- * return.
- *
- * <p>{@code tableSpecFunction} should be deterministic. When given the same window, it should
- * always return the same table specification.
- */
- public static Bound to(SerializableFunction<BoundedWindow, String> tableSpecFunction) {
- return new Bound().to(tableSpecFunction);
- }
-
- /**
- * Creates a write transformation from a function that maps windows to {@link TableReference}
- * objects.
- *
- * <p>{@code tableRefFunction} should be deterministic. When given the same window, it should
- * always return the same table reference.
- */
- public static Bound toTableReference(
- SerializableFunction<BoundedWindow, TableReference> tableRefFunction) {
- return new Bound().toTableReference(tableRefFunction);
- }
-
- /**
- * Creates a write transformation with the specified schema to use in table creation.
- *
- * <p>The schema is <i>required</i> only if writing to a table that does not already
- * exist, and {@link CreateDisposition} is set to
- * {@link CreateDisposition#CREATE_IF_NEEDED}.
- */
- public static Bound withSchema(TableSchema schema) {
- return new Bound().withSchema(schema);
- }
-
- /** Creates a write transformation with the specified options for creating the table. */
- public static Bound withCreateDisposition(CreateDisposition disposition) {
- return new Bound().withCreateDisposition(disposition);
- }
-
- /** Creates a write transformation with the specified options for writing to the table. */
- public static Bound withWriteDisposition(WriteDisposition disposition) {
- return new Bound().withWriteDisposition(disposition);
- }
-
- /**
- * Creates a write transformation with BigQuery table validation disabled.
- */
- public static Bound withoutValidation() {
- return new Bound().withoutValidation();
- }
-
- /**
- * A {@link PTransform} that can write either a bounded or unbounded
- * {@link PCollection} of {@link TableRow TableRows} to a BigQuery table.
- */
- public static class Bound extends PTransform<PCollection<TableRow>, PDone> {
- final TableReference table;
-
- final SerializableFunction<BoundedWindow, TableReference> tableRefFunction;
-
- // Table schema. The schema is required only if the table does not exist.
- final TableSchema schema;
-
- // Options for creating the table. Valid values are CREATE_IF_NEEDED and
- // CREATE_NEVER.
- final CreateDisposition createDisposition;
-
- // Options for writing to the table. Valid values are WRITE_TRUNCATE,
- // WRITE_APPEND and WRITE_EMPTY.
- final WriteDisposition writeDisposition;
-
- // An option to indicate if table validation is desired. Default is true.
- final boolean validate;
-
- private static class TranslateTableSpecFunction implements
- SerializableFunction<BoundedWindow, TableReference> {
- private SerializableFunction<BoundedWindow, String> tableSpecFunction;
-
- TranslateTableSpecFunction(SerializableFunction<BoundedWindow, String> tableSpecFunction) {
- this.tableSpecFunction = tableSpecFunction;
- }
-
- @Override
- public TableReference apply(BoundedWindow value) {
- return parseTableSpec(tableSpecFunction.apply(value));
- }
- }
-
- /**
- * @deprecated Should be private. Instead, use one of the factory methods in
- * {@link BigQueryIO.Write}, such as {@link BigQueryIO.Write#to(String)}, to create an
- * instance of this class.
- */
- @Deprecated
- public Bound() {
- this(null, null, null, null, CreateDisposition.CREATE_IF_NEEDED,
- WriteDisposition.WRITE_EMPTY, true);
- }
-
- private Bound(String name, TableReference ref,
- SerializableFunction<BoundedWindow, TableReference> tableRefFunction, TableSchema schema,
- CreateDisposition createDisposition, WriteDisposition writeDisposition,
- boolean validate) {
- super(name);
- this.table = ref;
- this.tableRefFunction = tableRefFunction;
- this.schema = schema;
- this.createDisposition = createDisposition;
- this.writeDisposition = writeDisposition;
- this.validate = validate;
- }
-
- /**
- * Returns a copy of this write transformation, but with the specified transform name.
- *
- * <p>Does not modify this object.
- */
- public Bound named(String name) {
- return new Bound(name, table, tableRefFunction, schema, createDisposition,
- writeDisposition, validate);
- }
-
- /**
- * Returns a copy of this write transformation, but writing to the specified table. Refer to
- * {@link #parseTableSpec(String)} for the specification format.
- *
- * <p>Does not modify this object.
- */
- public Bound to(String tableSpec) {
- return to(parseTableSpec(tableSpec));
- }
-
- /**
- * Returns a copy of this write transformation, but writing to the specified table.
- *
- * <p>Does not modify this object.
- */
- public Bound to(TableReference table) {
- return new Bound(name, table, tableRefFunction, schema, createDisposition,
- writeDisposition, validate);
- }
-
- /**
- * Returns a copy of this write transformation, but using the specified function to determine
- * which table to write to for each window.
- *
- * <p>Does not modify this object.
- *
- * <p>{@code tableSpecFunction} should be deterministic. When given the same window, it
- * should always return the same table specification.
- */
- public Bound to(
- SerializableFunction<BoundedWindow, String> tableSpecFunction) {
- return toTableReference(new TranslateTableSpecFunction(tableSpecFunction));
- }
-
- /**
- * Returns a copy of this write transformation, but using the specified function to determine
- * which table to write to for each window.
- *
- * <p>Does not modify this object.
- *
- * <p>{@code tableRefFunction} should be deterministic. When given the same window, it should
- * always return the same table reference.
- */
- public Bound toTableReference(
-     SerializableFunction<BoundedWindow, TableReference> tableRefFunction) {
-   // The parameter shadows the field, so the copy receives the new function. A fixed
-   // table that is already configured is carried over as-is.
-   final Bound perWindow = new Bound(name, table, tableRefFunction, schema,
-       createDisposition, writeDisposition, validate);
-   return perWindow;
- }
-
- /**
- * Returns a copy of this write transformation, but using the specified schema for rows
- * to be written.
- *
- * <p>Does not modify this object.
- */
- public Bound withSchema(TableSchema schema) {
-   // The parameter shadows the field; all other settings are copied unchanged.
-   final Bound withNewSchema = new Bound(name, table, tableRefFunction, schema,
-       createDisposition, writeDisposition, validate);
-   return withNewSchema;
- }
-
- /**
- * Returns a copy of this write transformation, but using the specified create disposition.
- *
- * <p>Does not modify this object.
- */
- public Bound withCreateDisposition(CreateDisposition createDisposition) {
-   // The parameter shadows the field; all other settings are copied unchanged.
-   final Bound withNewCreateDisposition = new Bound(name, table, tableRefFunction, schema,
-       createDisposition, writeDisposition, validate);
-   return withNewCreateDisposition;
- }
-
- /**
- * Returns a copy of this write transformation, but using the specified write disposition.
- *
- * <p>Does not modify this object.
- */
- public Bound withWriteDisposition(WriteDisposition writeDisposition) {
-   // The parameter shadows the field; all other settings are copied unchanged.
-   final Bound withNewWriteDisposition = new Bound(name, table, tableRefFunction, schema,
-       createDisposition, writeDisposition, validate);
-   return withNewWriteDisposition;
- }
-
- /**
- * Returns a copy of this write transformation, but without BigQuery table validation.
- *
- * <p>Does not modify this object.
- */
- public Bound withoutValidation() {
-   // Identical copy, except that the validate flag is forced to false.
-   final boolean validationDisabled = false;
-   return new Bound(name, table, tableRefFunction, schema, createDisposition,
-       writeDisposition, validationDisabled);
- }
-
- /**
-  * Checks that {@code table} holds no rows.
-  *
-  * <p>A table that does not exist is treated as empty. Any other I/O failure is rethrown
-  * as a {@link RuntimeException}.
-  *
-  * @throws IllegalArgumentException if the table exists and is not empty
-  */
- private static void verifyTableEmpty(
-     BigQueryOptions options,
-     TableReference table) {
-   final boolean tableIsEmpty;
-   try {
-     Bigquery client = Transport.newBigQueryClient(options).build();
-     tableIsEmpty = new BigQueryTableInserter(client).isEmpty(table);
-   } catch (IOException e) {
-     if (new ApiErrorExtractor().itemNotFound(e)) {
-       // Nothing to do. If the table does not exist, it is considered empty.
-       return;
-     }
-     throw new RuntimeException(
-         "unable to confirm BigQuery table emptiness for table "
-             + BigQueryIO.toTableSpec(table), e);
-   }
-
-   if (!tableIsEmpty) {
-     throw new IllegalArgumentException(
-         "BigQuery table is not empty: " + BigQueryIO.toTableSpec(table));
-   }
- }
-
- /**
-  * Validates this write's configuration and, for streaming pipelines or per-window table
-  * functions, expands into {@code StreamWithDeDup}.
-  *
-  * <p>Throws {@link IllegalStateException} unless exactly one of a fixed table and a table
-  * function is set, and {@link IllegalArgumentException} for unsupported disposition
-  * combinations.
-  */
- @Override
- public PDone apply(PCollection<TableRow> input) {
- BigQueryOptions options = input.getPipeline().getOptions().as(BigQueryOptions.class);
-
- // Exactly one destination must be configured: a fixed table or a per-window function.
- if (table == null && tableRefFunction == null) {
- throw new IllegalStateException(
- "must set the table reference of a BigQueryIO.Write transform");
- }
- if (table != null && tableRefFunction != null) {
- throw new IllegalStateException(
- "Cannot set both a table reference and a table function for a BigQueryIO.Write "
- + "transform");
- }
-
- // A table can only be created on demand when its schema is known.
- if (createDisposition == CreateDisposition.CREATE_IF_NEEDED && schema == null) {
- throw new IllegalArgumentException("CreateDisposition is CREATE_IF_NEEDED, "
- + "however no schema was provided.");
- }
-
- if (table != null && table.getProjectId() == null) {
- // If user does not specify a project we assume the table to be located in the project
- // that owns the Dataflow job.
- // Note that this updates the TableReference held by this transform in place.
- String projectIdFromOptions = options.getProject();
- LOG.warn(String.format(BigQueryIO.SET_PROJECT_FROM_OPTIONS_WARNING, table.getDatasetId(),
- table.getTableId(), projectIdFromOptions));
- table.setProjectId(projectIdFromOptions);
- }
-
- // Check for destination table presence and emptiness for early failure notification.
- // Note that a presence check can fail if the table or dataset are created by earlier stages
- // of the pipeline. For these cases the withoutValidation method can be used to disable
- // the check.
- // Unfortunately we can't validate anything early in case tableRefFunction is specified.
- if (table != null && validate) {
- verifyDatasetPresence(options, table);
- if (getCreateDisposition() == BigQueryIO.Write.CreateDisposition.CREATE_NEVER) {
- verifyTablePresence(options, table);
- }
- if (getWriteDisposition() == BigQueryIO.Write.WriteDisposition.WRITE_EMPTY) {
- verifyTableEmpty(options, table);
- }
- }
-
- // In streaming, BigQuery write is taken care of by StreamWithDeDup transform.
- // We also currently do this if a tablespec function is specified.
- if (options.isStreaming() || tableRefFunction != null) {
- if (createDisposition == CreateDisposition.CREATE_NEVER) {
- throw new IllegalArgumentException("CreateDispostion.CREATE_NEVER is not "
- + "supported for unbounded PCollections or when using tablespec functions.");
- }
-
- if (writeDisposition == WriteDisposition.WRITE_TRUNCATE) {
- throw new IllegalArgumentException("WriteDisposition.WRITE_TRUNCATE is not "
- + "supported for unbounded PCollections or when using tablespec functions.");
- }
-
- return input.apply(new StreamWithDeDup(table, tableRefFunction, schema));
- }
-
- // No write is expanded here for the remaining (non-streaming, fixed-table) case.
- // NOTE(review): presumably the runner performs the write itself (this class registers a
- // DirectPipelineRunner evaluator for Bound) -- confirm for other runners.
- return PDone.in(input.getPipeline());
- }
-
- @Override
- protected Coder<Void> getDefaultOutputCoder() {
-   // This transform's output carries no elements, so the Void coder is used.
-   final Coder<Void> outputCoder = VoidCoder.of();
-   return outputCoder;
- }
-
- static {
-   // Direct-mode evaluation of a Write.Bound is delegated to evaluateWriteHelper.
-   final DirectPipelineRunner.TransformEvaluator<Bound> writeEvaluator =
-       new DirectPipelineRunner.TransformEvaluator<Bound>() {
-         @Override
-         public void evaluate(
-             Bound transform, DirectPipelineRunner.EvaluationContext context) {
-           evaluateWriteHelper(transform, context);
-         }
-       };
-   DirectPipelineRunner.registerDefaultTransformEvaluator(Bound.class, writeEvaluator);
- }
-
- /** Returns the create disposition. */
- public CreateDisposition getCreateDisposition() {
- return createDisposition;
- }
-
- /** Returns the write disposition. */
- public WriteDisposition getWriteDisposition() {
- return writeDisposition;
- }
-
- /** Returns the table schema. */
- public TableSchema getSchema() {
- return schema;
- }
-
- /** Returns the table reference, or {@code null} if no fixed table was set. */
- public TableReference getTable() {
- return table;
- }
-
- /** Returns {@code true} if table validation is enabled. */
- public boolean getValidate() {
- return validate;
- }
- }
-
- /** Disallow construction of utility class. */
- private Write() {}
- }
-
- /**
-  * Verifies that the dataset named by {@code table} can be fetched from BigQuery.
-  *
-  * @throws IllegalArgumentException if the lookup fails with an item-not-found I/O error
-  * @throws RuntimeException if the lookup fails for any other reason
-  */
- private static void verifyDatasetPresence(BigQueryOptions options, TableReference table) {
-   try {
-     Bigquery client = Transport.newBigQueryClient(options).build();
-     BigQueryTableRowIterator.executeWithBackOff(
-         client.datasets().get(table.getProjectId(), table.getDatasetId()),
-         RESOURCE_NOT_FOUND_ERROR, "dataset", BigQueryIO.toTableSpec(table));
-   } catch (Exception e) {
-     if (e instanceof InterruptedException) {
-       // The broad catch would otherwise swallow the interruption: restore the flag so
-       // callers further up the stack can still observe it.
-       Thread.currentThread().interrupt();
-     }
-     ApiErrorExtractor errorExtractor = new ApiErrorExtractor();
-     if ((e instanceof IOException) && errorExtractor.itemNotFound((IOException) e)) {
-       throw new IllegalArgumentException(
-           String.format(RESOURCE_NOT_FOUND_ERROR, "dataset", BigQueryIO.toTableSpec(table)),
-           e);
-     } else {
-       throw new RuntimeException(
-           String.format(UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR, "dataset",
-               BigQueryIO.toTableSpec(table)),
-           e);
-     }
-   }
- }
-
- /**
-  * Verifies that {@code table} can be fetched from BigQuery.
-  *
-  * @throws IllegalArgumentException if the lookup fails with an item-not-found I/O error
-  * @throws RuntimeException if the lookup fails for any other reason
-  */
- private static void verifyTablePresence(BigQueryOptions options, TableReference table) {
-   try {
-     Bigquery client = Transport.newBigQueryClient(options).build();
-     BigQueryTableRowIterator.executeWithBackOff(
-         client.tables().get(table.getProjectId(), table.getDatasetId(), table.getTableId()),
-         RESOURCE_NOT_FOUND_ERROR, "table", BigQueryIO.toTableSpec(table));
-   } catch (Exception e) {
-     if (e instanceof InterruptedException) {
-       // The broad catch would otherwise swallow the interruption: restore the flag so
-       // callers further up the stack can still observe it.
-       Thread.currentThread().interrupt();
-     }
-     ApiErrorExtractor errorExtractor = new ApiErrorExtractor();
-     if ((e instanceof IOException) && errorExtractor.itemNotFound((IOException) e)) {
-       throw new IllegalArgumentException(
-           String.format(RESOURCE_NOT_FOUND_ERROR, "table", BigQueryIO.toTableSpec(table)), e);
-     } else {
-       throw new RuntimeException(
-           String.format(UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR, "table",
-               BigQueryIO.toTableSpec(table)),
-           e);
-     }
-   }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * Implementation of DoFn to perform streaming BigQuery write.
- */
- @SystemDoFnInternal
- private static class StreamingWriteFn
-     extends DoFn<KV<ShardedKey<String>, TableRowInfo>, Void> {
-   /** TableSchema in JSON. Use String to make the class Serializable. */
-   private final String jsonTableSchema;
-
-   /** Rows buffered during the current bundle, grouped by destination table spec. */
-   private transient Map<String, List<TableRow>> tableRows;
-
-   /** Unique ids of the buffered rows, grouped by table spec in the same order as the rows. */
-   private transient Map<String, List<String>> uniqueIdsForTableRows;
-
-   /**
-    * Table specs for which creation has already been attempted successfully, so we don't
-    * try the creation each time. Shared by all instances of this class.
-    */
-   private static final Set<String> createdTables =
-       Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
-
-   /** Tracks bytes written, exposed as "ByteCount" Counter. */
-   private final Aggregator<Long, Long> byteCountAggregator =
-       createAggregator("ByteCount", new Sum.SumLongFn());
-
-   /**
-    * Creates a writer that uses {@code schema} when a destination table has to be created.
-    *
-    * @throws RuntimeException if the schema cannot be serialized to JSON
-    */
-   StreamingWriteFn(TableSchema schema) {
-     try {
-       jsonTableSchema = JSON_FACTORY.toString(schema);
-     } catch (IOException e) {
-       throw new RuntimeException("Cannot initialize BigQuery streaming writer.", e);
-     }
-   }
-
-   /** Resets the per-bundle buffers. No BigQuery call is made here. */
-   @Override
-   public void startBundle(Context context) {
-     tableRows = new HashMap<>();
-     uniqueIdsForTableRows = new HashMap<>();
-   }
-
-   /** Buffers the input row and its unique id under the element's table spec. */
-   @Override
-   public void processElement(ProcessContext context) {
-     String tableSpec = context.element().getKey().getKey();
-     List<TableRow> rows = getOrCreateMapListValue(tableRows, tableSpec);
-     List<String> uniqueIds = getOrCreateMapListValue(uniqueIdsForTableRows, tableSpec);
-
-     rows.add(context.element().getValue().tableRow);
-     uniqueIds.add(context.element().getValue().uniqueId);
-   }
-
-   /** Writes the accumulated rows into BigQuery with streaming API. */
-   @Override
-   public void finishBundle(Context context) throws Exception {
-     BigQueryOptions options = context.getPipelineOptions().as(BigQueryOptions.class);
-     Bigquery client = Transport.newBigQueryClient(options).build();
-
-     // Iterate over entries so each table's rows are read without a second map lookup.
-     for (Map.Entry<String, List<TableRow>> entry : tableRows.entrySet()) {
-       String tableSpec = entry.getKey();
-       TableReference tableReference = getOrCreateTable(options, tableSpec);
-       flushRows(client, tableReference, entry.getValue(),
-           uniqueIdsForTableRows.get(tableSpec));
-     }
-     tableRows.clear();
-     uniqueIdsForTableRows.clear();
-   }
-
-   /**
-    * Returns the reference parsed from {@code tableSpec}, first asking BigQuery to get or
-    * create the table (append / create-if-needed) unless that has already succeeded for
-    * this spec.
-    */
-   public TableReference getOrCreateTable(BigQueryOptions options, String tableSpec)
-       throws IOException {
-     TableReference tableReference = parseTableSpec(tableSpec);
-     if (!createdTables.contains(tableSpec)) {
-       synchronized (createdTables) {
-         // Another thread may have succeeded in creating the table in the meanwhile, so
-         // check again. This check isn't needed for correctness, but we add it to prevent
-         // every thread from attempting a create and overwhelming our BigQuery quota.
-         if (!createdTables.contains(tableSpec)) {
-           TableSchema tableSchema = JSON_FACTORY.fromString(jsonTableSchema, TableSchema.class);
-           Bigquery client = Transport.newBigQueryClient(options).build();
-           BigQueryTableInserter inserter = new BigQueryTableInserter(client);
-           inserter.getOrCreateTable(tableReference, WriteDisposition.WRITE_APPEND,
-               CreateDisposition.CREATE_IF_NEEDED, tableSchema);
-           createdTables.add(tableSpec);
-         }
-       }
-     }
-     return tableReference;
-   }
-
-   /**
-    * Sends the given rows and their ids to {@code tableReference} in one insertAll call.
-    * Does nothing for an empty row list; an {@link IOException} is rethrown unchecked.
-    */
-   private void flushRows(Bigquery client, TableReference tableReference,
-       List<TableRow> tableRows, List<String> uniqueIds) {
-     if (!tableRows.isEmpty()) {
-       try {
-         BigQueryTableInserter inserter = new BigQueryTableInserter(client);
-         inserter.insertAll(tableReference, tableRows, uniqueIds, byteCountAggregator);
-       } catch (IOException e) {
-         throw new RuntimeException(e);
-       }
-     }
-   }
- }
-
- /**
-  * An immutable pair of a key and a shard number.
-  *
-  * <p>Two instances are equal when both the key and the shard number are equal.
-  *
-  * @param <K> the type of the wrapped key
-  */
- private static class ShardedKey<K> {
-   private final K key;
-   private final int shardNumber;
-
-   /** Returns a {@code ShardedKey} for the given key and shard number. */
-   public static <K> ShardedKey<K> of(K key, int shardNumber) {
-     return new ShardedKey<K>(key, shardNumber);
-   }
-
-   private ShardedKey(K key, int shardNumber) {
-     this.key = key;
-     this.shardNumber = shardNumber;
-   }
-
-   /** Returns the wrapped key. */
-   public K getKey() {
-     return key;
-   }
-
-   /** Returns the shard number. */
-   public int getShardNumber() {
-     return shardNumber;
-   }
-
-   @Override
-   public boolean equals(Object other) {
-     if (this == other) {
-       return true;
-     }
-     if (!(other instanceof ShardedKey)) {
-       return false;
-     }
-     ShardedKey<?> that = (ShardedKey<?>) other;
-     // Fully qualified so that no additional import is needed; tolerates a null key.
-     return shardNumber == that.shardNumber && java.util.Objects.equals(key, that.key);
-   }
-
-   @Override
-   public int hashCode() {
-     return java.util.Objects.hash(key, shardNumber);
-   }
-
-   @Override
-   public String toString() {
-     return "ShardedKey{key=" + key + ", shardNumber=" + shardNumber + "}";
-   }
- }
-
- /**
- * A {@link Coder} for {@link ShardedKey}, using a wrapped key {@link Coder}.
- */
- private static class ShardedKeyCoder<KeyT>
-     extends StandardCoder<ShardedKey<KeyT>> {
-   /** Coder applied to the wrapped key. */
-   Coder<KeyT> keyCoder;
-   /** Coder applied to the shard number. */
-   VarIntCoder shardNumberCoder;
-
-   /** Returns a coder for sharded keys whose key part is encoded with {@code keyCoder}. */
-   public static <KeyT> ShardedKeyCoder<KeyT> of(Coder<KeyT> keyCoder) {
-     return new ShardedKeyCoder<>(keyCoder);
-   }
-
-   /** JSON factory method: expects exactly one component coder, the key coder. */
-   @JsonCreator
-   public static <KeyT> ShardedKeyCoder<KeyT> of(
-       @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
-       List<Coder<KeyT>> components) {
-     final int componentCount = components.size();
-     Preconditions.checkArgument(componentCount == 1,
-         "Expecting 1 component, got " + componentCount);
-     Coder<KeyT> onlyComponent = components.get(0);
-     return of(onlyComponent);
-   }
-
-   protected ShardedKeyCoder(Coder<KeyT> keyCoder) {
-     this.shardNumberCoder = VarIntCoder.of();
-     this.keyCoder = keyCoder;
-   }
-
-   @Override
-   public List<? extends Coder<?>> getCoderArguments() {
-     return Arrays.asList(keyCoder);
-   }
-
-   /** Writes the key (in the nested context) followed by the shard number. */
-   @Override
-   public void encode(ShardedKey<KeyT> key, OutputStream outStream, Context context)
-       throws IOException {
-     Context keyContext = context.nested();
-     keyCoder.encode(key.getKey(), outStream, keyContext);
-     shardNumberCoder.encode(key.getShardNumber(), outStream, context);
-   }
-
-   /** Reads the key (in the nested context) followed by the shard number. */
-   @Override
-   public ShardedKey<KeyT> decode(InputStream inStream, Context context)
-       throws IOException {
-     KeyT decodedKey = keyCoder.decode(inStream, context.nested());
-     int decodedShard = shardNumberCoder.decode(inStream, context);
-     return new ShardedKey<KeyT>(decodedKey, decodedShard);
-   }
-
-   /** Deterministic exactly when the key coder is. */
-   @Override
-   public void verifyDeterministic() throws NonDeterministicException {
-     keyCoder.verifyDeterministic();
-   }
- }
-
- /** Coder for {@link TableRowInfo}: the table row followed by its unique id. */
- private static class TableRowInfoCoder extends AtomicCoder<TableRowInfo> {
-   private static final TableRowInfoCoder INSTANCE = new TableRowInfoCoder();
-
-   /** Coder applied to the row part. */
-   TableRowJsonCoder tableRowCoder = TableRowJsonCoder.of();
-   /** Coder applied to the unique-id part. */
-   StringUtf8Coder idCoder = StringUtf8Coder.of();
-
-   /** Returns the shared instance. */
-   @JsonCreator
-   public static TableRowInfoCoder of() {
-     return INSTANCE;
-   }
-
-   /** Writes the row and then the id, both in the nested context. Rejects null values. */
-   @Override
-   public void encode(TableRowInfo value, OutputStream outStream, Context context)
-       throws IOException {
-     if (value == null) {
-       throw new CoderException("cannot encode a null value");
-     }
-     tableRowCoder.encode(value.tableRow, outStream, context.nested());
-     idCoder.encode(value.uniqueId, outStream, context.nested());
-   }
-
-   /** Reads the row and then the id, both in the nested context. */
-   @Override
-   public TableRowInfo decode(InputStream inStream, Context context)
-       throws IOException {
-     TableRow decodedRow = tableRowCoder.decode(inStream, context.nested());
-     String decodedId = idCoder.decode(inStream, context.nested());
-     return new TableRowInfo(decodedRow, decodedId);
-   }
-
-   /** Always fails: this coder is declared non-deterministic. */
-   @Override
-   public void verifyDeterministic() throws NonDeterministicException {
-     throw new NonDeterministicException(this, "TableRows are not deterministic.");
-   }
- }
-
- /** Immutable pairing of a table row with the unique id assigned to it. */
- private static class TableRowInfo {
-   final TableRow tableRow;
-   final String uniqueId;
-
-   TableRowInfo(TableRow tableRow, String uniqueId) {
-     this.uniqueId = uniqueId;
-     this.tableRow = tableRow;
-   }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * Fn that tags each table row with a unique id and destination table.
- * To avoid calling UUID.randomUUID() for each element, which can be costly,
- * a randomUUID is generated only once per bundle of data. The actual unique
- * id is created by concatenating this randomUUID with a sequential number.
- */
- private static class TagWithUniqueIdsAndTable
-     extends DoFn<TableRow, KV<ShardedKey<String>, TableRowInfo>>
-     implements DoFn.RequiresWindowAccess {
-   /** Fixed table spec to write to, or {@code null} when a table function is used. */
-   private final String tableSpec;
-
-   /** User function mapping windows to a {@link TableReference}; may be {@code null}. */
-   private final SerializableFunction<BoundedWindow, TableReference> tableRefFunction;
-
-   /** Prefix of the generated ids; regenerated at the start of every bundle. */
-   private transient String randomUUID;
-   /** Counter appended to the prefix; incremented for every element. */
-   private transient long sequenceNo = 0L;
-
-   /**
-    * Exactly one of {@code table} and {@code tableRefFunction} must be non-null. A fixed
-    * table without a project id is given the project from {@code options}.
-    */
-   TagWithUniqueIdsAndTable(BigQueryOptions options, TableReference table,
-       SerializableFunction<BoundedWindow, TableReference> tableRefFunction) {
-     final boolean exactlyOneSet = (table == null) != (tableRefFunction == null);
-     Preconditions.checkArgument(exactlyOneSet,
-         "Exactly one of table or tableRefFunction should be set");
-     if (table == null) {
-       this.tableSpec = null;
-     } else {
-       if (table.getProjectId() == null) {
-         table.setProjectId(options.as(BigQueryOptions.class).getProject());
-       }
-       this.tableSpec = toTableSpec(table);
-     }
-     this.tableRefFunction = tableRefFunction;
-   }
-
-   @Override
-   public void startBundle(Context context) {
-     randomUUID = UUID.randomUUID().toString();
-   }
-
-   /** Tags the input row with a unique id and a (table spec, random shard) key. */
-   @Override
-   public void processElement(ProcessContext context) throws IOException {
-     final String rowId = randomUUID + sequenceNo++;
-     final ThreadLocalRandom random = ThreadLocalRandom.current();
-     final BigQueryOptions options = context.getPipelineOptions().as(BigQueryOptions.class);
-     final String destinationSpec = tableSpecFromWindow(options, context.window());
-     // Spread the rows of one table over shard numbers 0..49 so that there is enough
-     // batching for BigQuery.
-     final ShardedKey<String> shardedKey =
-         ShardedKey.of(destinationSpec, random.nextInt(0, 50));
-     context.output(KV.of(shardedKey, new TableRowInfo(context.element(), rowId)));
-   }
-
-   /** Returns the fixed table spec if there is one, else the spec computed for the window. */
-   private String tableSpecFromWindow(BigQueryOptions options, BoundedWindow window) {
-     if (tableSpec != null) {
-       return tableSpec;
-     }
-     final TableReference windowTable = tableRefFunction.apply(window);
-     if (windowTable.getProjectId() == null) {
-       windowTable.setProjectId(options.getProject());
-     }
-     return toTableSpec(windowTable);
-   }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * PTransform that performs streaming BigQuery write. To increase consistency,
- * it leverages BigQuery best effort de-dup mechanism.
- */
- private static class StreamWithDeDup extends PTransform<PCollection<TableRow>, PDone> {
-   private final transient TableReference tableReference;
-   private final SerializableFunction<BoundedWindow, TableReference> tableRefFunction;
-   private final transient TableSchema tableSchema;
-
-   /** Stores the destination (fixed table or per-window function) and the table schema. */
-   StreamWithDeDup(TableReference tableReference,
-       SerializableFunction<BoundedWindow, TableReference> tableRefFunction,
-       TableSchema tableSchema) {
-     this.tableSchema = tableSchema;
-     this.tableRefFunction = tableRefFunction;
-     this.tableReference = tableReference;
-   }
-
-   @Override
-   protected Coder<Void> getDefaultOutputCoder() {
-     return VoidCoder.of();
-   }
-
-   @Override
-   public PDone apply(PCollection<TableRow> input) {
-     // Streaming rows straight to BigQuery could occasionally duplicate data, e.g. when
-     // a VM running this code is restarted and the code is re-run. That risk is mitigated
-     // by relying on BigQuery's built-in best effort de-dup mechanism: every input
-     // TableRow is tagged with a generated unique id, which is passed to BigQuery and
-     // used to ignore duplicates.
-     final BigQueryOptions options =
-         input.getPipeline().getOptions().as(BigQueryOptions.class);
-     final PCollection<KV<ShardedKey<String>, TableRowInfo>> tagged = input.apply(
-         ParDo.of(new TagWithUniqueIdsAndTable(options, tableReference, tableRefFunction)));
-
-     // So that the same TableRow is not processed again with a regenerated, different
-     // unique id, the ids are "checkpointed": StreamingWriteFn immediately follows a GBK,
-     // which is performed by Reshuffle.
-     final KvCoder<ShardedKey<String>, TableRowInfo> taggedCoder =
-         KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowInfoCoder.of());
-     tagged
-         .setCoder(taggedCoder)
-         .apply(Reshuffle.<ShardedKey<String>, TableRowInfo>of())
-         .apply(ParDo.of(new StreamingWriteFn(tableSchema)));
-
-     // Returning PDone here breaks the implicit assumption about job execution order:
-     // a user PTransform that takes this PDone as its input is not necessarily executed
-     // after the BigQueryIO.Write.
-     return PDone.in(input.getPipeline());
-   }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /** Disallow construction of utility class. */
- private BigQueryIO() {}
-
- /**
- * Direct mode read evaluator.
- *
- * <p>This loads the entire table into an in-memory PCollection.
- */
- private static void evaluateReadHelper(
-     Read.Bound transform, DirectPipelineRunner.EvaluationContext context) {
-   BigQueryOptions options = context.getPipelineOptions();
-   Bigquery client = Transport.newBigQueryClient(options).build();
-   // A table given without a project id is assigned the project from the options.
-   final boolean tableLacksProject =
-       transform.table != null && transform.table.getProjectId() == null;
-   if (tableLacksProject) {
-     transform.table.setProjectId(options.getProject());
-   }
-
-   // A configured query takes precedence over a configured table.
-   final BigQueryTableRowIterator iterator;
-   if (transform.query == null) {
-     LOG.info("Reading from BigQuery table {}", toTableSpec(transform.table));
-     iterator = BigQueryTableRowIterator.fromTable(transform.table, client);
-   } else {
-     LOG.info("Reading from BigQuery query {}", transform.query);
-     iterator =
-         BigQueryTableRowIterator.fromQuery(
-             transform.query, options.getProject(), client, transform.getFlattenResults());
-   }
-
-   // The resource variable exists only so that the iterator is closed on exit.
-   try (BigQueryTableRowIterator ignored = iterator) {
-     final List<TableRow> rows = new ArrayList<>();
-     iterator.open();
-     while (iterator.advance()) {
-       rows.add(iterator.getCurrent());
-     }
-     LOG.info("Number of records read from BigQuery: {}", rows.size());
-     context.setPCollection(context.getOutput(transform), rows);
-   } catch (IOException | InterruptedException e) {
-     throw new RuntimeException(e);
-   }
- }
-
- /**
-  * Returns the list stored under {@code key}. When the map holds no list for the key (or
-  * holds {@code null}), a new empty list is stored under the key and returned.
-  */
- private static <K, V> List<V> getOrCreateMapListValue(Map<K, List<V>> map, K key) {
-   final List<V> existing = map.get(key);
-   if (existing != null) {
-     return existing;
-   }
-   final List<V> created = new ArrayList<>();
-   map.put(key, created);
-   return created;
- }
-
- /**
- * Direct mode write evaluator.
- *
- * <p>This writes the entire table in a single BigQuery request.
- * The table will be created if necessary.
- */
- private static void evaluateWriteHelper(
-     Write.Bound transform, DirectPipelineRunner.EvaluationContext context) {
-   BigQueryOptions options = context.getPipelineOptions();
-   Bigquery client = Transport.newBigQueryClient(options).build();
-   BigQueryTableInserter inserter = new BigQueryTableInserter(client);
-
-   try {
-     // First pass: group every input row by destination table, once per window it is in.
-     final Map<TableReference, List<TableRow>> rowsByTable = new HashMap<>();
-     for (WindowedValue<TableRow> windowedValue : context.getPCollectionWindowedValues(
-         context.getInput(transform))) {
-       for (BoundedWindow window : windowedValue.getWindows()) {
-         final TableReference ref = transform.tableRefFunction != null
-             ? transform.tableRefFunction.apply(window)
-             : transform.table;
-         if (ref.getProjectId() == null) {
-           ref.setProjectId(options.getProject());
-         }
-
-         getOrCreateMapListValue(rowsByTable, ref).add(windowedValue.getValue());
-       }
-     }
-
-     // Second pass: prepare each destination table and write its rows.
-     for (Map.Entry<TableReference, List<TableRow>> destination : rowsByTable.entrySet()) {
-       final TableReference ref = destination.getKey();
-       LOG.info("Writing to BigQuery table {}", toTableSpec(ref));
-       // BigQueryTableInserter#getOrCreateTable validates the CreateDisposition and the
-       // WriteDisposition. For each TableReference, it can only be called before rows
-       // are written.
-       inserter.getOrCreateTable(
-           ref, transform.writeDisposition, transform.createDisposition, transform.schema);
-       inserter.insertAll(ref, destination.getValue());
-     }
-   } catch (IOException e) {
-     throw new RuntimeException(e);
-   }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BlockBasedSource.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BlockBasedSource.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BlockBasedSource.java
deleted file mode 100644
index f4a9c7d..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BlockBasedSource.java
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-
-import java.io.IOException;
-import java.util.NoSuchElementException;
-
-import javax.annotation.Nullable;
-
-/**
- * A {@code BlockBasedSource} is a {@link FileBasedSource} where a file consists of blocks of
- * records.
- *
- * <p>{@code BlockBasedSource} should be derived from when a file format does not support efficient
- * seeking to a record in the file, but can support efficient seeking to a block. Alternatively,
- * records in the file cannot be offset-addressed, but blocks can (it is not possible to say
- * that record {@code i} starts at offset {@code m}, but it is possible to say that block {@code j}
- * starts at offset {@code n}).
- *
- * <p>The records that will be read from a {@code BlockBasedSource} that corresponds to a subrange
- * of a file {@code [startOffset, endOffset)} are those records such that the record is contained in
- * a block that starts at offset {@code i}, where {@code i >= startOffset} and
- * {@code i < endOffset}. In other words, a record will be read from the source if its first byte is
- * contained in a block that begins within the range described by the source.
- *
- * <p>This entails that it is possible to determine the start offsets of all blocks in a file.
- *
- * <p>Progress reporting for reading from a {@code BlockBasedSource} is inaccurate. A {@link
- * BlockBasedReader} reports its current offset as {@code (offset of current block) + (current block
- * size) * (fraction of block consumed)}. However, only the offset of the current block is required
- * to be accurately reported by subclass implementations. As such, in the worst case, the current
- * offset is only updated at block boundaries.
- *
- * <p>{@code BlockBasedSource} supports dynamic splitting. However, because records in a {@code
- * BlockBasedSource} are not required to have offsets and progress reporting is inaccurate, {@code
- * BlockBasedReader} only supports splitting at block boundaries.
- * In other words, {@link BlockBasedReader#atSplitPoint} returns true iff the current record is the
- * first record in a block. See {@link FileBasedSource.FileBasedReader} for discussion about split
- * points.
- *
- * @param <T> The type of records to be read from the source.
- */
-@Experimental(Experimental.Kind.SOURCE_SINK)
-public abstract class BlockBasedSource<T> extends FileBasedSource<T> {
- /**
- * Creates a {@code BlockBasedSource} based on a file name or pattern. Subclasses must call this
- * constructor when creating a {@code BlockBasedSource} for a file pattern. See
- * {@link FileBasedSource} for more information.
- */
- public BlockBasedSource(String fileOrPatternSpec, long minBundleSize) {
- super(fileOrPatternSpec, minBundleSize);
- }
-
- /**
- * Creates a {@code BlockBasedSource} for a single file. Subclasses must call this constructor
- * when implementing {@link BlockBasedSource#createForSubrangeOfFile}. See documentation in
- * {@link FileBasedSource}.
- */
- public BlockBasedSource(String fileName, long minBundleSize, long startOffset, long endOffset) {
- super(fileName, minBundleSize, startOffset, endOffset);
- }
-
- /**
- * Creates a {@code BlockBasedSource} for the specified range in a single file.
- */
- @Override
- protected abstract BlockBasedSource<T> createForSubrangeOfFile(
- String fileName, long start, long end);
-
- /**
- * Creates a {@code BlockBasedReader}.
- */
- @Override
- protected abstract BlockBasedReader<T> createSingleFileReader(PipelineOptions options);
-
- /**
- * A {@code Block} represents a block of records that can be read.
- */
- @Experimental(Experimental.Kind.SOURCE_SINK)
- protected abstract static class Block<T> {
- /**
- * Returns the current record.
- */
- public abstract T getCurrentRecord();
-
- /**
- * Reads the next record from the block and returns true iff one exists.
- */
- public abstract boolean readNextRecord() throws IOException;
-
- /**
- * Returns the fraction of the block already consumed, if possible, as a value in
- * {@code [0, 1]}. It should not include the current record. Successive results from this method
- * must be monotonically increasing.
- *
- * <p>If it is not possible to compute the fraction of the block consumed this method may
- * return zero. For example, when the total number of records in the block is unknown.
- */
- public abstract double getFractionOfBlockConsumed();
- }
-
- /**
- * A {@code Reader} that reads records from a {@link BlockBasedSource}. If the source is a
- * subrange of a file, the blocks that will be read by this reader are those such that the first
- * byte of the block is within the range {@code [start, end)}.
- */
- @Experimental(Experimental.Kind.SOURCE_SINK)
- protected abstract static class BlockBasedReader<T> extends FileBasedReader<T> {
- private boolean atSplitPoint;
-
- protected BlockBasedReader(BlockBasedSource<T> source) {
- super(source);
- }
-
- /**
- * Read the next block from the input.
- */
- public abstract boolean readNextBlock() throws IOException;
-
- /**
- * Returns the current block (the block that was read by the last successful call to
- * {@link BlockBasedReader#readNextBlock}). May return null initially, or if no block has been
- * successfully read.
- */
- @Nullable
- public abstract Block<T> getCurrentBlock();
-
- /**
- * Returns the size of the current block in bytes as it is represented in the underlying file,
- * if possible. This method may return {@code 0} if the size of the current block is unknown.
- *
- * <p>The size returned by this method must be such that for two successive blocks A and B,
- * {@code offset(A) + size(A) <= offset(B)}. If this is not satisfied, the progress reported
- * by the {@code BlockBasedReader} will be non-monotonic and will interfere with the quality
- * (but not correctness) of dynamic work rebalancing.
- *
- * <p>This method and {@link Block#getFractionOfBlockConsumed} are used to provide an estimate
- * of progress within a block ({@code getCurrentBlock().getFractionOfBlockConsumed() *
- * getCurrentBlockSize()}). It is acceptable for the result of this computation to be {@code 0},
- * but progress estimation will be inaccurate.
- */
- public abstract long getCurrentBlockSize();
-
- /**
- * Returns the largest offset such that starting to read from that offset includes the current
- * block.
- */
- public abstract long getCurrentBlockOffset();
-
- @Override
- public final T getCurrent() throws NoSuchElementException {
-   final Block<T> block = getCurrentBlock();
-   if (block != null) {
-     return block.getCurrentRecord();
-   }
-   // Without a current block there is no record to hand out.
-   throw new NoSuchElementException(
-       "No block has been successfully read from " + getCurrentSource());
- }
-
- /**
- * Returns true if the reader is at a split point. A {@code BlockBasedReader} is at a split
- * point if the current record is the first record in a block. In other words, split points
- * are block boundaries.
- */
- @Override
- protected boolean isAtSplitPoint() {
- return atSplitPoint;
- }
-
- /**
- * Reads the next record from the {@link #getCurrentBlock() current block} if
- * possible. Will call {@link #readNextBlock()} to advance to the next block if not.
- *
- * <p>The first record read from a block is treated as a split point.
- */
- @Override
- protected final boolean readNextRecord() throws IOException {
-   atSplitPoint = false;
-
-   for (;;) {
-     // Done as soon as the current block (if any) yields another record.
-     if (getCurrentBlock() != null && getCurrentBlock().readNextRecord()) {
-       return true;
-     }
-     // Otherwise advance to the next block; when there is none, the input is exhausted.
-     if (!readNextBlock()) {
-       return false;
-     }
-     // The first record in a block is a split point.
-     atSplitPoint = true;
-   }
- }
-
- /**
-  * Estimates the fraction of the source's offset range consumed so far: the fraction at
-  * the start of the current block, plus the consumed fraction of that block scaled by the
-  * block's share of the range. The result is capped at {@code 1.0}.
-  *
-  * @return {@code null} if the end offset is {@code Long.MAX_VALUE}, {@code 0.0} if there
-  *     is no current block, otherwise the estimate described above
-  */
- @Override
- public Double getFractionConsumed() {
- // NOTE(review): Long.MAX_VALUE presumably marks an unknown end offset -- confirm.
- if (getCurrentSource().getEndOffset() == Long.MAX_VALUE) {
- return null;
- }
- Block<T> currentBlock = getCurrentBlock();
- if (currentBlock == null) {
- // There is no current block (i.e., the read has not yet begun).
- return 0.0;
- }
- long currentBlockOffset = getCurrentBlockOffset();
- long startOffset = getCurrentSource().getStartOffset();
- long endOffset = getCurrentSource().getEndOffset();
- // Position of the block's first byte, as a fraction of the whole range.
- double fractionAtBlockStart =
- ((double) (currentBlockOffset - startOffset)) / (endOffset - startOffset);
- // Position just past the block, as a fraction of the whole range.
- double fractionAtBlockEnd =
- ((double) (currentBlockOffset + getCurrentBlockSize() - startOffset)
- / (endOffset - startOffset));
- // Interpolate within the block, then cap the estimate at 1.0.
- return Math.min(
- 1.0,
- fractionAtBlockStart
- + currentBlock.getFractionOfBlockConsumed()
- * (fractionAtBlockEnd - fractionAtBlockStart));
- }
-
- @Override
- protected long getCurrentOffset() {
- return getCurrentBlockOffset();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BoundedReadFromUnboundedSource.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BoundedReadFromUnboundedSource.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BoundedReadFromUnboundedSource.java
deleted file mode 100644
index 52c730c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BoundedReadFromUnboundedSource.java
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import static com.google.cloud.dataflow.sdk.util.StringUtils.approximateSimpleName;
-
-import com.google.api.client.util.BackOff;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.RemoveDuplicates;
-import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
-import com.google.cloud.dataflow.sdk.util.IntervalBoundedExponentialBackOff;
-import com.google.cloud.dataflow.sdk.util.ValueWithRecordId;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PInput;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.NoSuchElementException;
-
-
-/**
- * {@link PTransform} that reads a bounded amount of data from an {@link UnboundedSource},
- * specified as one or both of a maximum number of elements or a maximum period of time to read.
- *
- * <p>Created by {@link Read}.
- */
-class BoundedReadFromUnboundedSource<T> extends PTransform<PInput, PCollection<T>> {
-  private final UnboundedSource<T, ?> source;
-  private final long maxNumRecords;
-  private final Duration maxReadTime;
-
-  /**
-   * Returns a new {@link BoundedReadFromUnboundedSource} that reads a bounded amount
-   * of data from the given {@link UnboundedSource}. The bound is specified as a number
-   * of records to read.
-   *
-   * <p>This may take a long time to execute if the splits of this source are slow to read
-   * records.
-   */
-  public BoundedReadFromUnboundedSource<T> withMaxNumRecords(long maxNumRecords) {
-    return new BoundedReadFromUnboundedSource<T>(source, maxNumRecords, maxReadTime);
-  }
-
-  /**
-   * Returns a new {@link BoundedReadFromUnboundedSource} that reads a bounded amount
-   * of data from the given {@link UnboundedSource}. The bound is specified as an amount
-   * of time to read for. Each split of the source will read for this much time.
-   */
-  public BoundedReadFromUnboundedSource<T> withMaxReadTime(Duration maxReadTime) {
-    return new BoundedReadFromUnboundedSource<T>(source, maxNumRecords, maxReadTime);
-  }
-
-  /**
-   * Creates the transform.
-   *
-   * @param source the unbounded source to read from
-   * @param maxNumRecords the maximum number of records to read in total
-   * @param maxReadTime the maximum time each split reads for, or {@code null} for no time bound
-   */
-  BoundedReadFromUnboundedSource(
-      UnboundedSource<T, ?> source, long maxNumRecords, Duration maxReadTime) {
-    this.source = source;
-    this.maxNumRecords = maxNumRecords;
-    this.maxReadTime = maxReadTime;
-  }
-
-  /**
-   * Reads through an adapter that presents the unbounded source as a bounded one,
-   * de-duplicates by record id when the source asks for it, and strips the record ids.
-   */
-  @Override
-  public PCollection<T> apply(PInput input) {
-    PCollection<ValueWithRecordId<T>> read = Pipeline.applyTransform(input,
-        Read.from(new UnboundedToBoundedSourceAdapter<>(source, maxNumRecords, maxReadTime)));
-    if (source.requiresDeduping()) {
-      read = read.apply(RemoveDuplicates.withRepresentativeValueFn(
-          new SerializableFunction<ValueWithRecordId<T>, byte[]>() {
-            @Override
-            public byte[] apply(ValueWithRecordId<T> input) {
-              return input.getId();
-            }
-          }));
-    }
-    return read.apply(ValueWithRecordId.<T>stripIds());
-  }
-
-  @Override
-  protected Coder<T> getDefaultOutputCoder() {
-    return source.getDefaultOutputCoder();
-  }
-
-  @Override
-  public String getKindString() {
-    return "Read(" + approximateSimpleName(source.getClass()) + ")";
-  }
-
-  /**
-   * Adapts an {@link UnboundedSource} to the {@link BoundedSource} interface by stopping
-   * after {@code maxNumRecords} records or {@code maxReadTime}, whichever is reached first.
-   */
-  private static class UnboundedToBoundedSourceAdapter<T>
-      extends BoundedSource<ValueWithRecordId<T>> {
-    private final UnboundedSource<T, ?> source;
-    private final long maxNumRecords;
-    private final Duration maxReadTime;
-
-    private UnboundedToBoundedSourceAdapter(
-        UnboundedSource<T, ?> source, long maxNumRecords, Duration maxReadTime) {
-      this.source = source;
-      this.maxNumRecords = maxNumRecords;
-      this.maxReadTime = maxReadTime;
-    }
-
-    /**
-     * Divide the given number of records into {@code numSplits} approximately
-     * equal parts that sum to {@code numRecords}.
-     *
-     * <p>Returns an empty array when {@code numSplits} is zero, since there is nothing to
-     * distribute the records to.
-     */
-    private static long[] splitNumRecords(long numRecords, int numSplits) {
-      long[] splitNumRecords = new long[numSplits];
-      if (numSplits == 0) {
-        // No splits to distribute to; returning here also avoids dividing by zero below.
-        return splitNumRecords;
-      }
-      long base = numRecords / numSplits;
-      long remainder = numRecords % numSplits;
-      for (int i = 0; i < numSplits; i++) {
-        // The first `remainder` splits each take one extra record.
-        splitNumRecords[i] = (i < remainder) ? base + 1 : base;
-      }
-      return splitNumRecords;
-    }
-
-    /**
-     * Pick a number of initial splits based on the number of records expected to be processed:
-     * one split per 10000 records (rounded up by one), capped at 100 splits.
-     */
-    private static int numInitialSplits(long numRecords) {
-      final int maxSplits = 100;
-      final long recordsPerSplit = 10000;
-      return (int) Math.min(maxSplits, numRecords / recordsPerSplit + 1);
-    }
-
-    /**
-     * Asks the wrapped source for its initial splits and wraps each one in an adapter that
-     * carries its share of {@code maxNumRecords}; every split keeps the full
-     * {@code maxReadTime}. {@code desiredBundleSizeBytes} is not used.
-     */
-    @Override
-    public List<? extends BoundedSource<ValueWithRecordId<T>>> splitIntoBundles(
-        long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
-      List<UnboundedToBoundedSourceAdapter<T>> result = new ArrayList<>();
-      int numInitialSplits = numInitialSplits(maxNumRecords);
-      List<? extends UnboundedSource<T, ?>> splits =
-          source.generateInitialSplits(numInitialSplits, options);
-      int numSplits = splits.size();
-      long[] numRecords = splitNumRecords(maxNumRecords, numSplits);
-      for (int i = 0; i < numSplits; i++) {
-        result.add(
-            new UnboundedToBoundedSourceAdapter<T>(splits.get(i), numRecords[i], maxReadTime));
-      }
-      return result;
-    }
-
-    @Override
-    public long getEstimatedSizeBytes(PipelineOptions options) {
-      // No way to estimate bytes, so returning 0.
-      return 0L;
-    }
-
-    @Override
-    public boolean producesSortedKeys(PipelineOptions options) {
-      return false;
-    }
-
-    @Override
-    public Coder<ValueWithRecordId<T>> getDefaultOutputCoder() {
-      return ValueWithRecordId.ValueWithRecordIdCoder.of(source.getDefaultOutputCoder());
-    }
-
-    @Override
-    public void validate() {
-      source.validate();
-    }
-
-    @Override
-    public BoundedReader<ValueWithRecordId<T>> createReader(PipelineOptions options) {
-      return new Reader(source.createReader(options, null));
-    }
-
-    /**
-     * Reader that delegates to an {@link UnboundedSource.UnboundedReader} and reports end of
-     * input once the record or time bound of the enclosing adapter is reached.
-     */
-    private class Reader extends BoundedReader<ValueWithRecordId<T>> {
-      private long recordsRead;
-      /** Instant after which no further reads are attempted; {@code null} for no time bound. */
-      private final Instant endTime;
-      private final UnboundedSource.UnboundedReader<T> reader;
-
-      private Reader(UnboundedSource.UnboundedReader<T> reader) {
-        this.recordsRead = 0L;
-        this.endTime = (maxReadTime != null) ? Instant.now().plus(maxReadTime) : null;
-        this.reader = reader;
-      }
-
-      @Override
-      public boolean start() throws IOException {
-        // A non-positive record bound or a zero time bound means there is nothing to read.
-        if (maxNumRecords <= 0 || (maxReadTime != null && maxReadTime.getMillis() == 0)) {
-          return false;
-        }
-
-        recordsRead++;
-        if (reader.start()) {
-          return true;
-        } else {
-          return advanceWithBackoff();
-        }
-      }
-
-      @Override
-      public boolean advance() throws IOException {
-        if (recordsRead >= maxNumRecords) {
-          finalizeCheckpoint();
-          return false;
-        }
-        recordsRead++;
-        return advanceWithBackoff();
-      }
-
-      /**
-       * Polls the underlying reader until it produces a record, the time bound passes, or the
-       * backoff policy gives up. The checkpoint is finalized whenever this returns false.
-       *
-       * @throws java.io.InterruptedIOException if the thread is interrupted while sleeping
-       *     between attempts; the thread's interrupt flag is restored before throwing
-       */
-      private boolean advanceWithBackoff() throws IOException {
-        // Try reading from the source with exponential backoff.
-        // NOTE(review): the meaning of the (10000, 10) arguments is defined by
-        // IntervalBoundedExponentialBackOff, which is not visible here - confirm before tuning.
-        BackOff backoff = new IntervalBoundedExponentialBackOff(10000, 10);
-        long nextSleep = backoff.nextBackOffMillis();
-        while (nextSleep != BackOff.STOP) {
-          if (endTime != null && Instant.now().isAfter(endTime)) {
-            finalizeCheckpoint();
-            return false;
-          }
-          if (reader.advance()) {
-            return true;
-          }
-          try {
-            Thread.sleep(nextSleep);
-          } catch (InterruptedException e) {
-            // Do not swallow the interrupt: restore the flag and abort the read.
-            Thread.currentThread().interrupt();
-            java.io.InterruptedIOException interrupted = new java.io.InterruptedIOException(
-                "Interrupted while waiting for more records from the unbounded source");
-            interrupted.initCause(e);
-            throw interrupted;
-          }
-          nextSleep = backoff.nextBackOffMillis();
-        }
-        finalizeCheckpoint();
-        return false;
-      }
-
-      private void finalizeCheckpoint() throws IOException {
-        reader.getCheckpointMark().finalizeCheckpoint();
-      }
-
-      @Override
-      public ValueWithRecordId<T> getCurrent() throws NoSuchElementException {
-        return new ValueWithRecordId<>(reader.getCurrent(), reader.getCurrentRecordId());
-      }
-
-      @Override
-      public Instant getCurrentTimestamp() throws NoSuchElementException {
-        return reader.getCurrentTimestamp();
-      }
-
-      @Override
-      public void close() throws IOException {
-        reader.close();
-      }
-
-      @Override
-      public BoundedSource<ValueWithRecordId<T>> getCurrentSource() {
-        return UnboundedToBoundedSourceAdapter.this;
-      }
-    }
-  }
-}
[36/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/ProxyInvocationHandler.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/ProxyInvocationHandler.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/ProxyInvocationHandler.java
deleted file mode 100644
index 527f712..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/ProxyInvocationHandler.java
+++ /dev/null
@@ -1,441 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory.JsonIgnorePredicate;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory.Registration;
-import com.google.cloud.dataflow.sdk.util.InstanceBuilder;
-import com.google.cloud.dataflow.sdk.util.common.ReflectHelpers;
-import com.google.common.base.Defaults;
-import com.google.common.base.Function;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.ClassToInstanceMap;
-import com.google.common.collect.FluentIterable;
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.Maps;
-import com.google.common.collect.MutableClassToInstanceMap;
-
-import com.fasterxml.jackson.annotation.JsonIgnore;
-import com.fasterxml.jackson.core.JsonGenerator;
-import com.fasterxml.jackson.core.JsonParser;
-import com.fasterxml.jackson.core.JsonProcessingException;
-import com.fasterxml.jackson.databind.DeserializationContext;
-import com.fasterxml.jackson.databind.JavaType;
-import com.fasterxml.jackson.databind.JsonDeserializer;
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.JsonSerializer;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.SerializerProvider;
-import com.fasterxml.jackson.databind.node.ObjectNode;
-
-import java.beans.PropertyDescriptor;
-import java.io.IOException;
-import java.lang.annotation.Annotation;
-import java.lang.reflect.InvocationHandler;
-import java.lang.reflect.Method;
-import java.lang.reflect.Proxy;
-import java.lang.reflect.Type;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.SortedMap;
-import java.util.TreeMap;
-
-import javax.annotation.concurrent.ThreadSafe;
-
-/**
- * Represents an {@link InvocationHandler} for a {@link Proxy}. The invocation handler uses bean
- * introspection of the proxy class to store and retrieve values based off of the property name.
- *
- * <p>Unset properties use the {@code @Default} metadata on the getter to return values. If there
- * is no {@code @Default} annotation on the getter, then a <a
- * href="https://docs.oracle.com/javase/tutorial/java/nutsandbolts/datatypes.html">default</a> as
- * per the Java Language Specification for the expected return type is returned.
- *
- * <p>In addition to the getter/setter pairs, this proxy invocation handler supports
- * {@link Object#equals(Object)}, {@link Object#hashCode()}, {@link Object#toString()},
- * {@link PipelineOptions#as(Class)} and {@link PipelineOptions#cloneAs(Class)}.
- */
-@ThreadSafe
-class ProxyInvocationHandler implements InvocationHandler {
-  private static final ObjectMapper MAPPER = new ObjectMapper();
-  /**
-   * No two instances of this class are considered equivalent hence we generate a random hash code
-   * between 0 and {@link Integer#MAX_VALUE}.
-   */
-  private final int hashCode = (int) (Math.random() * Integer.MAX_VALUE);
-  private final Set<Class<? extends PipelineOptions>> knownInterfaces;
-  private final ClassToInstanceMap<PipelineOptions> interfaceToProxyCache;
-  /** Property name to value, for values that were set or have already been materialized. */
-  private final Map<String, Object> options;
-  /** Property name to raw JSON, for values received through deserialization. */
-  private final Map<String, JsonNode> jsonOptions;
-  private final Map<String, String> gettersToPropertyNames;
-  private final Map<String, String> settersToPropertyNames;
-
-  ProxyInvocationHandler(Map<String, Object> options) {
-    this(options, Maps.<String, JsonNode>newHashMap());
-  }
-
-  private ProxyInvocationHandler(Map<String, Object> options, Map<String, JsonNode> jsonOptions) {
-    this.options = options;
-    this.jsonOptions = jsonOptions;
-    this.knownInterfaces = new HashSet<>(PipelineOptionsFactory.getRegisteredOptions());
-    gettersToPropertyNames = Maps.newHashMap();
-    settersToPropertyNames = Maps.newHashMap();
-    interfaceToProxyCache = MutableClassToInstanceMap.create();
-  }
-
-  /**
-   * Dispatches a call made on the proxy: first the built-in methods ({@code toString},
-   * {@code equals}, {@code hashCode}, {@code as}, {@code cloneAs}), then the known getters and
-   * setters.
-   *
-   * @throws RuntimeException if the method is none of the above
-   */
-  @Override
-  public Object invoke(Object proxy, Method method, Object[] args) {
-    String methodName = method.getName();
-    if (args == null && "toString".equals(methodName)) {
-      return toString();
-    } else if (args != null && args.length == 1 && "equals".equals(methodName)) {
-      return equals(args[0]);
-    } else if (args == null && "hashCode".equals(methodName)) {
-      return hashCode();
-    } else if (args != null && args.length == 1 && "as".equals(methodName)
-        && args[0] instanceof Class) {
-      @SuppressWarnings("unchecked")
-      Class<? extends PipelineOptions> clazz = (Class<? extends PipelineOptions>) args[0];
-      return as(clazz);
-    } else if (args != null && args.length == 1 && "cloneAs".equals(methodName)
-        && args[0] instanceof Class) {
-      @SuppressWarnings("unchecked")
-      Class<? extends PipelineOptions> clazz = (Class<? extends PipelineOptions>) args[0];
-      return cloneAs(proxy, clazz);
-    }
-    synchronized (this) {
-      if (gettersToPropertyNames.containsKey(methodName)) {
-        String propertyName = gettersToPropertyNames.get(methodName);
-        if (!options.containsKey(propertyName)) {
-          // Lazy bind the default to the method.
-          Object value = jsonOptions.containsKey(propertyName)
-              ? getValueFromJson(propertyName, method)
-              : getDefault((PipelineOptions) proxy, method);
-          options.put(propertyName, value);
-        }
-        return options.get(propertyName);
-      } else if (settersToPropertyNames.containsKey(methodName)) {
-        options.put(settersToPropertyNames.get(methodName), args[0]);
-        return Void.TYPE;
-      }
-    }
-    throw new RuntimeException("Unknown method [" + method + "] invoked with args ["
-        + Arrays.toString(args) + "].");
-  }
-
-  /**
-   * Backing implementation for {@link PipelineOptions#as(Class)}.
-   *
-   * @param iface The interface that the returned object needs to implement.
-   * @return An object that implements the interface {@code T}.
-   */
-  synchronized <T extends PipelineOptions> T as(Class<T> iface) {
-    Preconditions.checkNotNull(iface);
-    Preconditions.checkArgument(iface.isInterface());
-    if (!interfaceToProxyCache.containsKey(iface)) {
-      Registration<T> registration =
-          PipelineOptionsFactory.validateWellFormed(iface, knownInterfaces);
-      List<PropertyDescriptor> propertyDescriptors = registration.getPropertyDescriptors();
-      Class<T> proxyClass = registration.getProxyClass();
-      gettersToPropertyNames.putAll(generateGettersToPropertyNames(propertyDescriptors));
-      settersToPropertyNames.putAll(generateSettersToPropertyNames(propertyDescriptors));
-      knownInterfaces.add(iface);
-      interfaceToProxyCache.putInstance(iface,
-          InstanceBuilder.ofType(proxyClass)
-              .fromClass(proxyClass)
-              .withArg(InvocationHandler.class, this)
-              .build());
-    }
-    return interfaceToProxyCache.getInstance(iface);
-  }
-
-  /**
-   * Backing implementation for {@link PipelineOptions#cloneAs(Class)}. The copy is made by a
-   * JSON round trip, after which every interface known to this handler is registered on it.
-   *
-   * @return A copy of the PipelineOptions.
-   */
-  synchronized <T extends PipelineOptions> T cloneAs(Object proxy, Class<T> iface) {
-    PipelineOptions clonedOptions;
-    try {
-      clonedOptions = MAPPER.readValue(MAPPER.writeValueAsBytes(proxy), PipelineOptions.class);
-    } catch (IOException e) {
-      throw new IllegalStateException("Failed to serialize the pipeline options to JSON.", e);
-    }
-    for (Class<? extends PipelineOptions> knownIface : knownInterfaces) {
-      clonedOptions.as(knownIface);
-    }
-    return clonedOptions.as(iface);
-  }
-
-  /**
-   * Returns true if the other object is this ProxyInvocationHandler or is a Proxy object that has
-   * this ProxyInvocationHandler as its handler.
-   *
-   * @param obj The object to compare against this.
-   * @return true iff the other object is this ProxyInvocationHandler or is a Proxy object that
-   *         has this ProxyInvocationHandler as its handler.
-   */
-  @Override
-  public boolean equals(Object obj) {
-    return obj != null && ((obj instanceof ProxyInvocationHandler && this == obj)
-        || (Proxy.isProxyClass(obj.getClass()) && this == Proxy.getInvocationHandler(obj)));
-  }
-
-  /**
-   * Each instance of this ProxyInvocationHandler is unique and has a random hash code.
-   *
-   * @return A hash code that was generated randomly.
-   */
-  @Override
-  public int hashCode() {
-    return hashCode;
-  }
-
-  /**
-   * This will output all the currently set values. This is a relatively costly function
-   * as it will call {@code toString()} on each object that has been set and format
-   * the results in a readable format.
-   *
-   * @return A pretty printed string representation of this.
-   */
-  @Override
-  public synchronized String toString() {
-    SortedMap<String, Object> sortedOptions = new TreeMap<>();
-    // Add the options that we received from deserialization
-    sortedOptions.putAll(jsonOptions);
-    // Override with any programmatically set options.
-    sortedOptions.putAll(options);
-
-    StringBuilder b = new StringBuilder();
-    b.append("Current Settings:\n");
-    for (Map.Entry<String, Object> entry : sortedOptions.entrySet()) {
-      b.append(" ").append(entry.getKey()).append(": ").append(entry.getValue()).append("\n");
-    }
-    return b.toString();
-  }
-
-  /**
-   * Uses a Jackson {@link ObjectMapper} to attempt type conversion.
-   *
-   * @param method The method whose return type you would like to return.
-   * @param propertyName The name of the property that is being returned.
-   * @return An object matching the return type of the method passed in.
-   */
-  private Object getValueFromJson(String propertyName, Method method) {
-    try {
-      JavaType type = MAPPER.getTypeFactory().constructType(method.getGenericReturnType());
-      JsonNode jsonNode = jsonOptions.get(propertyName);
-      return MAPPER.readValue(jsonNode.toString(), type);
-    } catch (IOException e) {
-      throw new RuntimeException("Unable to parse representation", e);
-    }
-  }
-
-  /**
-   * Returns a default value for the method based upon {@code @Default} metadata on the getter
-   * to return values. If there is no {@code @Default} annotation on the getter, then a <a
-   * href="https://docs.oracle.com/javase/tutorial/java/nutsandbolts/datatypes.html">default</a> as
-   * per the Java Language Specification for the expected return type is returned.
-   *
-   * @param proxy The proxy object for which we are attempting to get the default.
-   * @param method The getter method that was invoked.
-   * @return The default value from an {@link Default} annotation if present, otherwise a default
-   *         value as per the Java Language Specification.
-   */
-  @SuppressWarnings({"unchecked", "rawtypes"})
-  private Object getDefault(PipelineOptions proxy, Method method) {
-    for (Annotation annotation : method.getAnnotations()) {
-      if (annotation instanceof Default.Class) {
-        return ((Default.Class) annotation).value();
-      } else if (annotation instanceof Default.String) {
-        return ((Default.String) annotation).value();
-      } else if (annotation instanceof Default.Boolean) {
-        return ((Default.Boolean) annotation).value();
-      } else if (annotation instanceof Default.Character) {
-        return ((Default.Character) annotation).value();
-      } else if (annotation instanceof Default.Byte) {
-        return ((Default.Byte) annotation).value();
-      } else if (annotation instanceof Default.Short) {
-        return ((Default.Short) annotation).value();
-      } else if (annotation instanceof Default.Integer) {
-        return ((Default.Integer) annotation).value();
-      } else if (annotation instanceof Default.Long) {
-        return ((Default.Long) annotation).value();
-      } else if (annotation instanceof Default.Float) {
-        return ((Default.Float) annotation).value();
-      } else if (annotation instanceof Default.Double) {
-        return ((Default.Double) annotation).value();
-      } else if (annotation instanceof Default.Enum) {
-        return Enum.valueOf((Class<Enum>) method.getReturnType(),
-            ((Default.Enum) annotation).value());
-      } else if (annotation instanceof Default.InstanceFactory) {
-        return InstanceBuilder.ofType(((Default.InstanceFactory) annotation).value())
-            .build()
-            .create(proxy);
-      }
-    }
-
-    /*
-     * We need to make sure that we return something appropriate for the return type. Thus we return
-     * a default value as defined by the JLS.
-     */
-    return Defaults.defaultValue(method.getReturnType());
-  }
-
-  /**
-   * Returns a map from the getter's method name to the name of the property based upon the passed
-   * in {@link PropertyDescriptor}s. Descriptors without a read method are skipped.
-   *
-   * @param propertyDescriptors A list of {@link PropertyDescriptor}s to use when generating the
-   *        map.
-   * @return A map of getter method name to property name.
-   */
-  private static Map<String, String> generateGettersToPropertyNames(
-      List<PropertyDescriptor> propertyDescriptors) {
-    ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
-    for (PropertyDescriptor descriptor : propertyDescriptors) {
-      if (descriptor.getReadMethod() != null) {
-        builder.put(descriptor.getReadMethod().getName(), descriptor.getName());
-      }
-    }
-    return builder.build();
-  }
-
-  /**
-   * Returns a map from the setter's method name to the name of the property based upon the passed
-   * in {@link PropertyDescriptor}s. Descriptors without a write method are skipped.
-   *
-   * @param propertyDescriptors A list of {@link PropertyDescriptor}s to use when generating the
-   *        map.
-   * @return A map of setter method name to property name.
-   */
-  private static Map<String, String> generateSettersToPropertyNames(
-      List<PropertyDescriptor> propertyDescriptors) {
-    ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
-    for (PropertyDescriptor descriptor : propertyDescriptors) {
-      if (descriptor.getWriteMethod() != null) {
-        builder.put(descriptor.getWriteMethod().getName(), descriptor.getName());
-      }
-    }
-    return builder.build();
-  }
-
-  /** Writes a {@link PipelineOptions} proxy as a JSON object with a single "options" field. */
-  static class Serializer extends JsonSerializer<PipelineOptions> {
-    @Override
-    public void serialize(PipelineOptions value, JsonGenerator jgen, SerializerProvider provider)
-        throws IOException, JsonProcessingException {
-      ProxyInvocationHandler handler = (ProxyInvocationHandler) Proxy.getInvocationHandler(value);
-      synchronized (handler) {
-        // We first copy the options held as live values on the handler, drop the ones
-        // that must not be serialized, and then verify that the rest are all serializable.
-        Map<String, Object> filteredOptions = Maps.newHashMap(handler.options);
-        removeIgnoredOptions(handler.knownInterfaces, filteredOptions);
-        ensureSerializable(handler.knownInterfaces, filteredOptions);
-
-        // Now we create the map of serializable options by taking the original
-        // set of serialized options (if any) and updating them with any properties
-        // instances that have been modified since the previous serialization.
-        Map<String, Object> serializableOptions =
-            Maps.<String, Object>newHashMap(handler.jsonOptions);
-        serializableOptions.putAll(filteredOptions);
-        jgen.writeStartObject();
-        jgen.writeFieldName("options");
-        jgen.writeObject(serializableOptions);
-        jgen.writeEndObject();
-      }
-    }
-
-    /**
-     * We remove all properties within the passed in options where their getter is annotated with
-     * {@link JsonIgnore @JsonIgnore} from the passed in options using the passed in interfaces.
-     */
-    private void removeIgnoredOptions(
-        Set<Class<? extends PipelineOptions>> interfaces, Map<String, Object> options) {
-      // Find all the method names that are annotated with JSON ignore.
-      Set<String> jsonIgnoreMethodNames = FluentIterable.from(
-          ReflectHelpers.getClosureOfMethodsOnInterfaces(interfaces))
-          .filter(JsonIgnorePredicate.INSTANCE).transform(new Function<Method, String>() {
-            @Override
-            public String apply(Method input) {
-              return input.getName();
-            }
-          }).toSet();
-
-      // Remove all options that have the same method name as the descriptor.
-      for (PropertyDescriptor descriptor
-          : PipelineOptionsFactory.getPropertyDescriptors(interfaces)) {
-        Method readMethod = descriptor.getReadMethod();
-        // A descriptor may have no read method; such a property has no getter to be ignored.
-        if (readMethod != null && jsonIgnoreMethodNames.contains(readMethod.getName())) {
-          options.remove(descriptor.getName());
-        }
-      }
-    }
-
-    /**
-     * We use an {@link ObjectMapper} to verify that the passed in options are serializable
-     * and deserializable.
-     *
-     * @throws IOException if any option fails the serialize/deserialize round trip
-     */
-    private void ensureSerializable(Set<Class<? extends PipelineOptions>> interfaces,
-        Map<String, Object> options) throws IOException {
-      // Construct a map from property name to the return type of the getter.
-      Map<String, Type> propertyToReturnType = Maps.newHashMap();
-      for (PropertyDescriptor descriptor
-          : PipelineOptionsFactory.getPropertyDescriptors(interfaces)) {
-        if (descriptor.getReadMethod() != null) {
-          propertyToReturnType.put(descriptor.getName(),
-              descriptor.getReadMethod().getGenericReturnType());
-        }
-      }
-
-      // Attempt to serialize and deserialize each property.
-      for (Map.Entry<String, Object> entry : options.entrySet()) {
-        try {
-          String serializedValue = MAPPER.writeValueAsString(entry.getValue());
-          JavaType type = MAPPER.getTypeFactory()
-              .constructType(propertyToReturnType.get(entry.getKey()));
-          MAPPER.readValue(serializedValue, type);
-        } catch (Exception e) {
-          throw new IOException(String.format(
-              "Failed to serialize and deserialize property '%s' with value '%s'",
-              entry.getKey(), entry.getValue()), e);
-        }
-      }
-    }
-  }
-
-  /** Reads back the JSON produced by {@link Serializer}, keeping each option as raw JSON. */
-  static class Deserializer extends JsonDeserializer<PipelineOptions> {
-    @Override
-    public PipelineOptions deserialize(JsonParser jp, DeserializationContext ctxt)
-        throws IOException, JsonProcessingException {
-      ObjectNode objectNode = (ObjectNode) jp.readValueAsTree();
-      ObjectNode optionsNode = (ObjectNode) objectNode.get("options");
-      if (optionsNode == null) {
-        // Fail with a descriptive error instead of a NullPointerException further down.
-        throw new IOException(
-            "Unable to deserialize PipelineOptions: missing top-level 'options' field.");
-      }
-
-      Map<String, JsonNode> fields = Maps.newHashMap();
-      for (Iterator<Map.Entry<String, JsonNode>> iterator = optionsNode.fields();
-          iterator.hasNext(); ) {
-        Map.Entry<String, JsonNode> field = iterator.next();
-        fields.put(field.getKey(), field.getValue());
-      }
-      PipelineOptions options =
-          new ProxyInvocationHandler(Maps.<String, Object>newHashMap(), fields)
-              .as(PipelineOptions.class);
-      return options;
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/StreamingOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/StreamingOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/StreamingOptions.java
deleted file mode 100644
index 9563c58..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/StreamingOptions.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-/**
- * Options used to configure streaming.
- *
- * <p>Declares a single boolean {@code streaming} property through the
- * {@link #isStreaming()} / {@link #setStreaming(boolean)} pair, in addition to whatever is
- * inherited from {@link ApplicationNameOptions}, {@link GcpOptions} and {@link PipelineOptions}.
- */
-public interface StreamingOptions extends
- ApplicationNameOptions, GcpOptions, PipelineOptions {
- /**
- * Set to true if running a streaming pipeline.
- *
- * <p>This is the getter of the {@code streaming} property; the matching setter is
- * {@link #setStreaming(boolean)}.
- */
- @Description("Set to true if running a streaming pipeline.")
- boolean isStreaming();
- void setStreaming(boolean value); // setter counterpart of isStreaming()
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/Validation.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/Validation.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/Validation.java
deleted file mode 100644
index 20034f8..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/Validation.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import java.lang.annotation.Documented;
-import java.lang.annotation.ElementType;
-import java.lang.annotation.Retention;
-import java.lang.annotation.RetentionPolicy;
-import java.lang.annotation.Target;
-
-/**
- * {@link Validation} represents a set of annotations that can be used to annotate getter
- * properties on {@link PipelineOptions} with information representing the validation criteria to
- * be used when validating with the {@link PipelineOptionsValidator}.
- *
- * <p>The only criterion declared here is {@link Required}.
- */
-public @interface Validation {
- /**
- * This criterion specifies that the value must be not null. Note that this annotation
- * should only be applied to methods that return nullable objects.
- *
- * <p>It targets methods, is retained at runtime and is included in generated documentation.
- */
- @Target(value = ElementType.METHOD)
- @Retention(RetentionPolicy.RUNTIME)
- @Documented
- public @interface Required {
- /**
- * The groups that the annotated attribute is a member of. A member can be in 0 or more groups.
- * Members not in any groups are considered to be in a group consisting exclusively of
- * themselves. At least one member of a group must be non-null if the options are to be valid.
- *
- * <p>Defaults to an empty array, i.e. the member is in no named group.
- */
- String[] groups() default {};
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/package-info.java
deleted file mode 100644
index cef995f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/package-info.java
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/**
- * Defines {@link com.google.cloud.dataflow.sdk.options.PipelineOptions} for
- * configuring pipeline execution.
- *
- * <p>{@link com.google.cloud.dataflow.sdk.options.PipelineOptions} encapsulates the various
- * parameters that describe how a pipeline should be run. {@code PipelineOptions} are created
- * using a {@link com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory}.
- */
-package com.google.cloud.dataflow.sdk.options;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/package-info.java
deleted file mode 100644
index 5567f03..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/package-info.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/**
- * Provides a simple, powerful model for building both batch and
- * streaming parallel data processing
- * {@link com.google.cloud.dataflow.sdk.Pipeline}s.
- *
- * <p>To use the Google Cloud Dataflow SDK, you build a
- * {@link com.google.cloud.dataflow.sdk.Pipeline}, which manages a graph of
- * {@link com.google.cloud.dataflow.sdk.transforms.PTransform}s
- * and the {@link com.google.cloud.dataflow.sdk.values.PCollection}s that
- * the PTransforms consume and produce.
- *
- * <p>Each Pipeline has a
- * {@link com.google.cloud.dataflow.sdk.runners.PipelineRunner} to specify
- * where and how it should run after pipeline construction is complete.
- *
- */
-package com.google.cloud.dataflow.sdk;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/AggregatorPipelineExtractor.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/AggregatorPipelineExtractor.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/AggregatorPipelineExtractor.java
deleted file mode 100644
index ab87f2e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/AggregatorPipelineExtractor.java
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.Pipeline.PipelineVisitor;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.AggregatorRetriever;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.values.PValue;
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.SetMultimap;
-
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Map;
-
-/**
- * Retrieves {@link Aggregator Aggregators} at each {@link ParDo} and returns a {@link Map} of
- * {@link Aggregator} to the {@link PTransform PTransforms} in which it is present.
- */
-public class AggregatorPipelineExtractor {
- private final Pipeline pipeline;
-
- /**
- * Creates an {@code AggregatorPipelineExtractor} for the given {@link Pipeline}.
- */
- public AggregatorPipelineExtractor(Pipeline pipeline) {
- this.pipeline = pipeline;
- }
-
- /**
- * Returns a {@link Map} from each {@link Aggregator} in the {@link Pipeline} to the {@link
- * PTransform PTransforms} in which it is used.
- */
- public Map<Aggregator<?, ?>, Collection<PTransform<?, ?>>> getAggregatorSteps() {
- HashMultimap<Aggregator<?, ?>, PTransform<?, ?>> aggregatorSteps = HashMultimap.create();
- pipeline.traverseTopologically(new AggregatorVisitor(aggregatorSteps));
- return aggregatorSteps.asMap();
- }
-
- private static class AggregatorVisitor implements PipelineVisitor {
- private final SetMultimap<Aggregator<?, ?>, PTransform<?, ?>> aggregatorSteps;
-
- public AggregatorVisitor(SetMultimap<Aggregator<?, ?>, PTransform<?, ?>> aggregatorSteps) {
- this.aggregatorSteps = aggregatorSteps;
- }
-
- @Override
- public void enterCompositeTransform(TransformTreeNode node) {}
-
- @Override
- public void leaveCompositeTransform(TransformTreeNode node) {}
-
- @Override
- public void visitTransform(TransformTreeNode node) {
- PTransform<?, ?> transform = node.getTransform();
- addStepToAggregators(transform, getAggregators(transform));
- }
-
- private Collection<Aggregator<?, ?>> getAggregators(PTransform<?, ?> transform) {
- if (transform != null) {
- if (transform instanceof ParDo.Bound) {
- return AggregatorRetriever.getAggregators(((ParDo.Bound<?, ?>) transform).getFn());
- } else if (transform instanceof ParDo.BoundMulti) {
- return AggregatorRetriever.getAggregators(((ParDo.BoundMulti<?, ?>) transform).getFn());
- }
- }
- return Collections.emptyList();
- }
-
- private void addStepToAggregators(
- PTransform<?, ?> transform, Collection<Aggregator<?, ?>> aggregators) {
- for (Aggregator<?, ?> aggregator : aggregators) {
- aggregatorSteps.put(aggregator, transform);
- }
- }
-
- @Override
- public void visitValue(PValue value, TransformTreeNode producer) {}
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/AggregatorRetrievalException.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/AggregatorRetrievalException.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/AggregatorRetrievalException.java
deleted file mode 100644
index 90162ad..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/AggregatorRetrievalException.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-
-/**
- * Signals that an exception has occurred while retrieving {@link Aggregator}s.
- */
-public class AggregatorRetrievalException extends Exception {
- /**
- * Constructs a new {@code AggregatorRetrievalException} with the specified detail message and
- * cause.
- */
- public AggregatorRetrievalException(String message, Throwable cause) {
- super(message, cause);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/AggregatorValues.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/AggregatorValues.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/AggregatorValues.java
deleted file mode 100644
index 21f0282..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/AggregatorValues.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-
-import java.util.Collection;
-import java.util.Map;
-
-/**
- * A collection of values associated with an {@link Aggregator}. Aggregators declared in a
- * {@link DoFn} are emitted on a per-{@code DoFn}-application basis.
- *
- * @param <T> the output type of the aggregator
- */
-public abstract class AggregatorValues<T> {
- /**
- * Get the values of the {@link Aggregator} at all steps it was used.
- */
- public Collection<T> getValues() {
- return getValuesAtSteps().values();
- }
-
- /**
- * Get the values of the {@link Aggregator} by the user name at each step it was used.
- */
- public abstract Map<String, T> getValuesAtSteps();
-
- /**
- * Get the total value of this {@link Aggregator} by applying the specified {@link CombineFn}.
- */
- public T getTotalValue(CombineFn<T, ?, T> combineFn) {
- return combineFn.apply(getValues());
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/BlockingDataflowPipelineRunner.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/BlockingDataflowPipelineRunner.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/BlockingDataflowPipelineRunner.java
deleted file mode 100644
index 95e3dfe..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/BlockingDataflowPipelineRunner.java
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.PipelineResult.State;
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.options.BlockingDataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsValidator;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.util.MonitoringUtil;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.POutput;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.util.concurrent.TimeUnit;
-
-import javax.annotation.Nullable;
-
-/**
- * A {@link PipelineRunner} that's like {@link DataflowPipelineRunner}
- * but that waits for the launched job to finish.
- *
- * <p>Prints out job status updates and console messages while it waits.
- *
- * <p>Returns the final job state, or throws an exception if the job
- * fails or cannot be monitored.
- *
- * <h3>Permissions</h3>
- * <p>When reading from a Dataflow source or writing to a Dataflow sink using
- * {@code BlockingDataflowPipelineRunner}, the Google cloud services account and the Google compute
- * engine service account of the GCP project running the Dataflow Job will need access to the
- * corresponding source/sink.
- *
- * <p>Please see <a href="https://cloud.google.com/dataflow/security-and-permissions">Google Cloud
- * Dataflow Security and Permissions</a> for more details.
- */
-public class BlockingDataflowPipelineRunner extends
- PipelineRunner<DataflowPipelineJob> {
- private static final Logger LOG = LoggerFactory.getLogger(BlockingDataflowPipelineRunner.class);
-
- // Defaults to an infinite wait period.
- // TODO: make this configurable after removal of option map.
- private static final long BUILTIN_JOB_TIMEOUT_SEC = -1L;
-
- private final DataflowPipelineRunner dataflowPipelineRunner;
- private final BlockingDataflowPipelineOptions options;
-
- protected BlockingDataflowPipelineRunner(
- DataflowPipelineRunner internalRunner,
- BlockingDataflowPipelineOptions options) {
- this.dataflowPipelineRunner = internalRunner;
- this.options = options;
- }
-
- /**
- * Constructs a runner from the provided options.
- */
- public static BlockingDataflowPipelineRunner fromOptions(
- PipelineOptions options) {
- BlockingDataflowPipelineOptions dataflowOptions =
- PipelineOptionsValidator.validate(BlockingDataflowPipelineOptions.class, options);
- DataflowPipelineRunner dataflowPipelineRunner =
- DataflowPipelineRunner.fromOptions(dataflowOptions);
-
- return new BlockingDataflowPipelineRunner(dataflowPipelineRunner, dataflowOptions);
- }
-
- /**
- * {@inheritDoc}
- *
- * @throws DataflowJobExecutionException if there is an exception during job execution.
- * @throws DataflowServiceException if there is an exception retrieving information about the job.
- */
- @Override
- public DataflowPipelineJob run(Pipeline p) {
- final DataflowPipelineJob job = dataflowPipelineRunner.run(p);
-
- // We ignore the potential race condition here (Ctrl-C after job submission but before the
- // shutdown hook is registered). Even if we tried to do something smarter (eg., SettableFuture)
- // the run method (which produces the job) could fail or be Ctrl-C'd before it had returned a
- // job. The display of the command to cancel the job is best-effort anyways -- RPC's could fail,
- // etc. If the user wants to verify the job was cancelled they should look at the job status.
- Thread shutdownHook = new Thread() {
- @Override
- public void run() {
- LOG.warn("Job is already running in Google Cloud Platform, Ctrl-C will not cancel it.\n"
- + "To cancel the job in the cloud, run:\n> {}",
- MonitoringUtil.getGcloudCancelCommand(options, job.getJobId()));
- }
- };
-
- try {
- Runtime.getRuntime().addShutdownHook(shutdownHook);
-
- @Nullable
- State result;
- try {
- result = job.waitToFinish(
- BUILTIN_JOB_TIMEOUT_SEC, TimeUnit.SECONDS,
- new MonitoringUtil.PrintHandler(options.getJobMessageOutput()));
- } catch (IOException | InterruptedException ex) {
- LOG.debug("Exception caught while retrieving status for job {}", job.getJobId(), ex);
- throw new DataflowServiceException(
- job, "Exception caught while retrieving status for job " + job.getJobId(), ex);
- }
-
- if (result == null) {
- throw new DataflowServiceException(
- job, "Timed out while retrieving status for job " + job.getJobId());
- }
-
- LOG.info("Job finished with status {}", result);
- if (!result.isTerminal()) {
- throw new IllegalStateException("Expected terminal state for job " + job.getJobId()
- + ", got " + result);
- }
-
- if (result == State.DONE) {
- return job;
- } else if (result == State.UPDATED) {
- DataflowPipelineJob newJob = job.getReplacedByJob();
- LOG.info("Job {} has been updated and is running as the new job with id {}."
- + "To access the updated job on the Dataflow monitoring console, please navigate to {}",
- job.getJobId(),
- newJob.getJobId(),
- MonitoringUtil.getJobMonitoringPageURL(newJob.getProjectId(), newJob.getJobId()));
- throw new DataflowJobUpdatedException(
- job,
- String.format("Job %s updated; new job is %s.", job.getJobId(), newJob.getJobId()),
- newJob);
- } else if (result == State.CANCELLED) {
- String message = String.format("Job %s cancelled by user", job.getJobId());
- LOG.info(message);
- throw new DataflowJobCancelledException(job, message);
- } else {
- throw new DataflowJobExecutionException(job, "Job " + job.getJobId()
- + " failed with status " + result);
- }
- } finally {
- Runtime.getRuntime().removeShutdownHook(shutdownHook);
- }
- }
-
- @Override
- public <OutputT extends POutput, InputT extends PInput> OutputT apply(
- PTransform<InputT, OutputT> transform, InputT input) {
- return dataflowPipelineRunner.apply(transform, input);
- }
-
- /**
- * Sets callbacks to invoke during execution. See {@link DataflowPipelineRunnerHooks}.
- */
- @Experimental
- public void setHooks(DataflowPipelineRunnerHooks hooks) {
- this.dataflowPipelineRunner.setHooks(hooks);
- }
-
- @Override
- public String toString() {
- return "BlockingDataflowPipelineRunner#" + options.getJobName();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobAlreadyExistsException.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobAlreadyExistsException.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobAlreadyExistsException.java
deleted file mode 100644
index 1547f73..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobAlreadyExistsException.java
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-/**
- * An exception that is thrown if the unique job name constraint of the Dataflow
- * service is broken because an existing job with the same job name is currently active.
- * The {@link DataflowPipelineJob} contained within this exception contains information
- * about the pre-existing job.
- */
-public class DataflowJobAlreadyExistsException extends DataflowJobException {
- /**
- * Create a new {@code DataflowJobAlreadyExistsException} with the specified {@link
- * DataflowPipelineJob} and message.
- */
- public DataflowJobAlreadyExistsException(
- DataflowPipelineJob job, String message) {
- super(job, message, null);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobAlreadyUpdatedException.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobAlreadyUpdatedException.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobAlreadyUpdatedException.java
deleted file mode 100644
index d4ae4f5..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobAlreadyUpdatedException.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-/**
- * An exception that is thrown if the existing job has already been updated within the Dataflow
- * service and is no longer able to be updated. The {@link DataflowPipelineJob} contained within
- * this exception contains information about the pre-existing updated job.
- */
-public class DataflowJobAlreadyUpdatedException extends DataflowJobException {
- /**
- * Create a new {@code DataflowJobAlreadyUpdatedException} with the specified {@link
- * DataflowPipelineJob} and message.
- */
- public DataflowJobAlreadyUpdatedException(
- DataflowPipelineJob job, String message) {
- super(job, message, null);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobCancelledException.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobCancelledException.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobCancelledException.java
deleted file mode 100644
index 0d31726..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobCancelledException.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-/**
- * Signals that a {@link BlockingDataflowPipelineRunner} job was cancelled during execution.
- */
-public class DataflowJobCancelledException extends DataflowJobException {
- /**
- * Create a new {@code DataflowJobCancelledException} with the specified {@link
- * DataflowPipelineJob} and message.
- */
- public DataflowJobCancelledException(DataflowPipelineJob job, String message) {
- super(job, message, null);
- }
-
- /**
- * Create a new {@code DataflowJobCancelledException} with the specified {@link
- * DataflowPipelineJob}, message, and cause.
- */
- public DataflowJobCancelledException(DataflowPipelineJob job, String message, Throwable cause) {
- super(job, message, cause);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobException.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobException.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobException.java
deleted file mode 100644
index 9e305d5..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobException.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import java.util.Objects;
-
-import javax.annotation.Nullable;
-
-/**
- * A {@link RuntimeException} that contains information about a {@link DataflowPipelineJob}.
- */
-public abstract class DataflowJobException extends RuntimeException {
- private final DataflowPipelineJob job;
-
- DataflowJobException(DataflowPipelineJob job, String message, @Nullable Throwable cause) {
- super(message, cause);
- this.job = Objects.requireNonNull(job);
- }
-
- /**
- * Returns the job associated with this exception.
- */
- public DataflowPipelineJob getJob() {
- return job;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobExecutionException.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobExecutionException.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobExecutionException.java
deleted file mode 100644
index ae6df0f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobExecutionException.java
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import javax.annotation.Nullable;
-
-/**
- * Signals that a job run by a {@link BlockingDataflowPipelineRunner} fails during execution, and
- * provides access to the failed job.
- */
-public class DataflowJobExecutionException extends DataflowJobException {
- DataflowJobExecutionException(DataflowPipelineJob job, String message) {
- this(job, message, null);
- }
-
- DataflowJobExecutionException(
- DataflowPipelineJob job, String message, @Nullable Throwable cause) {
- super(job, message, cause);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobUpdatedException.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobUpdatedException.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobUpdatedException.java
deleted file mode 100644
index 1becdd7..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowJobUpdatedException.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-/**
- * Signals that a job run by a {@link BlockingDataflowPipelineRunner} was updated during execution.
- */
-public class DataflowJobUpdatedException extends DataflowJobException {
- private DataflowPipelineJob replacedByJob;
-
- /**
- * Create a new {@code DataflowJobUpdatedException} with the specified original {@link
- * DataflowPipelineJob}, message, and replacement {@link DataflowPipelineJob}.
- */
- public DataflowJobUpdatedException(
- DataflowPipelineJob job, String message, DataflowPipelineJob replacedByJob) {
- this(job, message, replacedByJob, null);
- }
-
- /**
- * Create a new {@code DataflowJobUpdatedException} with the specified original {@link
- * DataflowPipelineJob}, message, replacement {@link DataflowPipelineJob}, and cause.
- */
- public DataflowJobUpdatedException(
- DataflowPipelineJob job, String message, DataflowPipelineJob replacedByJob, Throwable cause) {
- super(job, message, cause);
- this.replacedByJob = replacedByJob;
- }
-
- /**
- * The new job that replaces the job terminated with this exception.
- */
- public DataflowPipelineJob getReplacedByJob() {
- return replacedByJob;
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipeline.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipeline.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipeline.java
deleted file mode 100644
index 5a78624..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipeline.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-
-/**
- * A {@link DataflowPipeline} is a {@link Pipeline} that returns a
- * {@link DataflowPipelineJob} when it is
- * {@link com.google.cloud.dataflow.sdk.Pipeline#run()}.
- *
- * <p>This is not intended for use by users of Cloud Dataflow.
- * Instead, use {@link Pipeline#create(PipelineOptions)} to initialize a
- * {@link Pipeline}.
- */
-public class DataflowPipeline extends Pipeline {
-
- /**
- * Creates and returns a new {@link DataflowPipeline} instance for tests.
- */
- public static DataflowPipeline create(DataflowPipelineOptions options) {
- return new DataflowPipeline(options);
- }
-
- private DataflowPipeline(DataflowPipelineOptions options) {
- super(DataflowPipelineRunner.fromOptions(options), options);
- }
-
- @Override
- public DataflowPipelineJob run() {
- return (DataflowPipelineJob) super.run();
- }
-
- @Override
- public DataflowPipelineRunner getRunner() {
- return (DataflowPipelineRunner) super.getRunner();
- }
-
- @Override
- public String toString() {
- return "DataflowPipeline#" + getOptions().as(DataflowPipelineOptions.class).getJobName();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineJob.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineJob.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineJob.java
deleted file mode 100644
index e9f134c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineJob.java
+++ /dev/null
@@ -1,389 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners;
-
-import static com.google.cloud.dataflow.sdk.util.TimeUtil.fromCloudTime;
-
-import com.google.api.client.googleapis.json.GoogleJsonResponseException;
-import com.google.api.client.util.BackOff;
-import com.google.api.client.util.BackOffUtils;
-import com.google.api.client.util.NanoClock;
-import com.google.api.client.util.Sleeper;
-import com.google.api.services.dataflow.Dataflow;
-import com.google.api.services.dataflow.model.Job;
-import com.google.api.services.dataflow.model.JobMessage;
-import com.google.api.services.dataflow.model.JobMetrics;
-import com.google.api.services.dataflow.model.MetricUpdate;
-import com.google.cloud.dataflow.sdk.PipelineResult;
-import com.google.cloud.dataflow.sdk.runners.dataflow.DataflowAggregatorTransforms;
-import com.google.cloud.dataflow.sdk.runners.dataflow.DataflowMetricUpdateExtractor;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.util.AttemptAndTimeBoundedExponentialBackOff;
-import com.google.cloud.dataflow.sdk.util.AttemptBoundedExponentialBackOff;
-import com.google.cloud.dataflow.sdk.util.MapAggregatorValues;
-import com.google.cloud.dataflow.sdk.util.MonitoringUtil;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Throwables;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.net.SocketTimeoutException;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.TimeUnit;
-
-import javax.annotation.Nullable;
-
-/**
- * A DataflowPipelineJob represents a job submitted to Dataflow using
- * {@link DataflowPipelineRunner}.
- */
-public class DataflowPipelineJob implements PipelineResult {
- private static final Logger LOG = LoggerFactory.getLogger(DataflowPipelineJob.class);
-
- /**
- * The id for the job.
- */
- private String jobId;
-
- /**
- * Google cloud project to associate this pipeline with.
- */
- private String projectId;
-
- /**
- * Client for the Dataflow service. This can be used to query the service
- * for information about the job.
- */
- private Dataflow dataflowClient;
-
- /**
- * The state the job terminated in or {@code null} if the job has not terminated.
- */
- @Nullable
- private State terminalState = null;
-
- /**
- * The job that replaced this one or {@code null} if the job has not been replaced.
- */
- @Nullable
- private DataflowPipelineJob replacedByJob = null;
-
- private DataflowAggregatorTransforms aggregatorTransforms;
-
- /**
- * The Metric Updates retrieved after the job was in a terminal state.
- */
- private List<MetricUpdate> terminalMetricUpdates;
-
- /**
- * The polling interval for job status and messages information.
- */
- static final long MESSAGES_POLLING_INTERVAL = TimeUnit.SECONDS.toMillis(2);
- static final long STATUS_POLLING_INTERVAL = TimeUnit.SECONDS.toMillis(2);
-
- /**
- * The amount of polling attempts for job status and messages information.
- */
- static final int MESSAGES_POLLING_ATTEMPTS = 10;
- static final int STATUS_POLLING_ATTEMPTS = 5;
-
- /**
- * Constructs the job.
- *
- * @param projectId the project id
- * @param jobId the job id
- * @param dataflowClient the client for the Dataflow Service
- */
- public DataflowPipelineJob(String projectId, String jobId, Dataflow dataflowClient,
- DataflowAggregatorTransforms aggregatorTransforms) {
- this.projectId = projectId;
- this.jobId = jobId;
- this.dataflowClient = dataflowClient;
- this.aggregatorTransforms = aggregatorTransforms;
- }
-
- /**
- * Get the id of this job.
- */
- public String getJobId() {
- return jobId;
- }
-
- /**
- * Get the project this job exists in.
- */
- public String getProjectId() {
- return projectId;
- }
-
- /**
- * Returns a new {@link DataflowPipelineJob} for the job that replaced this one, if applicable.
- *
- * @throws IllegalStateException if called before the job has terminated or if the job terminated
- * but was not updated
- */
- public DataflowPipelineJob getReplacedByJob() {
- if (terminalState == null) {
- throw new IllegalStateException("getReplacedByJob() called before job terminated");
- }
- if (replacedByJob == null) {
- throw new IllegalStateException("getReplacedByJob() called for job that was not replaced");
- }
- return replacedByJob;
- }
-
- /**
- * Get the Cloud Dataflow API Client used by this job.
- */
- public Dataflow getDataflowClient() {
- return dataflowClient;
- }
-
- /**
- * Waits for the job to finish and return the final status.
- *
- * @param timeToWait The time to wait in units timeUnit for the job to finish.
- * Provide a value less than 1 ms for an infinite wait.
- * @param timeUnit The unit of time for timeToWait.
- * @param messageHandler If non null this handler will be invoked for each
- * batch of messages received.
- * @return The final state of the job or null on timeout or if the
- * thread is interrupted.
- * @throws IOException If there is a persistent problem getting job
- * information.
- * @throws InterruptedException
- */
- @Nullable
- public State waitToFinish(
- long timeToWait,
- TimeUnit timeUnit,
- MonitoringUtil.JobMessagesHandler messageHandler)
- throws IOException, InterruptedException {
- return waitToFinish(timeToWait, timeUnit, messageHandler, Sleeper.DEFAULT, NanoClock.SYSTEM);
- }
-
- /**
- * Wait for the job to finish and return the final status.
- *
- * @param timeToWait The time to wait in units timeUnit for the job to finish.
- * Provide a value less than 1 ms for an infinite wait.
- * @param timeUnit The unit of time for timeToWait.
- * @param messageHandler If non null this handler will be invoked for each
- * batch of messages received.
- * @param sleeper A sleeper to use to sleep between attempts.
- * @param nanoClock A nanoClock used to time the total time taken.
- * @return The final state of the job or null on timeout or if the
- * thread is interrupted.
- * @throws IOException If there is a persistent problem getting job
- * information.
- * @throws InterruptedException
- */
- @Nullable
- @VisibleForTesting
- State waitToFinish(
- long timeToWait,
- TimeUnit timeUnit,
- MonitoringUtil.JobMessagesHandler messageHandler,
- Sleeper sleeper,
- NanoClock nanoClock)
- throws IOException, InterruptedException {
- MonitoringUtil monitor = new MonitoringUtil(projectId, dataflowClient);
-
- long lastTimestamp = 0;
- BackOff backoff =
- timeUnit.toMillis(timeToWait) > 0
- ? new AttemptAndTimeBoundedExponentialBackOff(
- MESSAGES_POLLING_ATTEMPTS,
- MESSAGES_POLLING_INTERVAL,
- timeUnit.toMillis(timeToWait),
- AttemptAndTimeBoundedExponentialBackOff.ResetPolicy.ATTEMPTS,
- nanoClock)
- : new AttemptBoundedExponentialBackOff(
- MESSAGES_POLLING_ATTEMPTS, MESSAGES_POLLING_INTERVAL);
- State state;
- do {
- // Get the state of the job before listing messages. This ensures we always fetch job
- // messages after the job finishes to ensure we have all them.
- state = getStateWithRetries(1, sleeper);
- boolean hasError = state == State.UNKNOWN;
-
- if (messageHandler != null && !hasError) {
- // Process all the job messages that have accumulated so far.
- try {
- List<JobMessage> allMessages = monitor.getJobMessages(
- jobId, lastTimestamp);
-
- if (!allMessages.isEmpty()) {
- lastTimestamp =
- fromCloudTime(allMessages.get(allMessages.size() - 1).getTime()).getMillis();
- messageHandler.process(allMessages);
- }
- } catch (GoogleJsonResponseException | SocketTimeoutException e) {
- hasError = true;
- LOG.warn("There were problems getting current job messages: {}.", e.getMessage());
- LOG.debug("Exception information:", e);
- }
- }
-
- if (!hasError) {
- backoff.reset();
- // Check if the job is done.
- if (state.isTerminal()) {
- return state;
- }
- }
- } while(BackOffUtils.next(sleeper, backoff));
- LOG.warn("No terminal state was returned. State value {}", state);
- return null; // Timed out.
- }
-
- /**
- * Cancels the job.
- * @throws IOException if there is a problem executing the cancel request.
- */
- public void cancel() throws IOException {
- Job content = new Job();
- content.setProjectId(projectId);
- content.setId(jobId);
- content.setRequestedState("JOB_STATE_CANCELLED");
- dataflowClient.projects().jobs()
- .update(projectId, jobId, content)
- .execute();
- }
-
- @Override
- public State getState() {
- if (terminalState != null) {
- return terminalState;
- }
-
- return getStateWithRetries(STATUS_POLLING_ATTEMPTS, Sleeper.DEFAULT);
- }
-
- /**
- * Attempts to get the state. Uses exponential backoff on failure up to the maximum number
- * of passed in attempts.
- *
- * @param attempts The amount of attempts to make.
- * @param sleeper Object used to do the sleeps between attempts.
- * @return The state of the job or State.UNKNOWN in case of failure.
- */
- @VisibleForTesting
- State getStateWithRetries(int attempts, Sleeper sleeper) {
- if (terminalState != null) {
- return terminalState;
- }
- try {
- Job job = getJobWithRetries(attempts, sleeper);
- return MonitoringUtil.toState(job.getCurrentState());
- } catch (IOException exn) {
- // The only IOException that getJobWithRetries is permitted to throw is the final IOException
- // that caused the failure of retry. Other exceptions are wrapped in an unchecked exceptions
- // and will propagate.
- return State.UNKNOWN;
- }
- }
-
- /**
- * Attempts to get the underlying {@link Job}. Uses exponential backoff on failure up to the
- * maximum number of passed in attempts.
- *
- * @param attempts The amount of attempts to make.
- * @param sleeper Object used to do the sleeps between attempts.
- * @return The underlying {@link Job} object.
- * @throws IOException When the maximum number of retries is exhausted, the last exception is
- * thrown.
- */
- @VisibleForTesting
- Job getJobWithRetries(int attempts, Sleeper sleeper) throws IOException {
- AttemptBoundedExponentialBackOff backoff =
- new AttemptBoundedExponentialBackOff(attempts, STATUS_POLLING_INTERVAL);
-
- // Retry loop ends in return or throw
- while (true) {
- try {
- Job job = dataflowClient
- .projects()
- .jobs()
- .get(projectId, jobId)
- .execute();
- State currentState = MonitoringUtil.toState(job.getCurrentState());
- if (currentState.isTerminal()) {
- terminalState = currentState;
- replacedByJob = new DataflowPipelineJob(
- getProjectId(), job.getReplacedByJobId(), dataflowClient, aggregatorTransforms);
- }
- return job;
- } catch (IOException exn) {
- LOG.warn("There were problems getting current job status: {}.", exn.getMessage());
- LOG.debug("Exception information:", exn);
-
- if (!nextBackOff(sleeper, backoff)) {
- throw exn;
- }
- }
- }
- }
-
- /**
- * Identical to {@link BackOffUtils#next} but without checked exceptions.
- */
- private boolean nextBackOff(Sleeper sleeper, BackOff backoff) {
- try {
- return BackOffUtils.next(sleeper, backoff);
- } catch (InterruptedException | IOException e) {
- throw Throwables.propagate(e);
- }
- }
-
- @Override
- public <OutputT> AggregatorValues<OutputT> getAggregatorValues(Aggregator<?, OutputT> aggregator)
- throws AggregatorRetrievalException {
- try {
- return new MapAggregatorValues<>(fromMetricUpdates(aggregator));
- } catch (IOException e) {
- throw new AggregatorRetrievalException(
- "IOException when retrieving Aggregator values for Aggregator " + aggregator, e);
- }
- }
-
- private <OutputT> Map<String, OutputT> fromMetricUpdates(Aggregator<?, OutputT> aggregator)
- throws IOException {
- if (aggregatorTransforms.contains(aggregator)) {
- List<MetricUpdate> metricUpdates;
- if (terminalMetricUpdates != null) {
- metricUpdates = terminalMetricUpdates;
- } else {
- boolean terminal = getState().isTerminal();
- JobMetrics jobMetrics =
- dataflowClient.projects().jobs().getMetrics(projectId, jobId).execute();
- metricUpdates = jobMetrics.getMetrics();
- if (terminal && jobMetrics.getMetrics() != null) {
- terminalMetricUpdates = metricUpdates;
- }
- }
-
- return DataflowMetricUpdateExtractor.fromMetricUpdates(
- aggregator, aggregatorTransforms, metricUpdates);
- } else {
- throw new IllegalArgumentException(
- "Aggregator " + aggregator + " is not used in this pipeline");
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineRegistrar.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineRegistrar.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineRegistrar.java
deleted file mode 100644
index 0e4d4e9..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineRegistrar.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import com.google.auto.service.AutoService;
-import com.google.cloud.dataflow.sdk.options.BlockingDataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsRegistrar;
-import com.google.common.collect.ImmutableList;
-
-/**
- * Contains the {@link PipelineOptionsRegistrar} and {@link PipelineRunnerRegistrar} for
- * the {@link DataflowPipeline}.
- */
-public class DataflowPipelineRegistrar {
- private DataflowPipelineRegistrar() { }
-
- /**
- * Register the {@link DataflowPipelineOptions} and {@link BlockingDataflowPipelineOptions}.
- */
- @AutoService(PipelineOptionsRegistrar.class)
- public static class Options implements PipelineOptionsRegistrar {
- @Override
- public Iterable<Class<? extends PipelineOptions>> getPipelineOptions() {
- return ImmutableList.<Class<? extends PipelineOptions>>of(
- DataflowPipelineOptions.class,
- BlockingDataflowPipelineOptions.class);
- }
- }
-
- /**
- * Register the {@link DataflowPipelineRunner} and {@link BlockingDataflowPipelineRunner}.
- */
- @AutoService(PipelineRunnerRegistrar.class)
- public static class Runner implements PipelineRunnerRegistrar {
- @Override
- public Iterable<Class<? extends PipelineRunner<?>>> getPipelineRunners() {
- return ImmutableList.<Class<? extends PipelineRunner<?>>>of(
- DataflowPipelineRunner.class,
- BlockingDataflowPipelineRunner.class);
- }
- }
-}
[61/67] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/LeaderBoard.java
----------------------------------------------------------------------
diff --git a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/LeaderBoard.java b/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/LeaderBoard.java
deleted file mode 100644
index 4185376..0000000
--- a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/LeaderBoard.java
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete.game;
-
-import com.google.cloud.dataflow.examples.common.DataflowExampleOptions;
-import com.google.cloud.dataflow.examples.common.DataflowExampleUtils;
-import com.google.cloud.dataflow.examples.complete.game.utils.WriteToBigQuery;
-import com.google.cloud.dataflow.examples.complete.game.utils.WriteWindowedToBigQuery;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.PipelineResult;
-import com.google.cloud.dataflow.sdk.io.PubsubIO;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.options.Validation;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.windowing.AfterProcessingTime;
-import com.google.cloud.dataflow.sdk.transforms.windowing.AfterWatermark;
-import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.IntervalWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Repeatedly;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.joda.time.DateTimeZone;
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-import org.joda.time.format.DateTimeFormat;
-import org.joda.time.format.DateTimeFormatter;
-
-import java.util.HashMap;
-import java.util.Map;
-import java.util.TimeZone;
-
-/**
- * This class is the third in a series of four pipelines that tell a story in a 'gaming' domain,
- * following {@link UserScore} and {@link HourlyTeamScore}. Concepts include: processing unbounded
- * data using fixed windows; use of custom timestamps and event-time processing; generation of
- * early/speculative results; using .accumulatingFiredPanes() to do cumulative processing of late-
- * arriving data.
- *
- * <p> This pipeline processes an unbounded stream of 'game events'. The calculation of the team
- * scores uses fixed windowing based on event time (the time of the game play event), not
- * processing time (the time that an event is processed by the pipeline). The pipeline calculates
- * the sum of scores per team, for each window. By default, the team scores are calculated using
- * one-hour windows.
- *
- * <p> In contrast-- to demo another windowing option-- the user scores are calculated using a
- * global window, which periodically (every ten minutes) emits cumulative user score sums.
- *
- * <p> In contrast to the previous pipelines in the series, which used static, finite input data,
- * here we're using an unbounded data source, which lets us provide speculative results, and allows
- * handling of late data, at much lower latency. We can use the early/speculative results to keep a
- * 'leaderboard' updated in near-realtime. Our handling of late data lets us generate correct
- * results, e.g. for 'team prizes'. We're now outputing window results as they're
- * calculated, giving us much lower latency than with the previous batch examples.
- *
- * <p> Run {@link injector.Injector} to generate pubsub data for this pipeline. The Injector
- * documentation provides more detail on how to do this.
- *
- * <p> To execute this pipeline using the Dataflow service, specify the pipeline configuration
- * like this:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowPipelineRunner
- * --dataset=YOUR-DATASET
- * --topic=projects/YOUR-PROJECT/topics/YOUR-TOPIC
- * }
- * </pre>
- * where the BigQuery dataset you specify must already exist.
- * The PubSub topic you specify should be the same topic to which the Injector is publishing.
- */
-public class LeaderBoard extends HourlyTeamScore {
-
- private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms";
-
- private static DateTimeFormatter fmt =
- DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS")
- .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST")));
- static final Duration FIVE_MINUTES = Duration.standardMinutes(5);
- static final Duration TEN_MINUTES = Duration.standardMinutes(10);
-
-
- /**
- * Options supported by {@link LeaderBoard}.
- */
- static interface Options extends HourlyTeamScore.Options, DataflowExampleOptions {
-
- @Description("Pub/Sub topic to read from")
- @Validation.Required
- String getTopic();
- void setTopic(String value);
-
- @Description("Numeric value of fixed window duration for team analysis, in minutes")
- @Default.Integer(60)
- Integer getTeamWindowDuration();
- void setTeamWindowDuration(Integer value);
-
- @Description("Numeric value of allowed data lateness, in minutes")
- @Default.Integer(120)
- Integer getAllowedLateness();
- void setAllowedLateness(Integer value);
-
- @Description("Prefix used for the BigQuery table names")
- @Default.String("leaderboard")
- String getTableName();
- void setTableName(String value);
- }
-
- /**
- * Create a map of information that describes how to write pipeline output to BigQuery. This map
- * is used to write team score sums and includes event timing information.
- */
- protected static Map<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>>
- configureWindowedTableWrite() {
-
- Map<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>> tableConfigure =
- new HashMap<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>>();
- tableConfigure.put("team",
- new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("STRING",
- c -> c.element().getKey()));
- tableConfigure.put("total_score",
- new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("INTEGER",
- c -> c.element().getValue()));
- tableConfigure.put("window_start",
- new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>("STRING",
- c -> { IntervalWindow w = (IntervalWindow) c.window();
- return fmt.print(w.start()); }));
- tableConfigure.put("processing_time",
- new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>(
- "STRING", c -> fmt.print(Instant.now())));
- tableConfigure.put("timing",
- new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>(
- "STRING", c -> c.pane().getTiming().toString()));
- return tableConfigure;
- }
-
- /**
- * Create a map of information that describes how to write pipeline output to BigQuery. This map
- * is used to write user score sums.
- */
- protected static Map<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>>
- configureGlobalWindowBigQueryWrite() {
-
- Map<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>> tableConfigure =
- configureBigQueryWrite();
- tableConfigure.put("processing_time",
- new WriteToBigQuery.FieldInfo<KV<String, Integer>>(
- "STRING", c -> fmt.print(Instant.now())));
- return tableConfigure;
- }
-
-
- public static void main(String[] args) throws Exception {
-
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
- // Enforce that this pipeline is always run in streaming mode.
- options.setStreaming(true);
- // For example purposes, allow the pipeline to be easily cancelled instead of running
- // continuously.
- options.setRunner(DataflowPipelineRunner.class);
- DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options);
- Pipeline pipeline = Pipeline.create(options);
-
- // Read game events from Pub/Sub using custom timestamps, which are extracted from the pubsub
- // data elements, and parse the data.
- PCollection<GameActionInfo> gameEvents = pipeline
- .apply(PubsubIO.Read.timestampLabel(TIMESTAMP_ATTRIBUTE).topic(options.getTopic()))
- .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()));
-
- // [START DocInclude_WindowAndTrigger]
- // Extract team/score pairs from the event stream, using hour-long windows by default.
- gameEvents
- .apply(Window.named("LeaderboardTeamFixedWindows")
- .<GameActionInfo>into(FixedWindows.of(
- Duration.standardMinutes(options.getTeamWindowDuration())))
- // We will get early (speculative) results as well as cumulative
- // processing of late data.
- .triggering(
- AfterWatermark.pastEndOfWindow()
- .withEarlyFirings(AfterProcessingTime.pastFirstElementInPane()
- .plusDelayOf(FIVE_MINUTES))
- .withLateFirings(AfterProcessingTime.pastFirstElementInPane()
- .plusDelayOf(TEN_MINUTES)))
- .withAllowedLateness(Duration.standardMinutes(options.getAllowedLateness()))
- .accumulatingFiredPanes())
- // Extract and sum teamname/score pairs from the event data.
- .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
- // Write the results to BigQuery.
- .apply("WriteTeamScoreSums",
- new WriteWindowedToBigQuery<KV<String, Integer>>(
- options.getTableName() + "_team", configureWindowedTableWrite()));
- // [END DocInclude_WindowAndTrigger]
-
- // [START DocInclude_ProcTimeTrigger]
- // Extract user/score pairs from the event stream using processing time, via global windowing.
- // Get periodic updates on all users' running scores.
- gameEvents
- .apply(Window.named("LeaderboardUserGlobalWindow")
- .<GameActionInfo>into(new GlobalWindows())
- // Get periodic results every ten minutes.
- .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()
- .plusDelayOf(TEN_MINUTES)))
- .accumulatingFiredPanes()
- .withAllowedLateness(Duration.standardMinutes(options.getAllowedLateness())))
- // Extract and sum username/score pairs from the event data.
- .apply("ExtractUserScore", new ExtractAndSumScore("user"))
- // Write the results to BigQuery.
- .apply("WriteUserScoreSums",
- new WriteToBigQuery<KV<String, Integer>>(
- options.getTableName() + "_user", configureGlobalWindowBigQueryWrite()));
- // [END DocInclude_ProcTimeTrigger]
-
- // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
- // command line.
- PipelineResult result = pipeline.run();
- dataflowUtils.waitToFinish(result);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/README.md
----------------------------------------------------------------------
diff --git a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/README.md b/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/README.md
deleted file mode 100644
index 79b55ce..0000000
--- a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/README.md
+++ /dev/null
@@ -1,113 +0,0 @@
-
-# 'Gaming' examples
-
-
-This directory holds a series of example Dataflow pipelines in a simple 'mobile
-gaming' domain. They all require Java 8. Each pipeline successively introduces
-new concepts, and gives some examples of using Java 8 syntax in constructing
-Dataflow pipelines. Other than usage of Java 8 lambda expressions, the concepts
-that are used apply equally well in Java 7.
-
-In the gaming scenario, many users play, as members of different teams, over
-the course of a day, and their actions are logged for processing. Some of the
-logged game events may be late-arriving, if users play on mobile devices and go
-transiently offline for a period.
-
-The scenario includes not only "regular" users, but "robot users", which have a
-higher click rate than the regular users, and may move from team to team.
-
-The first two pipelines in the series use pre-generated batch data samples. The
-second two pipelines read from a [PubSub](https://cloud.google.com/pubsub/)
-topic input. For these examples, you will also need to run the
-`injector.Injector` program, which generates and publishes the gaming data to
-PubSub. The javadocs for each pipeline have more detailed information on how to
-run that pipeline.
-
-All of these pipelines write their results to BigQuery table(s).
-
-
-## The pipelines in the 'gaming' series
-
-### UserScore
-
-The first pipeline in the series is `UserScore`. This pipeline does batch
-processing of data collected from gaming events. It calculates the sum of
-scores per user, over an entire batch of gaming data (collected, say, for each
-day). The batch processing will not include any late data that arrives after
-the day's cutoff point.
-
-### HourlyTeamScore
-
-The next pipeline in the series is `HourlyTeamScore`. This pipeline also
-processes data collected from gaming events in batch. It builds on `UserScore`,
-but uses [fixed windows](https://cloud.google.com/dataflow/model/windowing), by
-default an hour in duration. It calculates the sum of scores per team, for each
-window, optionally allowing specification of two timestamps before and after
-which data is filtered out. This allows a model where late data collected after
-the intended analysis window can be included in the analysis, and any
-late-arriving data prior to the beginning of the analysis window can be removed
-as well.
-
-By using windowing and adding element timestamps, we can do finer-grained
-analysis than with the `UserScore` pipeline — we're now tracking scores for
-each hour rather than over the course of a whole day. However, our batch
-processing is high-latency, in that we don't get results from plays at the
-beginning of the batch's time period until the complete batch is processed.
-
-### LeaderBoard
-
-The third pipeline in the series is `LeaderBoard`. This pipeline processes an
-unbounded stream of 'game events' from a PubSub topic. The calculation of the
-team scores uses fixed windowing based on event time (the time of the game play
-event), not processing time (the time that an event is processed by the
-pipeline). The pipeline calculates the sum of scores per team, for each window.
-By default, the team scores are calculated using one-hour windows.
-
-In contrast — to demo another windowing option — the user scores are calculated
-using a global window, which periodically (every ten minutes) emits cumulative
-user score sums.
-
-In contrast to the previous pipelines in the series, which used static, finite
-input data, here we're using an unbounded data source, which lets us provide
-_speculative_ results, and allows handling of late data, at much lower latency.
-E.g., we could use the early/speculative results to keep a 'leaderboard'
-updated in near-realtime. Our handling of late data lets us generate correct
-results, e.g. for 'team prizes'. We're now outputting window results as they're
-calculated, giving us much lower latency than with the previous batch examples.
-
-### GameStats
-
-The fourth pipeline in the series is `GameStats`. This pipeline builds
-on the `LeaderBoard` functionality — supporting output of speculative and late
-data — and adds some "business intelligence" analysis: abuse
-detection. The pipeline derives the mean user score sum for a window, and uses
-that information to identify likely spammers/robots. (The injector is designed
-so that the "robots" have a higher click rate than the "real" users). The robot
-users are then filtered out when calculating the team scores.
-
-Additionally, user sessions are tracked: that is, we find bursts of user
-activity using session windows. Then, the mean session duration information is
-recorded in the context of subsequent fixed windowing. (This could be used to
-tell us what games are giving us greater user retention).
-
-### Running the PubSub Injector
-
-The `LeaderBoard` and `GameStats` example pipelines read unbounded data
-from a PubSub topic.
-
-Use the `injector.Injector` program to generate this data and publish to a
-PubSub topic. See the `Injector` javadocs for more information on how to run the
-injector. Set up the injector before you start one of these pipelines. Then,
-when you start the pipeline, pass as an argument the name of that PubSub topic.
-See the pipeline javadocs for the details.
-
-## Viewing the results in BigQuery
-
-All of the pipelines write their results to BigQuery. `UserScore` and
-`HourlyTeamScore` each write one table, and `LeaderBoard` and
-`GameStats` each write two. The pipelines have default table names that
-you can override when you start up the pipeline if those tables already exist.
-
-Depending on the windowing intervals defined in a given pipeline, you may have
-to wait for a while (more than an hour) before you start to see results written
-to the BigQuery tables.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/UserScore.java
----------------------------------------------------------------------
diff --git a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/UserScore.java b/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/UserScore.java
deleted file mode 100644
index de06ce3..0000000
--- a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/UserScore.java
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete.game;
-
-import com.google.cloud.dataflow.examples.complete.game.utils.WriteToBigQuery;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.AvroCoder;
-import com.google.cloud.dataflow.sdk.coders.DefaultCoder;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.options.Validation;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.MapElements;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.Sum;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-import org.apache.avro.reflect.Nullable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * This class is the first in a series of four pipelines that tell a story in a 'gaming' domain.
- * Concepts: batch processing; reading input from Google Cloud Storage and writing output to
- * BigQuery; using standalone DoFns; use of the sum by key transform; examples of
- * Java 8 lambda syntax.
- *
- * <p> In this gaming scenario, many users play, as members of different teams, over the course of a
- * day, and their actions are logged for processing. Some of the logged game events may be
- * late-arriving, if users play on mobile devices and go transiently offline for a period.
- *
- * <p> This pipeline does batch processing of data collected from gaming events. It calculates the
- * sum of scores per user, over an entire batch of gaming data (collected, say, for each day). The
- * batch processing will not include any late data that arrives after the day's cutoff point.
- *
- * <p> To execute this pipeline using the Dataflow service and static example input data, specify
- * the pipeline configuration like this:
- * <pre>{@code
- * --project=YOUR_PROJECT_ID
- * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
- * --runner=BlockingDataflowPipelineRunner
- * --dataset=YOUR-DATASET
- * }
- * </pre>
- * where the BigQuery dataset you specify must already exist.
- *
- * <p> Optionally include the --input argument to specify a batch input file.
- * See the --input default value for example batch data file, or use {@link injector.Injector} to
- * generate your own batch data.
- */
-public class UserScore {
-
- /**
- * Class to hold info about a game event.
- */
- @DefaultCoder(AvroCoder.class)
- static class GameActionInfo {
- @Nullable String user;
- @Nullable String team;
- @Nullable Integer score;
- @Nullable Long timestamp;
-
- public GameActionInfo() {}
-
- public GameActionInfo(String user, String team, Integer score, Long timestamp) {
- this.user = user;
- this.team = team;
- this.score = score;
- this.timestamp = timestamp;
- }
-
- public String getUser() {
- return this.user;
- }
- public String getTeam() {
- return this.team;
- }
- public Integer getScore() {
- return this.score;
- }
- public String getKey(String keyname) {
- if (keyname.equals("team")) {
- return this.team;
- } else { // return username as default
- return this.user;
- }
- }
- public Long getTimestamp() {
- return this.timestamp;
- }
- }
-
-
- /**
- * Parses the raw game event info into GameActionInfo objects. Each event line has the following
- * format: username,teamname,score,timestamp_in_ms,readable_time
- * e.g.:
- * user2_AsparagusPig,AsparagusPig,10,1445230923951,2015-11-02 09:09:28.224
- * The human-readable time string is not used here.
- */
- static class ParseEventFn extends DoFn<String, GameActionInfo> {
-
- // Log and count parse errors.
- private static final Logger LOG = LoggerFactory.getLogger(ParseEventFn.class);
- private final Aggregator<Long, Long> numParseErrors =
- createAggregator("ParseErrors", new Sum.SumLongFn());
-
- @Override
- public void processElement(ProcessContext c) {
- String[] components = c.element().split(",");
- try {
- String user = components[0].trim();
- String team = components[1].trim();
- Integer score = Integer.parseInt(components[2].trim());
- Long timestamp = Long.parseLong(components[3].trim());
- GameActionInfo gInfo = new GameActionInfo(user, team, score, timestamp);
- c.output(gInfo);
- } catch (ArrayIndexOutOfBoundsException | NumberFormatException e) {
- numParseErrors.addValue(1L);
- LOG.info("Parse error on " + c.element() + ", " + e.getMessage());
- }
- }
- }
-
- /**
- * A transform to extract key/score information from GameActionInfo, and sum the scores. The
- * constructor arg determines whether 'team' or 'user' info is extracted.
- */
- // [START DocInclude_USExtractXform]
- public static class ExtractAndSumScore
- extends PTransform<PCollection<GameActionInfo>, PCollection<KV<String, Integer>>> {
-
- private final String field;
-
- ExtractAndSumScore(String field) {
- this.field = field;
- }
-
- @Override
- public PCollection<KV<String, Integer>> apply(
- PCollection<GameActionInfo> gameInfo) {
-
- return gameInfo
- .apply(MapElements
- .via((GameActionInfo gInfo) -> KV.of(gInfo.getKey(field), gInfo.getScore()))
- .withOutputType(new TypeDescriptor<KV<String, Integer>>() {}))
- .apply(Sum.<String>integersPerKey());
- }
- }
- // [END DocInclude_USExtractXform]
-
-
- /**
- * Options supported by {@link UserScore}.
- */
- public static interface Options extends PipelineOptions {
-
- @Description("Path to the data file(s) containing game data.")
- // The default maps to two large Google Cloud Storage files (each ~12GB) holding
- // two consecutive days' worth (roughly) of data.
- @Default.String("gs://dataflow-samples/game/gaming_data*.csv")
- String getInput();
- void setInput(String value);
-
- @Description("BigQuery Dataset to write tables to. Must already exist.")
- @Validation.Required
- String getDataset();
- void setDataset(String value);
-
- @Description("The BigQuery table name. Should not already exist.")
- @Default.String("user_score")
- String getTableName();
- void setTableName(String value);
- }
-
- /**
- * Create a map of information that describes how to write pipeline output to BigQuery. This map
- * is passed to the {@link WriteToBigQuery} constructor to write user score sums.
- */
- protected static Map<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>>
- configureBigQueryWrite() {
- Map<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>> tableConfigure =
- new HashMap<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>>();
- tableConfigure.put("user",
- new WriteToBigQuery.FieldInfo<KV<String, Integer>>("STRING", c -> c.element().getKey()));
- tableConfigure.put("total_score",
- new WriteToBigQuery.FieldInfo<KV<String, Integer>>("INTEGER", c -> c.element().getValue()));
- return tableConfigure;
- }
-
-
- /**
- * Run a batch pipeline.
- */
- // [START DocInclude_USMain]
- public static void main(String[] args) throws Exception {
- // Begin constructing a pipeline configured by commandline flags.
- Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
- Pipeline pipeline = Pipeline.create(options);
-
- // Read events from a text file and parse them.
- pipeline.apply(TextIO.Read.from(options.getInput()))
- .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()))
- // Extract and sum username/score pairs from the event data.
- .apply("ExtractUserScore", new ExtractAndSumScore("user"))
- .apply("WriteUserScoreSums",
- new WriteToBigQuery<KV<String, Integer>>(options.getTableName(),
- configureBigQueryWrite()));
-
- // Run the batch pipeline.
- pipeline.run();
- }
- // [END DocInclude_USMain]
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/Injector.java
----------------------------------------------------------------------
diff --git a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/Injector.java b/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/Injector.java
deleted file mode 100644
index 1691c54..0000000
--- a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/Injector.java
+++ /dev/null
@@ -1,415 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete.game.injector;
-
-import com.google.api.services.pubsub.Pubsub;
-import com.google.api.services.pubsub.model.PublishRequest;
-import com.google.api.services.pubsub.model.PubsubMessage;
-import com.google.common.collect.ImmutableMap;
-
-import org.joda.time.DateTimeZone;
-import org.joda.time.format.DateTimeFormat;
-import org.joda.time.format.DateTimeFormatter;
-
-import java.io.BufferedOutputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Random;
-import java.util.TimeZone;
-
-
-/**
- * This is a generator that simulates usage data from a mobile game, and either publishes the data
- * to a pubsub topic or writes it to a file.
- *
- * <p> The general model used by the generator is the following. There is a set of teams with team
- * members. Each member is scoring points for their team. After some period, a team will dissolve
- * and a new one will be created in its place. There is also a set of 'Robots', or spammer users.
- * They hop from team to team. The robots are set to have a higher 'click rate' (generate more
- * events) than the regular team members.
- *
- * <p> Each generated line of data has the following form:
- * username,teamname,score,timestamp_in_ms,readable_time
- * e.g.:
- * user2_AsparagusPig,AsparagusPig,10,1445230923951,2015-11-02 09:09:28.224
- *
- * <p> The Injector writes either to a PubSub topic, or a file. It will use the PubSub topic if
- * specified. It takes the following arguments:
- * {@code Injector project-name (topic-name|none) (filename|none)}.
- *
- * <p> To run the Injector in the mode where it publishes to PubSub, you will need to authenticate
- * locally using project-based service account credentials to avoid running over PubSub
- * quota.
- * See https://developers.google.com/identity/protocols/application-default-credentials
- * for more information on using service account credentials. Set the GOOGLE_APPLICATION_CREDENTIALS
- * environment variable to point to your downloaded service account credentials before starting the
- * program, e.g.:
- * {@code export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your/credentials-key.json}.
- * If you do not do this, then your injector will only run for a few minutes on your
- * 'user account' credentials before you will start to see quota error messages like:
- * "Request throttled due to user QPS limit being reached", and see this exception:
- * "com.google.api.client.googleapis.json.GoogleJsonResponseException: 429 Too Many Requests".
- * Once you've set up your credentials, run the Injector like this:
- * <pre>{@code
- * Injector <project-name> <topic-name> none
- * }
- * </pre>
- * The pubsub topic will be created if it does not exist.
- *
- * <p> To run the injector in write-to-file-mode, set the topic name to "none" and specify the
- * filename:
- * <pre>{@code
- * Injector <project-name> none <filename>
- * }
- * </pre>
- */
-class Injector {
- private static Pubsub pubsub;
- private static Random random = new Random();
- private static String topic;
- private static String project;
- private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms";
-
- // QPS ranges from 800 to 1000.
- private static final int MIN_QPS = 800;
- private static final int QPS_RANGE = 200;
- // How long to sleep, in ms, between creation of the threads that make API requests to PubSub.
- private static final int THREAD_SLEEP_MS = 500;
-
- // Lists used to generate random team names.
- private static final ArrayList<String> COLORS =
- new ArrayList<String>(Arrays.asList(
- "Magenta", "AliceBlue", "Almond", "Amaranth", "Amber",
- "Amethyst", "AndroidGreen", "AntiqueBrass", "Fuchsia", "Ruby", "AppleGreen",
- "Apricot", "Aqua", "ArmyGreen", "Asparagus", "Auburn", "Azure", "Banana",
- "Beige", "Bisque", "BarnRed", "BattleshipGrey"));
-
- private static final ArrayList<String> ANIMALS =
- new ArrayList<String>(Arrays.asList(
- "Echidna", "Koala", "Wombat", "Marmot", "Quokka", "Kangaroo", "Dingo", "Numbat", "Emu",
- "Wallaby", "CaneToad", "Bilby", "Possum", "Cassowary", "Kookaburra", "Platypus",
- "Bandicoot", "Cockatoo", "Antechinus"));
-
- // The list of live teams.
- private static ArrayList<TeamInfo> liveTeams = new ArrayList<TeamInfo>();
-
- private static DateTimeFormatter fmt =
- DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS")
- .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST")));
-
-
- // The total number of robots in the system.
- private static final int NUM_ROBOTS = 20;
- // Determines the chance that a team will have a robot team member.
- private static final int ROBOT_PROBABILITY = 3;
- private static final int NUM_LIVE_TEAMS = 15;
- private static final int BASE_MEMBERS_PER_TEAM = 5;
- private static final int MEMBERS_PER_TEAM = 15;
- private static final int MAX_SCORE = 20;
- private static final int LATE_DATA_RATE = 5 * 60 * 2; // Every 600 loop iterations (~5 minutes)
- private static final int BASE_DELAY_IN_MILLIS = 5 * 60 * 1000; // 5-10 minute delay
- private static final int FUZZY_DELAY_IN_MILLIS = 5 * 60 * 1000;
-
- // The minimum time a 'team' can live.
- private static final int BASE_TEAM_EXPIRATION_TIME_IN_MINS = 20;
- private static final int TEAM_EXPIRATION_TIME_IN_MINS = 20;
-
-
- /**
- * A class for holding team info: the name of the team, when it started,
- * and the current team members. Teams may but need not include one robot team member.
- */
- private static class TeamInfo {
- String teamName;
- long startTimeInMillis;
- int expirationPeriod;
- // The team might but need not include 1 robot. Will be non-null if so.
- String robot;
- int numMembers;
-
- private TeamInfo(String teamName, long startTimeInMillis, String robot) {
- this.teamName = teamName;
- this.startTimeInMillis = startTimeInMillis;
- // How long until this team is dissolved.
- this.expirationPeriod = random.nextInt(TEAM_EXPIRATION_TIME_IN_MINS) +
- BASE_TEAM_EXPIRATION_TIME_IN_MINS;
- this.robot = robot;
- // Determine the number of team members.
- numMembers = random.nextInt(MEMBERS_PER_TEAM) + BASE_MEMBERS_PER_TEAM;
- }
-
- String getTeamName() {
- return teamName;
- }
- String getRobot() {
- return robot;
- }
-
- long getStartTimeInMillis() {
- return startTimeInMillis;
- }
- long getEndTimeInMillis() {
- return startTimeInMillis + (expirationPeriod * 60 * 1000);
- }
- String getRandomUser() {
- int userNum = random.nextInt(numMembers);
- return "user" + userNum + "_" + teamName;
- }
-
- int numMembers() {
- return numMembers;
- }
-
- @Override
- public String toString() {
- return "(" + teamName + ", num members: " + numMembers() + ", starting at: "
- + startTimeInMillis + ", expires in: " + expirationPeriod + ", robot: " + robot + ")";
- }
- }
-
- /** Utility to grab a random element from a list of Strings. */
- private static String randomElement(ArrayList<String> list) {
- int index = random.nextInt(list.size());
- return list.get(index);
- }
-
- /**
- * Get and return a random team. If the selected team is too old w.r.t its expiration, remove
- * it, replacing it with a new team.
- */
- private static TeamInfo randomTeam(ArrayList<TeamInfo> list) {
- int index = random.nextInt(list.size());
- TeamInfo team = list.get(index);
- // If the selected team is expired, remove it and return a new team.
- long currTime = System.currentTimeMillis();
- if ((team.getEndTimeInMillis() < currTime) || team.numMembers() == 0) {
- System.out.println("\nteam " + team + " is too old; replacing.");
- System.out.println("start time: " + team.getStartTimeInMillis() +
- ", end time: " + team.getEndTimeInMillis() +
- ", current time:" + currTime);
- removeTeam(index);
- // Add a new team in its stead.
- return (addLiveTeam());
- } else {
- return team;
- }
- }
-
- /**
- * Create and add a team. Possibly add a robot to the team.
- */
- private static synchronized TeamInfo addLiveTeam() {
- String teamName = randomElement(COLORS) + randomElement(ANIMALS);
- String robot = null;
- // Decide if we want to add a robot to the team.
- if (random.nextInt(ROBOT_PROBABILITY) == 0) {
- robot = "Robot-" + random.nextInt(NUM_ROBOTS);
- }
- // Create the new team.
- TeamInfo newTeam = new TeamInfo(teamName, System.currentTimeMillis(), robot);
- liveTeams.add(newTeam);
- System.out.println("[+" + newTeam + "]");
- return newTeam;
- }
-
- /**
- * Remove a specific team.
- */
- private static synchronized void removeTeam(int teamIndex) {
- TeamInfo removedTeam = liveTeams.remove(teamIndex);
- System.out.println("[-" + removedTeam + "]");
- }
-
- /** Generate a user gaming event. */
- private static String generateEvent(Long currTime, int delayInMillis) {
- TeamInfo team = randomTeam(liveTeams);
- String teamName = team.getTeamName();
- String user;
- final int parseErrorRate = 900000;
-
- String robot = team.getRobot();
- // If the team has an associated robot team member...
- if (robot != null) {
- // Then use that robot for the message with some probability.
- // Set this probability to higher than that used to select any of the 'regular' team
- // members, so that if there is a robot on the team, it has a higher click rate.
- if (random.nextInt(team.numMembers() / 2) == 0) {
- user = robot;
- } else {
- user = team.getRandomUser();
- }
- } else { // No robot.
- user = team.getRandomUser();
- }
- String event = user + "," + teamName + "," + random.nextInt(MAX_SCORE);
- // Randomly introduce occasional parse errors. You can see a custom counter tracking the number
- // of such errors in the Dataflow Monitoring UI, as the example pipeline runs.
- if (random.nextInt(parseErrorRate) == 0) {
- System.out.println("Introducing a parse error.");
- event = "THIS LINE REPRESENTS CORRUPT DATA AND WILL CAUSE A PARSE ERROR";
- }
- return addTimeInfoToEvent(event, currTime, delayInMillis);
- }
-
- /**
- * Add time info to a generated gaming event.
- */
- private static String addTimeInfoToEvent(String message, Long currTime, int delayInMillis) {
- String eventTimeString =
- Long.toString((currTime - delayInMillis) / 1000 * 1000);
- // Add a (redundant) 'human-readable' date string to make the data semantics more clear.
- String dateString = fmt.print(currTime);
- message = message + "," + eventTimeString + "," + dateString;
- return message;
- }
-
- /**
- * Publish 'numMessages' arbitrary events from live users with the provided delay, to a
- * PubSub topic.
- */
- public static void publishData(int numMessages, int delayInMillis)
- throws IOException {
- List<PubsubMessage> pubsubMessages = new ArrayList<>();
-
- for (int i = 0; i < Math.max(1, numMessages); i++) {
- Long currTime = System.currentTimeMillis();
- String message = generateEvent(currTime, delayInMillis);
- PubsubMessage pubsubMessage = new PubsubMessage()
- .encodeData(message.getBytes("UTF-8"));
- pubsubMessage.setAttributes(
- ImmutableMap.of(TIMESTAMP_ATTRIBUTE,
- Long.toString((currTime - delayInMillis) / 1000 * 1000)));
- if (delayInMillis != 0) {
- System.out.println(pubsubMessage.getAttributes());
- System.out.println("late data for: " + message);
- }
- pubsubMessages.add(pubsubMessage);
- }
-
- PublishRequest publishRequest = new PublishRequest();
- publishRequest.setMessages(pubsubMessages);
- pubsub.projects().topics().publish(topic, publishRequest).execute();
- }
-
- /**
- * Publish generated events to a file.
- */
- public static void publishDataToFile(String fileName, int numMessages, int delayInMillis)
- throws IOException {
- PrintWriter out = new PrintWriter(new OutputStreamWriter(
- new BufferedOutputStream(new FileOutputStream(fileName, true)), "UTF-8"));
-
- try {
- for (int i = 0; i < Math.max(1, numMessages); i++) {
- Long currTime = System.currentTimeMillis();
- String message = generateEvent(currTime, delayInMillis);
- out.println(message);
- }
- } catch (Exception e) {
- e.printStackTrace();
- } finally {
- if (out != null) {
- out.flush();
- out.close();
- }
- }
- }
-
-
- public static void main(String[] args) throws IOException, InterruptedException {
- if (args.length < 3) {
- System.out.println("Usage: Injector project-name (topic-name|none) (filename|none)");
- System.exit(1);
- }
- boolean writeToFile = false;
- boolean writeToPubsub = true;
- project = args[0];
- String topicName = args[1];
- String fileName = args[2];
- // The Injector writes either to a PubSub topic, or a file. It will use the PubSub topic if
- // specified; otherwise, it will try to write to a file.
- if (topicName.equalsIgnoreCase("none")) {
- writeToFile = true;
- writeToPubsub = false;
- }
- if (writeToPubsub) {
- // Create the PubSub client.
- pubsub = InjectorUtils.getClient();
- // Create the PubSub topic as necessary.
- topic = InjectorUtils.getFullyQualifiedTopicName(project, topicName);
- InjectorUtils.createTopic(pubsub, topic);
- System.out.println("Injecting to topic: " + topic);
- } else {
- if (fileName.equalsIgnoreCase("none")) {
- System.out.println("Filename not specified.");
- System.exit(1);
- }
- System.out.println("Writing to file: " + fileName);
- }
- System.out.println("Starting Injector");
-
- // Start off with some random live teams.
- while (liveTeams.size() < NUM_LIVE_TEAMS) {
- addLiveTeam();
- }
-
- // Publish messages at a rate determined by the QPS and Thread sleep settings.
- for (int i = 0; true; i++) {
- if (Thread.activeCount() > 10) {
- System.err.println("I'm falling behind!");
- }
-
- // Decide if this should be a batch of late data.
- final int numMessages;
- final int delayInMillis;
- if (i % LATE_DATA_RATE == 0) {
- // Insert delayed data for one user (one message only)
- delayInMillis = BASE_DELAY_IN_MILLIS + random.nextInt(FUZZY_DELAY_IN_MILLIS);
- numMessages = 1;
- System.out.println("DELAY(" + delayInMillis + ", " + numMessages + ")");
- } else {
- System.out.print(".");
- delayInMillis = 0;
- numMessages = MIN_QPS + random.nextInt(QPS_RANGE);
- }
-
- if (writeToFile) { // Won't use threading for the file write.
- publishDataToFile(fileName, numMessages, delayInMillis);
- } else { // Write to PubSub.
- // Start a thread to inject some data.
- new Thread(){
- @Override
- public void run() {
- try {
- publishData(numMessages, delayInMillis);
- } catch (IOException e) {
- System.err.println(e);
- }
- }
- }.start();
- }
-
- // Wait before creating another injector thread.
- Thread.sleep(THREAD_SLEEP_MS);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/InjectorUtils.java
----------------------------------------------------------------------
diff --git a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/InjectorUtils.java b/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/InjectorUtils.java
deleted file mode 100644
index 55982df..0000000
--- a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/InjectorUtils.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete.game.injector;
-
-
-import com.google.api.client.googleapis.auth.oauth2.GoogleCredential;
-import com.google.api.client.googleapis.json.GoogleJsonResponseException;
-import com.google.api.client.googleapis.util.Utils;
-import com.google.api.client.http.HttpRequestInitializer;
-import com.google.api.client.http.HttpStatusCodes;
-import com.google.api.client.http.HttpTransport;
-import com.google.api.client.json.JsonFactory;
-import com.google.api.services.pubsub.Pubsub;
-import com.google.api.services.pubsub.PubsubScopes;
-import com.google.api.services.pubsub.model.Topic;
-
-import com.google.common.base.Preconditions;
-
-import java.io.IOException;
-
-class InjectorUtils {
-
- private static final String APP_NAME = "injector";
-
- /**
- * Builds a new Pubsub client and returns it.
- */
- public static Pubsub getClient(final HttpTransport httpTransport,
- final JsonFactory jsonFactory)
- throws IOException {
- Preconditions.checkNotNull(httpTransport);
- Preconditions.checkNotNull(jsonFactory);
- GoogleCredential credential =
- GoogleCredential.getApplicationDefault(httpTransport, jsonFactory);
- if (credential.createScopedRequired()) {
- credential = credential.createScoped(PubsubScopes.all());
- }
- if (credential.getClientAuthentication() != null) {
- System.out.println("\n***Warning! You are not using service account credentials to "
- + "authenticate.\nYou need to use service account credentials for this example,"
- + "\nsince user-level credentials do not have enough pubsub quota,\nand so you will run "
- + "out of PubSub quota very quickly.\nSee "
- + "https://developers.google.com/identity/protocols/application-default-credentials.");
- System.exit(1);
- }
- HttpRequestInitializer initializer =
- new RetryHttpInitializerWrapper(credential);
- return new Pubsub.Builder(httpTransport, jsonFactory, initializer)
- .setApplicationName(APP_NAME)
- .build();
- }
-
- /**
- * Builds a new Pubsub client with default HttpTransport and
- * JsonFactory and returns it.
- */
- public static Pubsub getClient() throws IOException {
- return getClient(Utils.getDefaultTransport(),
- Utils.getDefaultJsonFactory());
- }
-
-
- /**
- * Returns the fully qualified topic name for Pub/Sub.
- */
- public static String getFullyQualifiedTopicName(
- final String project, final String topic) {
- return String.format("projects/%s/topics/%s", project, topic);
- }
-
- /**
- * Create a topic if it doesn't exist.
- */
- public static void createTopic(Pubsub client, String fullTopicName)
- throws IOException {
- try {
- client.projects().topics().get(fullTopicName).execute();
- } catch (GoogleJsonResponseException e) {
- if (e.getStatusCode() == HttpStatusCodes.STATUS_CODE_NOT_FOUND) {
- Topic topic = client.projects().topics()
- .create(fullTopicName, new Topic())
- .execute();
- System.out.printf("Topic %s was created.\n", topic.getName());
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/RetryHttpInitializerWrapper.java
----------------------------------------------------------------------
diff --git a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/RetryHttpInitializerWrapper.java b/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/RetryHttpInitializerWrapper.java
deleted file mode 100644
index 1437534..0000000
--- a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/injector/RetryHttpInitializerWrapper.java
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete.game.injector;
-
-import com.google.api.client.auth.oauth2.Credential;
-import com.google.api.client.http.HttpBackOffIOExceptionHandler;
-import com.google.api.client.http.HttpBackOffUnsuccessfulResponseHandler;
-import com.google.api.client.http.HttpRequest;
-import com.google.api.client.http.HttpRequestInitializer;
-import com.google.api.client.http.HttpResponse;
-import com.google.api.client.http.HttpUnsuccessfulResponseHandler;
-import com.google.api.client.util.ExponentialBackOff;
-import com.google.api.client.util.Sleeper;
-import com.google.common.base.Preconditions;
-
-import java.io.IOException;
-import java.util.logging.Logger;
-
-/**
- * RetryHttpInitializerWrapper will automatically retry upon RPC
- * failures, preserving the auto-refresh behavior of the Google
- * Credentials.
- */
-public class RetryHttpInitializerWrapper implements HttpRequestInitializer {
-
- /**
- * A private logger.
- */
- private static final Logger LOG =
- Logger.getLogger(RetryHttpInitializerWrapper.class.getName());
-
- /**
- * One minutes in miliseconds.
- */
- private static final int ONEMINITUES = 60000;
-
- /**
- * Intercepts the request for filling in the "Authorization"
- * header field, as well as recovering from certain unsuccessful
- * error codes wherein the Credential must refresh its token for a
- * retry.
- */
- private final Credential wrappedCredential;
-
- /**
- * A sleeper; you can replace it with a mock in your test.
- */
- private final Sleeper sleeper;
-
- /**
- * A constructor.
- *
- * @param wrappedCredential Credential which will be wrapped and
- * used for providing auth header.
- */
- public RetryHttpInitializerWrapper(final Credential wrappedCredential) {
- this(wrappedCredential, Sleeper.DEFAULT);
- }
-
- /**
- * A protected constructor only for testing.
- *
- * @param wrappedCredential Credential which will be wrapped and
- * used for providing auth header.
- * @param sleeper Sleeper for easy testing.
- */
- RetryHttpInitializerWrapper(
- final Credential wrappedCredential, final Sleeper sleeper) {
- this.wrappedCredential = Preconditions.checkNotNull(wrappedCredential);
- this.sleeper = sleeper;
- }
-
- /**
- * Initializes the given request.
- */
- @Override
- public final void initialize(final HttpRequest request) {
- request.setReadTimeout(2 * ONEMINITUES); // 2 minutes read timeout
- final HttpUnsuccessfulResponseHandler backoffHandler =
- new HttpBackOffUnsuccessfulResponseHandler(
- new ExponentialBackOff())
- .setSleeper(sleeper);
- request.setInterceptor(wrappedCredential);
- request.setUnsuccessfulResponseHandler(
- new HttpUnsuccessfulResponseHandler() {
- @Override
- public boolean handleResponse(
- final HttpRequest request,
- final HttpResponse response,
- final boolean supportsRetry) throws IOException {
- if (wrappedCredential.handleResponse(
- request, response, supportsRetry)) {
- // If credential decides it can handle it,
- // the return code or message indicated
- // something specific to authentication,
- // and no backoff is desired.
- return true;
- } else if (backoffHandler.handleResponse(
- request, response, supportsRetry)) {
- // Otherwise, we defer to the judgement of
- // our internal backoff handler.
- LOG.info("Retrying "
- + request.getUrl().toString());
- return true;
- } else {
- return false;
- }
- }
- });
- request.setIOExceptionHandler(
- new HttpBackOffIOExceptionHandler(new ExponentialBackOff())
- .setSleeper(sleeper));
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/utils/WriteToBigQuery.java
----------------------------------------------------------------------
diff --git a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/utils/WriteToBigQuery.java b/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/utils/WriteToBigQuery.java
deleted file mode 100644
index 2cf719a..0000000
--- a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/utils/WriteToBigQuery.java
+++ /dev/null
@@ -1,134 +0,0 @@
- /*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete.game.utils;
-
-import com.google.api.services.bigquery.model.TableFieldSchema;
-import com.google.api.services.bigquery.model.TableReference;
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.cloud.dataflow.examples.complete.game.UserScore;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.CreateDisposition;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.WriteDisposition;
-import com.google.cloud.dataflow.sdk.options.GcpOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PDone;
-
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
-/**
- * Generate, format, and write BigQuery table row information. Use provided information about
- * the field names and types, as well as lambda functions that describe how to generate their
- * values.
- */
-public class WriteToBigQuery<T>
- extends PTransform<PCollection<T>, PDone> {
-
- protected String tableName;
- protected Map<String, FieldInfo<T>> fieldInfo;
-
- public WriteToBigQuery() {
- }
-
- public WriteToBigQuery(String tableName,
- Map<String, FieldInfo<T>> fieldInfo) {
- this.tableName = tableName;
- this.fieldInfo = fieldInfo;
- }
-
- /** Define a class to hold information about output table field definitions. */
- public static class FieldInfo<T> implements Serializable {
- // The BigQuery 'type' of the field
- private String fieldType;
- // A lambda function to generate the field value
- private SerializableFunction<DoFn<T, TableRow>.ProcessContext, Object> fieldFn;
-
- public FieldInfo(String fieldType,
- SerializableFunction<DoFn<T, TableRow>.ProcessContext, Object> fieldFn) {
- this.fieldType = fieldType;
- this.fieldFn = fieldFn;
- }
-
- String getFieldType() {
- return this.fieldType;
- }
-
- SerializableFunction<DoFn<T, TableRow>.ProcessContext, Object> getFieldFn() {
- return this.fieldFn;
- }
- }
- /** Convert each key/score pair into a BigQuery TableRow as specified by fieldFn. */
- protected class BuildRowFn extends DoFn<T, TableRow> {
-
- @Override
- public void processElement(ProcessContext c) {
-
- TableRow row = new TableRow();
- for (Map.Entry<String, FieldInfo<T>> entry : fieldInfo.entrySet()) {
- String key = entry.getKey();
- FieldInfo<T> fcnInfo = entry.getValue();
- SerializableFunction<DoFn<T, TableRow>.ProcessContext, Object> fcn =
- fcnInfo.getFieldFn();
- row.set(key, fcn.apply(c));
- }
- c.output(row);
- }
- }
-
- /** Build the output table schema. */
- protected TableSchema getSchema() {
- List<TableFieldSchema> fields = new ArrayList<>();
- for (Map.Entry<String, FieldInfo<T>> entry : fieldInfo.entrySet()) {
- String key = entry.getKey();
- FieldInfo<T> fcnInfo = entry.getValue();
- String bqType = fcnInfo.getFieldType();
- fields.add(new TableFieldSchema().setName(key).setType(bqType));
- }
- return new TableSchema().setFields(fields);
- }
-
- @Override
- public PDone apply(PCollection<T> teamAndScore) {
- return teamAndScore
- .apply(ParDo.named("ConvertToRow").of(new BuildRowFn()))
- .apply(BigQueryIO.Write
- .to(getTable(teamAndScore.getPipeline(),
- tableName))
- .withSchema(getSchema())
- .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
- .withWriteDisposition(WriteDisposition.WRITE_APPEND));
- }
-
- /** Utility to construct an output table reference. */
- static TableReference getTable(Pipeline pipeline, String tableName) {
- PipelineOptions options = pipeline.getOptions();
- TableReference table = new TableReference();
- table.setDatasetId(options.as(UserScore.Options.class).getDataset());
- table.setProjectId(options.as(GcpOptions.class).getProject());
- table.setTableId(tableName);
- return table;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/utils/WriteWindowedToBigQuery.java
----------------------------------------------------------------------
diff --git a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/utils/WriteWindowedToBigQuery.java b/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/utils/WriteWindowedToBigQuery.java
deleted file mode 100644
index 8433021..0000000
--- a/java8examples/src/main/java/com/google/cloud/dataflow/examples/complete/game/utils/WriteWindowedToBigQuery.java
+++ /dev/null
@@ -1,76 +0,0 @@
- /*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete.game.utils;
-
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.CreateDisposition;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.WriteDisposition;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.DoFn.RequiresWindowAccess;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PDone;
-
-import java.util.Map;
-
-/**
- * Generate, format, and write BigQuery table row information. Subclasses {@link WriteToBigQuery}
- * to require windowing; so this subclass may be used for writes that require access to the
- * context's window information.
- */
-public class WriteWindowedToBigQuery<T>
- extends WriteToBigQuery<T> {
-
- public WriteWindowedToBigQuery(String tableName,
- Map<String, FieldInfo<T>> fieldInfo) {
- super(tableName, fieldInfo);
- }
-
- /** Convert each key/score pair into a BigQuery TableRow. */
- protected class BuildRowFn extends DoFn<T, TableRow>
- implements RequiresWindowAccess {
-
- @Override
- public void processElement(ProcessContext c) {
-
- TableRow row = new TableRow();
- for (Map.Entry<String, FieldInfo<T>> entry : fieldInfo.entrySet()) {
- String key = entry.getKey();
- FieldInfo<T> fcnInfo = entry.getValue();
- SerializableFunction<DoFn<T, TableRow>.ProcessContext, Object> fcn =
- fcnInfo.getFieldFn();
- row.set(key, fcn.apply(c));
- }
- c.output(row);
- }
- }
-
- @Override
- public PDone apply(PCollection<T> teamAndScore) {
- return teamAndScore
- .apply(ParDo.named("ConvertToRow").of(new BuildRowFn()))
- .apply(BigQueryIO.Write
- .to(getTable(teamAndScore.getPipeline(),
- tableName))
- .withSchema(getSchema())
- .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
- .withWriteDisposition(WriteDisposition.WRITE_APPEND));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/java8examples/src/test/java/com/google/cloud/dataflow/examples/MinimalWordCountJava8Test.java
----------------------------------------------------------------------
diff --git a/java8examples/src/test/java/com/google/cloud/dataflow/examples/MinimalWordCountJava8Test.java b/java8examples/src/test/java/com/google/cloud/dataflow/examples/MinimalWordCountJava8Test.java
deleted file mode 100644
index fcae41c..0000000
--- a/java8examples/src/test/java/com/google/cloud/dataflow/examples/MinimalWordCountJava8Test.java
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.GcsOptions;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.transforms.Count;
-import com.google.cloud.dataflow.sdk.transforms.Filter;
-import com.google.cloud.dataflow.sdk.transforms.FlatMapElements;
-import com.google.cloud.dataflow.sdk.transforms.MapElements;
-import com.google.cloud.dataflow.sdk.util.GcsUtil;
-import com.google.cloud.dataflow.sdk.util.gcsfs.GcsPath;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-import com.google.common.collect.ImmutableList;
-
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-import org.mockito.Mockito;
-import org.mockito.invocation.InvocationOnMock;
-import org.mockito.stubbing.Answer;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.nio.channels.FileChannel;
-import java.nio.channels.SeekableByteChannel;
-import java.nio.file.Files;
-import java.nio.file.StandardOpenOption;
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * To keep {@link MinimalWordCountJava8} simple, it is not factored or testable. This test
- * file should be maintained with a copy of its code for a basic smoke test.
- */
-@RunWith(JUnit4.class)
-public class MinimalWordCountJava8Test implements Serializable {
-
- /**
- * A basic smoke test that ensures there is no crash at pipeline construction time.
- */
- @Test
- public void testMinimalWordCountJava8() throws Exception {
- Pipeline p = TestPipeline.create();
- p.getOptions().as(GcsOptions.class).setGcsUtil(buildMockGcsUtil());
-
- p.apply(TextIO.Read.from("gs://dataflow-samples/shakespeare/*"))
- .apply(FlatMapElements.via((String word) -> Arrays.asList(word.split("[^a-zA-Z']+")))
- .withOutputType(new TypeDescriptor<String>() {}))
- .apply(Filter.byPredicate((String word) -> !word.isEmpty()))
- .apply(Count.<String>perElement())
- .apply(MapElements
- .via((KV<String, Long> wordCount) -> wordCount.getKey() + ": " + wordCount.getValue())
- .withOutputType(new TypeDescriptor<String>() {}))
- .apply(TextIO.Write.to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX"));
- }
-
- private GcsUtil buildMockGcsUtil() throws IOException {
- GcsUtil mockGcsUtil = Mockito.mock(GcsUtil.class);
-
- // Any request to open gets a new bogus channel
- Mockito
- .when(mockGcsUtil.open(Mockito.any(GcsPath.class)))
- .then(new Answer<SeekableByteChannel>() {
- @Override
- public SeekableByteChannel answer(InvocationOnMock invocation) throws Throwable {
- return FileChannel.open(
- Files.createTempFile("channel-", ".tmp"),
- StandardOpenOption.CREATE, StandardOpenOption.DELETE_ON_CLOSE);
- }
- });
-
- // Any request for expansion returns a list containing the original GcsPath
- // This is required to pass validation that occurs in TextIO during apply()
- Mockito
- .when(mockGcsUtil.expand(Mockito.any(GcsPath.class)))
- .then(new Answer<List<GcsPath>>() {
- @Override
- public List<GcsPath> answer(InvocationOnMock invocation) throws Throwable {
- return ImmutableList.of((GcsPath) invocation.getArguments()[0]);
- }
- });
-
- return mockGcsUtil;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/java8examples/src/test/java/com/google/cloud/dataflow/examples/complete/game/GameStatsTest.java
----------------------------------------------------------------------
diff --git a/java8examples/src/test/java/com/google/cloud/dataflow/examples/complete/game/GameStatsTest.java b/java8examples/src/test/java/com/google/cloud/dataflow/examples/complete/game/GameStatsTest.java
deleted file mode 100644
index f77d146..0000000
--- a/java8examples/src/test/java/com/google/cloud/dataflow/examples/complete/game/GameStatsTest.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete.game;
-
-import com.google.cloud.dataflow.examples.complete.game.GameStats.CalculateSpammyUsers;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.junit.Test;
-import org.junit.experimental.categories.Category;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.io.Serializable;
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * Tests of GameStats.
- * Because the pipeline was designed for easy readability and explanations, it lacks good
- * modularity for testing. See our testing documentation for better ideas:
- * https://cloud.google.com/dataflow/pipelines/testing-your-pipeline.
- */
-@RunWith(JUnit4.class)
-public class GameStatsTest implements Serializable {
-
- // User scores
- static final List<KV<String, Integer>> USER_SCORES = Arrays.asList(
- KV.of("Robot-2", 66), KV.of("Robot-1", 116), KV.of("user7_AndroidGreenKookaburra", 23),
- KV.of("user7_AndroidGreenKookaburra", 1),
- KV.of("user19_BisqueBilby", 14), KV.of("user13_ApricotQuokka", 15),
- KV.of("user18_BananaEmu", 25), KV.of("user6_AmberEchidna", 8),
- KV.of("user2_AmberQuokka", 6), KV.of("user0_MagentaKangaroo", 4),
- KV.of("user0_MagentaKangaroo", 3), KV.of("user2_AmberCockatoo", 13),
- KV.of("user7_AlmondWallaby", 15), KV.of("user6_AmberNumbat", 11),
- KV.of("user6_AmberQuokka", 4));
-
- // The expected list of 'spammers'.
- static final List<KV<String, Integer>> SPAMMERS = Arrays.asList(
- KV.of("Robot-2", 66), KV.of("Robot-1", 116));
-
- /** Test the calculation of 'spammy users'. */
- @Test
- @Category(RunnableOnService.class)
- public void testCalculateSpammyUsers() throws Exception {
- Pipeline p = TestPipeline.create();
-
- PCollection<KV<String, Integer>> input = p.apply(Create.of(USER_SCORES));
- PCollection<KV<String, Integer>> output = input.apply(new CalculateSpammyUsers());
-
- // Check the set of spammers.
- DataflowAssert.that(output).containsInAnyOrder(SPAMMERS);
-
- p.run();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/java8examples/src/test/java/com/google/cloud/dataflow/examples/complete/game/HourlyTeamScoreTest.java
----------------------------------------------------------------------
diff --git a/java8examples/src/test/java/com/google/cloud/dataflow/examples/complete/game/HourlyTeamScoreTest.java b/java8examples/src/test/java/com/google/cloud/dataflow/examples/complete/game/HourlyTeamScoreTest.java
deleted file mode 100644
index f77a5d4..0000000
--- a/java8examples/src/test/java/com/google/cloud/dataflow/examples/complete/game/HourlyTeamScoreTest.java
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete.game;
-
-import com.google.cloud.dataflow.examples.complete.game.UserScore.GameActionInfo;
-import com.google.cloud.dataflow.examples.complete.game.UserScore.ParseEventFn;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.Filter;
-import com.google.cloud.dataflow.sdk.transforms.MapElements;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-import org.joda.time.Instant;
-import org.junit.Test;
-import org.junit.experimental.categories.Category;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.io.Serializable;
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * Tests of HourlyTeamScore.
- * Because the pipeline was designed for easy readability and explanations, it lacks good
- * modularity for testing. See our testing documentation for better ideas:
- * https://cloud.google.com/dataflow/pipelines/testing-your-pipeline.
- */
-@RunWith(JUnit4.class)
-public class HourlyTeamScoreTest implements Serializable {
-
- static final String[] GAME_EVENTS_ARRAY = new String[] {
- "user0_MagentaKangaroo,MagentaKangaroo,3,1447955630000,2015-11-19 09:53:53.444",
- "user13_ApricotQuokka,ApricotQuokka,15,1447955630000,2015-11-19 09:53:53.444",
- "user6_AmberNumbat,AmberNumbat,11,1447955630000,2015-11-19 09:53:53.444",
- "user7_AlmondWallaby,AlmondWallaby,15,1447955630000,2015-11-19 09:53:53.444",
- "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,12,1447955630000,2015-11-19 09:53:53.444",
- "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,11,1447955630000,2015-11-19 09:53:53.444",
- "user19_BisqueBilby,BisqueBilby,6,1447955630000,2015-11-19 09:53:53.444",
- "user19_BisqueBilby,BisqueBilby,8,1447955630000,2015-11-19 09:53:53.444",
- // time gap...
- "user0_AndroidGreenEchidna,AndroidGreenEchidna,0,1447965690000,2015-11-19 12:41:31.053",
- "user0_MagentaKangaroo,MagentaKangaroo,4,1447965690000,2015-11-19 12:41:31.053",
- "user2_AmberCockatoo,AmberCockatoo,13,1447965690000,2015-11-19 12:41:31.053",
- "user18_BananaEmu,BananaEmu,7,1447965690000,2015-11-19 12:41:31.053",
- "user3_BananaEmu,BananaEmu,17,1447965690000,2015-11-19 12:41:31.053",
- "user18_BananaEmu,BananaEmu,1,1447965690000,2015-11-19 12:41:31.053",
- "user18_ApricotCaneToad,ApricotCaneToad,14,1447965690000,2015-11-19 12:41:31.053"
- };
-
-
- static final List<String> GAME_EVENTS = Arrays.asList(GAME_EVENTS_ARRAY);
-
-
- // Used to check the filtering.
- static final KV[] FILTERED_EVENTS = new KV[] {
- KV.of("user0_AndroidGreenEchidna", 0), KV.of("user0_MagentaKangaroo", 4),
- KV.of("user2_AmberCockatoo", 13),
- KV.of("user18_BananaEmu", 7), KV.of("user3_BananaEmu", 17),
- KV.of("user18_BananaEmu", 1), KV.of("user18_ApricotCaneToad", 14)
- };
-
-
- /** Test the filtering. */
- @Test
- @Category(RunnableOnService.class)
- public void testUserScoresFilter() throws Exception {
- Pipeline p = TestPipeline.create();
-
- final Instant startMinTimestamp = new Instant(1447965680000L);
-
- PCollection<String> input = p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of()));
-
- PCollection<KV<String, Integer>> output = input
- .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()))
-
- .apply("FilterStartTime", Filter.byPredicate(
- (GameActionInfo gInfo)
- -> gInfo.getTimestamp() > startMinTimestamp.getMillis()))
- // run a map to access the fields in the result.
- .apply(MapElements
- .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore()))
- .withOutputType(new TypeDescriptor<KV<String, Integer>>() {}));
-
- DataflowAssert.that(output).containsInAnyOrder(FILTERED_EVENTS);
-
- p.run();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/java8examples/src/test/java/com/google/cloud/dataflow/examples/complete/game/UserScoreTest.java
----------------------------------------------------------------------
diff --git a/java8examples/src/test/java/com/google/cloud/dataflow/examples/complete/game/UserScoreTest.java b/java8examples/src/test/java/com/google/cloud/dataflow/examples/complete/game/UserScoreTest.java
deleted file mode 100644
index 641e2c3..0000000
--- a/java8examples/src/test/java/com/google/cloud/dataflow/examples/complete/game/UserScoreTest.java
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete.game;
-
-import com.google.cloud.dataflow.examples.complete.game.UserScore.ExtractAndSumScore;
-import com.google.cloud.dataflow.examples.complete.game.UserScore.GameActionInfo;
-import com.google.cloud.dataflow.examples.complete.game.UserScore.ParseEventFn;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
-import com.google.cloud.dataflow.sdk.transforms.MapElements;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-import org.junit.Assert;
-import org.junit.Test;
-import org.junit.experimental.categories.Category;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.io.Serializable;
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * Tests of UserScore.
- */
-@RunWith(JUnit4.class)
-public class UserScoreTest implements Serializable {
-
- static final String[] GAME_EVENTS_ARRAY = new String[] {
- "user0_MagentaKangaroo,MagentaKangaroo,3,1447955630000,2015-11-19 09:53:53.444",
- "user13_ApricotQuokka,ApricotQuokka,15,1447955630000,2015-11-19 09:53:53.444",
- "user6_AmberNumbat,AmberNumbat,11,1447955630000,2015-11-19 09:53:53.444",
- "user7_AlmondWallaby,AlmondWallaby,15,1447955630000,2015-11-19 09:53:53.444",
- "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,12,1447955630000,2015-11-19 09:53:53.444",
- "user6_AliceBlueDingo,AliceBlueDingo,4,xxxxxxx,2015-11-19 09:53:53.444",
- "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,11,1447955630000,2015-11-19 09:53:53.444",
- "THIS IS A PARSE ERROR,2015-11-19 09:53:53.444",
- "user19_BisqueBilby,BisqueBilby,6,1447955630000,2015-11-19 09:53:53.444",
- "user19_BisqueBilby,BisqueBilby,8,1447955630000,2015-11-19 09:53:53.444"
- };
-
- static final String[] GAME_EVENTS_ARRAY2 = new String[] {
- "user6_AliceBlueDingo,AliceBlueDingo,4,xxxxxxx,2015-11-19 09:53:53.444",
- "THIS IS A PARSE ERROR,2015-11-19 09:53:53.444",
- "user13_BisqueBilby,BisqueBilby,xxx,1447955630000,2015-11-19 09:53:53.444"
- };
-
- static final List<String> GAME_EVENTS = Arrays.asList(GAME_EVENTS_ARRAY);
- static final List<String> GAME_EVENTS2 = Arrays.asList(GAME_EVENTS_ARRAY2);
-
- static final List<KV<String, Integer>> USER_SUMS = Arrays.asList(
- KV.of("user0_MagentaKangaroo", 3), KV.of("user13_ApricotQuokka", 15),
- KV.of("user6_AmberNumbat", 11), KV.of("user7_AlmondWallaby", 15),
- KV.of("user7_AndroidGreenKookaburra", 23),
- KV.of("user19_BisqueBilby", 14));
-
- static final List<KV<String, Integer>> TEAM_SUMS = Arrays.asList(
- KV.of("MagentaKangaroo", 3), KV.of("ApricotQuokka", 15),
- KV.of("AmberNumbat", 11), KV.of("AlmondWallaby", 15),
- KV.of("AndroidGreenKookaburra", 23),
- KV.of("BisqueBilby", 14));
-
- /** Test the ParseEventFn DoFn. */
- @Test
- public void testParseEventFn() {
- DoFnTester<String, GameActionInfo> parseEventFn =
- DoFnTester.of(new ParseEventFn());
-
- List<GameActionInfo> results = parseEventFn.processBatch(GAME_EVENTS_ARRAY);
- Assert.assertEquals(results.size(), 8);
- Assert.assertEquals(results.get(0).getUser(), "user0_MagentaKangaroo");
- Assert.assertEquals(results.get(0).getTeam(), "MagentaKangaroo");
- Assert.assertEquals(results.get(0).getScore(), new Integer(3));
- }
-
- /** Tests ExtractAndSumScore("user"). */
- @Test
- @Category(RunnableOnService.class)
- public void testUserScoreSums() throws Exception {
- Pipeline p = TestPipeline.create();
-
- PCollection<String> input = p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of()));
-
- PCollection<KV<String, Integer>> output = input
- .apply(ParDo.of(new ParseEventFn()))
- // Extract and sum username/score pairs from the event data.
- .apply("ExtractUserScore", new ExtractAndSumScore("user"));
-
- // Check the user score sums.
- DataflowAssert.that(output).containsInAnyOrder(USER_SUMS);
-
- p.run();
- }
-
- /** Tests ExtractAndSumScore("team"). */
- @Test
- @Category(RunnableOnService.class)
- public void testTeamScoreSums() throws Exception {
- Pipeline p = TestPipeline.create();
-
- PCollection<String> input = p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of()));
-
- PCollection<KV<String, Integer>> output = input
- .apply(ParDo.of(new ParseEventFn()))
- // Extract and sum teamname/score pairs from the event data.
- .apply("ExtractTeamScore", new ExtractAndSumScore("team"));
-
- // Check the team score sums.
- DataflowAssert.that(output).containsInAnyOrder(TEAM_SUMS);
-
- p.run();
- }
-
- /** Test that bad input data is dropped appropriately. */
- @Test
- @Category(RunnableOnService.class)
- public void testUserScoresBadInput() throws Exception {
- Pipeline p = TestPipeline.create();
-
- PCollection<String> input = p.apply(Create.of(GAME_EVENTS2).withCoder(StringUtf8Coder.of()));
-
- PCollection<KV<String, Integer>> extract = input
- .apply(ParDo.of(new ParseEventFn()))
- .apply(
- MapElements.via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore()))
- .withOutputType(new TypeDescriptor<KV<String, Integer>>() {}));
-
- DataflowAssert.that(extract).empty();
-
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/11bb9e0e/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 2d0a3e1..3803007 100644
--- a/pom.xml
+++ b/pom.xml
@@ -147,7 +147,7 @@
<jdk>[1.8,)</jdk>
</activation>
<modules>
- <module>java8examples</module>
+ <module>examples/java8</module>
</modules>
</profile>
<profile>
[14/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BatchTimerInternals.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BatchTimerInternals.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BatchTimerInternals.java
deleted file mode 100644
index b6a1493..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BatchTimerInternals.java
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.common.base.MoreObjects;
-import com.google.common.base.Preconditions;
-
-import org.joda.time.Instant;
-
-import java.util.HashSet;
-import java.util.PriorityQueue;
-import java.util.Set;
-
-import javax.annotation.Nullable;
-
-/**
- * TimerInternals that uses priority queues to manage the timers that are ready to fire.
- */
-public class BatchTimerInternals implements TimerInternals {
- /** Set of timers that are scheduled used for deduplicating timers. */
- private Set<TimerData> existingTimers = new HashSet<>();
-
- // Keep these queues separate so we can advance over them separately.
- private PriorityQueue<TimerData> watermarkTimers = new PriorityQueue<>(11);
- private PriorityQueue<TimerData> processingTimers = new PriorityQueue<>(11);
-
- private Instant inputWatermarkTime;
- private Instant processingTime;
-
- private PriorityQueue<TimerData> queue(TimeDomain domain) {
- return TimeDomain.EVENT_TIME.equals(domain) ? watermarkTimers : processingTimers;
- }
-
- public BatchTimerInternals(Instant processingTime) {
- this.processingTime = processingTime;
- this.inputWatermarkTime = BoundedWindow.TIMESTAMP_MIN_VALUE;
- }
-
- @Override
- public void setTimer(TimerData timer) {
- if (existingTimers.add(timer)) {
- queue(timer.getDomain()).add(timer);
- }
- }
-
- @Override
- public void deleteTimer(TimerData timer) {
- existingTimers.remove(timer);
- queue(timer.getDomain()).remove(timer);
- }
-
- @Override
- public Instant currentProcessingTime() {
- return processingTime;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@link BoundedWindow#TIMESTAMP_MAX_VALUE}: in batch mode, upstream processing
- * is already complete.
- */
- @Override
- @Nullable
- public Instant currentSynchronizedProcessingTime() {
- return BoundedWindow.TIMESTAMP_MAX_VALUE;
- }
-
- @Override
- public Instant currentInputWatermarkTime() {
- return inputWatermarkTime;
- }
-
- @Override
- @Nullable
- public Instant currentOutputWatermarkTime() {
- // The output watermark is always undefined in batch mode.
- return null;
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(getClass())
- .add("watermarkTimers", watermarkTimers)
- .add("processingTimers", processingTimers)
- .toString();
- }
-
- public void advanceInputWatermark(ReduceFnRunner<?, ?, ?, ?> runner, Instant newInputWatermark)
- throws Exception {
- Preconditions.checkState(!newInputWatermark.isBefore(inputWatermarkTime),
- "Cannot move input watermark time backwards from %s to %s", inputWatermarkTime,
- newInputWatermark);
- inputWatermarkTime = newInputWatermark;
- advance(runner, newInputWatermark, TimeDomain.EVENT_TIME);
- }
-
- public void advanceProcessingTime(ReduceFnRunner<?, ?, ?, ?> runner, Instant newProcessingTime)
- throws Exception {
- Preconditions.checkState(!newProcessingTime.isBefore(processingTime),
- "Cannot move processing time backwards from %s to %s", processingTime, newProcessingTime);
- processingTime = newProcessingTime;
- advance(runner, newProcessingTime, TimeDomain.PROCESSING_TIME);
- }
-
- private void advance(ReduceFnRunner<?, ?, ?, ?> runner, Instant newTime, TimeDomain domain)
- throws Exception {
- PriorityQueue<TimerData> timers = queue(domain);
- boolean shouldFire = false;
-
- do {
- TimerData timer = timers.peek();
- // Timers fire if the new time is ahead of the timer
- shouldFire = timer != null && newTime.isAfter(timer.getTimestamp());
- if (shouldFire) {
- // Remove before firing, so that if the trigger adds another identical
- // timer we don't remove it.
- timers.remove();
- runner.onTimer(timer);
- }
- } while (shouldFire);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableInserter.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableInserter.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableInserter.java
deleted file mode 100644
index cd51062..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableInserter.java
+++ /dev/null
@@ -1,434 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.client.util.BackOff;
-import com.google.api.client.util.BackOffUtils;
-import com.google.api.client.util.ExponentialBackOff;
-import com.google.api.client.util.Sleeper;
-import com.google.api.services.bigquery.Bigquery;
-import com.google.api.services.bigquery.model.Table;
-import com.google.api.services.bigquery.model.TableDataInsertAllRequest;
-import com.google.api.services.bigquery.model.TableDataInsertAllResponse;
-import com.google.api.services.bigquery.model.TableDataList;
-import com.google.api.services.bigquery.model.TableReference;
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.CreateDisposition;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.WriteDisposition;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.hadoop.util.ApiErrorExtractor;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Preconditions;
-import com.google.common.base.Throwables;
-import com.google.common.util.concurrent.MoreExecutors;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-import java.util.concurrent.ThreadPoolExecutor;
-import java.util.concurrent.TimeUnit;
-
-import javax.annotation.Nullable;
-
-/**
- * Inserts rows into BigQuery.
- */
-public class BigQueryTableInserter {
- private static final Logger LOG = LoggerFactory.getLogger(BigQueryTableInserter.class);
-
- // Approximate amount of table data to upload per InsertAll request.
- private static final long UPLOAD_BATCH_SIZE_BYTES = 64 * 1024;
-
- // The maximum number of rows to upload per InsertAll request.
- private static final long MAX_ROWS_PER_BATCH = 500;
-
- // The maximum number of times to retry inserting rows into BigQuery.
- private static final int MAX_INSERT_ATTEMPTS = 5;
-
- // The initial backoff after a failure inserting rows into BigQuery.
- private static final long INITIAL_INSERT_BACKOFF_INTERVAL_MS = 200L;
-
- private final Bigquery client;
- private final TableReference defaultRef;
- private final long maxRowsPerBatch;
-
- private static final ExecutorService executor = MoreExecutors.getExitingExecutorService(
- (ThreadPoolExecutor) Executors.newFixedThreadPool(100), 10, TimeUnit.SECONDS);
-
- /**
- * Constructs a new row inserter.
- *
- * @param client a BigQuery client
- */
- public BigQueryTableInserter(Bigquery client) {
- this.client = client;
- this.defaultRef = null;
- this.maxRowsPerBatch = MAX_ROWS_PER_BATCH;
- }
-
- /**
- * Constructs a new row inserter.
- *
- * @param client a BigQuery client
- * @param defaultRef identifies the table to insert into
- * @deprecated replaced by {@link #BigQueryTableInserter(Bigquery)}
- */
- @Deprecated
- public BigQueryTableInserter(Bigquery client, TableReference defaultRef) {
- this.client = client;
- this.defaultRef = defaultRef;
- this.maxRowsPerBatch = MAX_ROWS_PER_BATCH;
- }
-
- /**
- * Constructs a new row inserter.
- *
- * @param client a BigQuery client
- */
- public BigQueryTableInserter(Bigquery client, int maxRowsPerBatch) {
- this.client = client;
- this.defaultRef = null;
- this.maxRowsPerBatch = maxRowsPerBatch;
- }
-
- /**
- * Constructs a new row inserter.
- *
- * @param client a BigQuery client
- * @param defaultRef identifies the default table to insert into
- * @deprecated replaced by {@link #BigQueryTableInserter(Bigquery, int)}
- */
- @Deprecated
- public BigQueryTableInserter(Bigquery client, TableReference defaultRef, int maxRowsPerBatch) {
- this.client = client;
- this.defaultRef = defaultRef;
- this.maxRowsPerBatch = maxRowsPerBatch;
- }
-
- /**
- * Insert all rows from the given list.
- *
- * @deprecated replaced by {@link #insertAll(TableReference, List)}
- */
- @Deprecated
- public void insertAll(List<TableRow> rowList) throws IOException {
- insertAll(defaultRef, rowList, null, null);
- }
-
- /**
- * Insert all rows from the given list using specified insertIds if not null.
- *
- * @deprecated replaced by {@link #insertAll(TableReference, List, List)}
- */
- @Deprecated
- public void insertAll(List<TableRow> rowList,
- @Nullable List<String> insertIdList) throws IOException {
- insertAll(defaultRef, rowList, insertIdList, null);
- }
-
- /**
- * Insert all rows from the given list.
- */
- public void insertAll(TableReference ref, List<TableRow> rowList) throws IOException {
- insertAll(ref, rowList, null, null);
- }
-
- /**
- * Insert all rows from the given list using specified insertIds if not null. Track count of
- * bytes written with the Aggregator.
- */
- public void insertAll(TableReference ref, List<TableRow> rowList,
- @Nullable List<String> insertIdList, Aggregator<Long, Long> byteCountAggregator)
- throws IOException {
- Preconditions.checkNotNull(ref, "ref");
- if (insertIdList != null && rowList.size() != insertIdList.size()) {
- throw new AssertionError("If insertIdList is not null it needs to have at least "
- + "as many elements as rowList");
- }
-
- AttemptBoundedExponentialBackOff backoff = new AttemptBoundedExponentialBackOff(
- MAX_INSERT_ATTEMPTS,
- INITIAL_INSERT_BACKOFF_INTERVAL_MS);
-
- List<TableDataInsertAllResponse.InsertErrors> allErrors = new ArrayList<>();
- // These lists contain the rows to publish. Initially the contain the entire list. If there are
- // failures, they will contain only the failed rows to be retried.
- List<TableRow> rowsToPublish = rowList;
- List<String> idsToPublish = insertIdList;
- while (true) {
- List<TableRow> retryRows = new ArrayList<>();
- List<String> retryIds = (idsToPublish != null) ? new ArrayList<String>() : null;
-
- int strideIndex = 0;
- // Upload in batches.
- List<TableDataInsertAllRequest.Rows> rows = new LinkedList<>();
- int dataSize = 0;
-
- List<Future<List<TableDataInsertAllResponse.InsertErrors>>> futures = new ArrayList<>();
- List<Integer> strideIndices = new ArrayList<>();
-
- for (int i = 0; i < rowsToPublish.size(); ++i) {
- TableRow row = rowsToPublish.get(i);
- TableDataInsertAllRequest.Rows out = new TableDataInsertAllRequest.Rows();
- if (idsToPublish != null) {
- out.setInsertId(idsToPublish.get(i));
- }
- out.setJson(row.getUnknownKeys());
- rows.add(out);
-
- dataSize += row.toString().length();
- if (dataSize >= UPLOAD_BATCH_SIZE_BYTES || rows.size() >= maxRowsPerBatch ||
- i == rowsToPublish.size() - 1) {
- TableDataInsertAllRequest content = new TableDataInsertAllRequest();
- content.setRows(rows);
-
- final Bigquery.Tabledata.InsertAll insert = client.tabledata()
- .insertAll(ref.getProjectId(), ref.getDatasetId(), ref.getTableId(),
- content);
-
- futures.add(
- executor.submit(new Callable<List<TableDataInsertAllResponse.InsertErrors>>() {
- @Override
- public List<TableDataInsertAllResponse.InsertErrors> call() throws IOException {
- return insert.execute().getInsertErrors();
- }
- }));
- strideIndices.add(strideIndex);
-
- if (byteCountAggregator != null) {
- byteCountAggregator.addValue(Long.valueOf(dataSize));
- }
- dataSize = 0;
- strideIndex = i + 1;
- rows = new LinkedList<>();
- }
- }
-
- try {
- for (int i = 0; i < futures.size(); i++) {
- List<TableDataInsertAllResponse.InsertErrors> errors = futures.get(i).get();
- if (errors != null) {
- for (TableDataInsertAllResponse.InsertErrors error : errors) {
- allErrors.add(error);
- if (error.getIndex() == null) {
- throw new IOException("Insert failed: " + allErrors);
- }
-
- int errorIndex = error.getIndex().intValue() + strideIndices.get(i);
- retryRows.add(rowsToPublish.get(errorIndex));
- if (retryIds != null) {
- retryIds.add(idsToPublish.get(errorIndex));
- }
- }
- }
- }
- } catch (InterruptedException e) {
- throw new IOException("Interrupted while inserting " + rowsToPublish);
- } catch (ExecutionException e) {
- Throwables.propagate(e.getCause());
- }
-
- if (!allErrors.isEmpty() && !backoff.atMaxAttempts()) {
- try {
- Thread.sleep(backoff.nextBackOffMillis());
- } catch (InterruptedException e) {
- throw new IOException("Interrupted while waiting before retrying insert of " + retryRows);
- }
- LOG.info("Retrying failed inserts to BigQuery");
- rowsToPublish = retryRows;
- idsToPublish = retryIds;
- allErrors.clear();
- } else {
- break;
- }
- }
- if (!allErrors.isEmpty()) {
- throw new IOException("Insert failed: " + allErrors);
- }
- }
-
- /**
- * Retrieves or creates the table.
- *
- * <p>The table is checked to conform to insertion requirements as specified
- * by WriteDisposition and CreateDisposition.
- *
- * <p>If table truncation is requested (WriteDisposition.WRITE_TRUNCATE), then
- * this will re-create the table if necessary to ensure it is empty.
- *
- * <p>If an empty table is required (WriteDisposition.WRITE_EMPTY), then this
- * will fail if the table exists and is not empty.
- *
- * <p>When constructing a table, a {@code TableSchema} must be available. If a
- * schema is provided, then it will be used. If no schema is provided, but
- * an existing table is being cleared (WRITE_TRUNCATE option above), then
- * the existing schema will be re-used. If no schema is available, then an
- * {@code IOException} is thrown.
- */
- public Table getOrCreateTable(
- TableReference ref,
- WriteDisposition writeDisposition,
- CreateDisposition createDisposition,
- @Nullable TableSchema schema) throws IOException {
- // Check if table already exists.
- Bigquery.Tables.Get get = client.tables()
- .get(ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
- Table table = null;
- try {
- table = get.execute();
- } catch (IOException e) {
- ApiErrorExtractor errorExtractor = new ApiErrorExtractor();
- if (!errorExtractor.itemNotFound(e) ||
- createDisposition != CreateDisposition.CREATE_IF_NEEDED) {
- // Rethrow.
- throw e;
- }
- }
-
- // If we want an empty table, and it isn't, then delete it first.
- if (table != null) {
- if (writeDisposition == WriteDisposition.WRITE_APPEND) {
- return table;
- }
-
- boolean empty = isEmpty(ref);
- if (empty) {
- if (writeDisposition == WriteDisposition.WRITE_TRUNCATE) {
- LOG.info("Empty table found, not removing {}", BigQueryIO.toTableSpec(ref));
- }
- return table;
-
- } else if (writeDisposition == WriteDisposition.WRITE_EMPTY) {
- throw new IOException("WriteDisposition is WRITE_EMPTY, "
- + "but table is not empty");
- }
-
- // Reuse the existing schema if none was provided.
- if (schema == null) {
- schema = table.getSchema();
- }
-
- // Delete table and fall through to re-creating it below.
- LOG.info("Deleting table {}", BigQueryIO.toTableSpec(ref));
- Bigquery.Tables.Delete delete = client.tables()
- .delete(ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
- delete.execute();
- }
-
- if (schema == null) {
- throw new IllegalArgumentException(
- "Table schema required for new table.");
- }
-
- // Create the table.
- return tryCreateTable(ref, schema);
- }
-
- /**
- * Checks if a table is empty.
- */
- public boolean isEmpty(TableReference ref) throws IOException {
- Bigquery.Tabledata.List list = client.tabledata()
- .list(ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
- list.setMaxResults(1L);
- TableDataList dataList = list.execute();
-
- return dataList.getRows() == null || dataList.getRows().isEmpty();
- }
-
- /**
- * Retry table creation up to 5 minutes (with exponential backoff) when this user is near the
- * quota for table creation. This relatively innocuous behavior can happen when BigQueryIO is
- * configured with a table spec function to use different tables for each window.
- */
- private static final int RETRY_CREATE_TABLE_DURATION_MILLIS = (int) TimeUnit.MINUTES.toMillis(5);
-
- /**
- * Tries to create the BigQuery table.
- * If a table with the same name already exists in the dataset, the table
- * creation fails, and the function returns null. In such a case,
- * the existing table doesn't necessarily have the same schema as specified
- * by the parameter.
- *
- * @param schema Schema of the new BigQuery table.
- * @return The newly created BigQuery table information, or null if the table
- * with the same name already exists.
- * @throws IOException if other error than already existing table occurs.
- */
- @Nullable
- public Table tryCreateTable(TableReference ref, TableSchema schema) throws IOException {
- LOG.info("Trying to create BigQuery table: {}", BigQueryIO.toTableSpec(ref));
- BackOff backoff =
- new ExponentialBackOff.Builder()
- .setMaxElapsedTimeMillis(RETRY_CREATE_TABLE_DURATION_MILLIS)
- .build();
-
- Table table = new Table().setTableReference(ref).setSchema(schema);
- return tryCreateTable(table, ref.getProjectId(), ref.getDatasetId(), backoff, Sleeper.DEFAULT);
- }
-
- @VisibleForTesting
- @Nullable
- Table tryCreateTable(
- Table table, String projectId, String datasetId, BackOff backoff, Sleeper sleeper)
- throws IOException {
- boolean retry = false;
- while (true) {
- try {
- return client.tables().insert(projectId, datasetId, table).execute();
- } catch (IOException e) {
- ApiErrorExtractor extractor = new ApiErrorExtractor();
- if (extractor.itemAlreadyExists(e)) {
- // The table already exists, nothing to return.
- return null;
- } else if (extractor.rateLimited(e)) {
- // The request failed because we hit a temporary quota. Back off and try again.
- try {
- if (BackOffUtils.next(sleeper, backoff)) {
- if (!retry) {
- LOG.info(
- "Quota limit reached when creating table {}:{}.{}, retrying up to {} minutes",
- projectId,
- datasetId,
- table.getTableReference().getTableId(),
- TimeUnit.MILLISECONDS.toSeconds(RETRY_CREATE_TABLE_DURATION_MILLIS) / 60.0);
- retry = true;
- }
- continue;
- }
- } catch (InterruptedException e1) {
- // Restore interrupted state and throw the last failure.
- Thread.currentThread().interrupt();
- throw e;
- }
- }
- throw e;
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIterator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIterator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIterator.java
deleted file mode 100644
index c2c80f7..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIterator.java
+++ /dev/null
@@ -1,469 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkNotNull;
-import static com.google.common.base.Preconditions.checkState;
-
-import com.google.api.client.googleapis.services.AbstractGoogleClientRequest;
-import com.google.api.client.util.BackOff;
-import com.google.api.client.util.BackOffUtils;
-import com.google.api.client.util.ClassInfo;
-import com.google.api.client.util.Data;
-import com.google.api.client.util.Sleeper;
-import com.google.api.services.bigquery.Bigquery;
-import com.google.api.services.bigquery.Bigquery.Jobs.Insert;
-import com.google.api.services.bigquery.model.Dataset;
-import com.google.api.services.bigquery.model.DatasetReference;
-import com.google.api.services.bigquery.model.ErrorProto;
-import com.google.api.services.bigquery.model.Job;
-import com.google.api.services.bigquery.model.JobConfiguration;
-import com.google.api.services.bigquery.model.JobConfigurationQuery;
-import com.google.api.services.bigquery.model.JobReference;
-import com.google.api.services.bigquery.model.JobStatus;
-import com.google.api.services.bigquery.model.Table;
-import com.google.api.services.bigquery.model.TableCell;
-import com.google.api.services.bigquery.model.TableDataList;
-import com.google.api.services.bigquery.model.TableFieldSchema;
-import com.google.api.services.bigquery.model.TableReference;
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.common.base.MoreObjects;
-import com.google.common.collect.ImmutableList;
-
-import org.joda.time.Duration;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.NoSuchElementException;
-import java.util.Objects;
-import java.util.Random;
-
-import javax.annotation.Nullable;
-
-/**
- * Iterates over all rows in a table.
- */
-public class BigQueryTableRowIterator implements AutoCloseable {
- private static final Logger LOG = LoggerFactory.getLogger(BigQueryTableRowIterator.class);
-
- @Nullable private TableReference ref;
- @Nullable private final String projectId;
- @Nullable private TableSchema schema;
- private final Bigquery client;
- private String pageToken;
- private Iterator<TableRow> iteratorOverCurrentBatch;
- private TableRow current;
- // Set true when the final page is seen from the service.
- private boolean lastPage = false;
-
- // The maximum number of times a BigQuery request will be retried
- private static final int MAX_RETRIES = 3;
- // Initial wait time for the backoff implementation
- private static final Duration INITIAL_BACKOFF_TIME = Duration.standardSeconds(1);
-
- // After sending a query to BQ service we will be polling the BQ service to check the status with
- // following interval to check the status of query execution job
- private static final Duration QUERY_COMPLETION_POLL_TIME = Duration.standardSeconds(1);
-
- private final String query;
- // Whether to flatten query results.
- private final boolean flattenResults;
- // Temporary dataset used to store query results.
- private String temporaryDatasetId = null;
- // Temporary table used to store query results.
- private String temporaryTableId = null;
-
- private BigQueryTableRowIterator(
- @Nullable TableReference ref, @Nullable String query, @Nullable String projectId,
- Bigquery client, boolean flattenResults) {
- this.ref = ref;
- this.query = query;
- this.projectId = projectId;
- this.client = checkNotNull(client, "client");
- this.flattenResults = flattenResults;
- }
-
- /**
- * Constructs a {@code BigQueryTableRowIterator} that reads from the specified table.
- */
- public static BigQueryTableRowIterator fromTable(TableReference ref, Bigquery client) {
- checkNotNull(ref, "ref");
- checkNotNull(client, "client");
- return new BigQueryTableRowIterator(ref, null, ref.getProjectId(), client, true);
- }
-
- /**
- * Constructs a {@code BigQueryTableRowIterator} that reads from the results of executing the
- * specified query in the specified project.
- */
- public static BigQueryTableRowIterator fromQuery(
- String query, String projectId, Bigquery client, @Nullable Boolean flattenResults) {
- checkNotNull(query, "query");
- checkNotNull(projectId, "projectId");
- checkNotNull(client, "client");
- return new BigQueryTableRowIterator(null, query, projectId, client,
- MoreObjects.firstNonNull(flattenResults, Boolean.TRUE));
- }
-
- /**
- * Opens the table for read.
- * @throws IOException on failure
- */
- public void open() throws IOException, InterruptedException {
- if (query != null) {
- ref = executeQueryAndWaitForCompletion();
- }
- // Get table schema.
- Bigquery.Tables.Get get =
- client.tables().get(ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
-
- Table table =
- executeWithBackOff(
- get,
- "Error opening BigQuery table %s of dataset %s : {}",
- ref.getTableId(),
- ref.getDatasetId());
- schema = table.getSchema();
- }
-
- public boolean advance() throws IOException, InterruptedException {
- while (true) {
- if (iteratorOverCurrentBatch != null && iteratorOverCurrentBatch.hasNext()) {
- // Embed schema information into the raw row, so that values have an
- // associated key. This matches how rows are read when using the
- // DataflowPipelineRunner.
- current = getTypedTableRow(schema.getFields(), iteratorOverCurrentBatch.next());
- return true;
- }
- if (lastPage) {
- return false;
- }
-
- Bigquery.Tabledata.List list =
- client.tabledata().list(ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
- if (pageToken != null) {
- list.setPageToken(pageToken);
- }
-
- TableDataList result =
- executeWithBackOff(
- list,
- "Error reading from BigQuery table %s of dataset %s : {}",
- ref.getTableId(),
- ref.getDatasetId());
-
- pageToken = result.getPageToken();
- iteratorOverCurrentBatch =
- result.getRows() != null
- ? result.getRows().iterator()
- : Collections.<TableRow>emptyIterator();
-
- // The server may return a page token indefinitely on a zero-length table.
- if (pageToken == null || result.getTotalRows() != null && result.getTotalRows() == 0) {
- lastPage = true;
- }
- }
- }
-
- public TableRow getCurrent() {
- if (current == null) {
- throw new NoSuchElementException();
- }
- return current;
- }
-
- /**
- * Adjusts a field returned from the BigQuery API to match what we will receive when running
- * BigQuery's export-to-GCS and parallel read, which is the efficient parallel implementation
- * used for batch jobs executed on the Cloud Dataflow service.
- *
- * <p>The following is the relationship between BigQuery schema and Java types:
- *
- * <ul>
- * <li>Nulls are {@code null}.
- * <li>Repeated fields are {@code List} of objects.
- * <li>Record columns are {@link TableRow} objects.
- * <li>{@code BOOLEAN} columns are JSON booleans, hence Java {@code Boolean} objects.
- * <li>{@code FLOAT} columns are JSON floats, hence Java {@code Double} objects.
- * <li>{@code TIMESTAMP} columns are {@code String} objects that are of the format
- * {@code yyyy-MM-dd HH:mm:ss[.SSSSSS] UTC}, where the {@code .SSSSSS} has no trailing
- * zeros and can be 1 to 6 digits long.
- * <li>Every other atomic type is a {@code String}.
- * </ul>
- *
- * <p>Note that integers are encoded as strings to match BigQuery's exported JSON format.
- *
- * <p>Finally, values are stored in the {@link TableRow} as {"field name": value} pairs
- * and are not accessible through the {@link TableRow#getF} function.
- */
- @Nullable private Object getTypedCellValue(TableFieldSchema fieldSchema, Object v) {
- if (Data.isNull(v)) {
- return null;
- }
-
- if (Objects.equals(fieldSchema.getMode(), "REPEATED")) {
- TableFieldSchema elementSchema = fieldSchema.clone().setMode("REQUIRED");
- @SuppressWarnings("unchecked")
- List<Map<String, Object>> rawCells = (List<Map<String, Object>>) v;
- ImmutableList.Builder<Object> values = ImmutableList.builder();
- for (Map<String, Object> element : rawCells) {
- values.add(getTypedCellValue(elementSchema, element.get("v")));
- }
- return values.build();
- }
-
- if (fieldSchema.getType().equals("RECORD")) {
- @SuppressWarnings("unchecked")
- Map<String, Object> typedV = (Map<String, Object>) v;
- return getTypedTableRow(fieldSchema.getFields(), typedV);
- }
-
- if (fieldSchema.getType().equals("FLOAT")) {
- return Double.parseDouble((String) v);
- }
-
- if (fieldSchema.getType().equals("BOOLEAN")) {
- return Boolean.parseBoolean((String) v);
- }
-
- if (fieldSchema.getType().equals("TIMESTAMP")) {
- return AvroUtils.formatTimestamp((String) v);
- }
-
- return v;
- }
-
- /**
- * A list of the field names that cannot be used in BigQuery tables processed by Dataflow,
- * because they are reserved keywords in {@link TableRow}.
- */
- // TODO: This limitation is unfortunate. We need to give users a way to use BigQueryIO that does
- // not indirect through our broken use of {@link TableRow}.
- // See discussion: https://github.com/GoogleCloudPlatform/DataflowJavaSDK/pull/41
- private static final Collection<String> RESERVED_FIELD_NAMES =
- ClassInfo.of(TableRow.class).getNames();
-
- /**
- * Converts a row returned from the BigQuery JSON API as a {@code Map<String, Object>} into a
- * Java {@link TableRow} with nested {@link TableCell TableCells}. The {@code Object} values in
- * the cells are converted to Java types according to the provided field schemas.
- *
- * <p>See {@link #getTypedCellValue(TableFieldSchema, Object)} for details on how BigQuery
- * types are mapped to Java types.
- */
- private TableRow getTypedTableRow(List<TableFieldSchema> fields, Map<String, Object> rawRow) {
- // If rawRow is a TableRow, use it. If not, create a new one.
- TableRow row;
- List<? extends Map<String, Object>> cells;
- if (rawRow instanceof TableRow) {
- // Since rawRow is a TableRow it already has TableCell objects in setF. We do not need to do
- // any type conversion, but extract the cells for cell-wise processing below.
- row = (TableRow) rawRow;
- cells = row.getF();
- // Clear the cells from the row, so that row.getF() will return null. This matches the
- // behavior of rows produced by the BigQuery export API used on the service.
- row.setF(null);
- } else {
- row = new TableRow();
-
- // Since rawRow is a Map<String, Object> we use Map.get("f") instead of TableRow.getF() to
- // get its cells. Similarly, when rawCell is a Map<String, Object> instead of a TableCell,
- // we will use Map.get("v") instead of TableCell.getV() get its value.
- @SuppressWarnings("unchecked")
- List<? extends Map<String, Object>> rawCells =
- (List<? extends Map<String, Object>>) rawRow.get("f");
- cells = rawCells;
- }
-
- checkState(cells.size() == fields.size(),
- "Expected that the row has the same number of cells %s as fields in the schema %s",
- cells.size(), fields.size());
-
- // Loop through all the fields in the row, normalizing their types with the TableFieldSchema
- // and storing the normalized values by field name in the Map<String, Object> that
- // underlies the TableRow.
- Iterator<? extends Map<String, Object>> cellIt = cells.iterator();
- Iterator<TableFieldSchema> fieldIt = fields.iterator();
- while (cellIt.hasNext()) {
- Map<String, Object> cell = cellIt.next();
- TableFieldSchema fieldSchema = fieldIt.next();
-
- // Convert the object in this cell to the Java type corresponding to its type in the schema.
- Object convertedValue = getTypedCellValue(fieldSchema, cell.get("v"));
-
- String fieldName = fieldSchema.getName();
- checkArgument(!RESERVED_FIELD_NAMES.contains(fieldName),
- "BigQueryIO does not support records with columns named %s", fieldName);
-
- if (convertedValue == null) {
- // BigQuery does not include null values when the export operation (to JSON) is used.
- // To match that behavior, BigQueryTableRowiterator, and the DirectPipelineRunner,
- // intentionally omits columns with null values.
- continue;
- }
-
- row.set(fieldName, convertedValue);
- }
- return row;
- }
-
- // Create a new BigQuery dataset
- private void createDataset(String datasetId) throws IOException, InterruptedException {
- Dataset dataset = new Dataset();
- DatasetReference reference = new DatasetReference();
- reference.setProjectId(projectId);
- reference.setDatasetId(datasetId);
- dataset.setDatasetReference(reference);
-
- String createDatasetError =
- "Error when trying to create the temporary dataset " + datasetId + " in project "
- + projectId;
- executeWithBackOff(
- client.datasets().insert(projectId, dataset), createDatasetError + " :{}");
- }
-
- // Delete the given table that is available in the given dataset.
- private void deleteTable(String datasetId, String tableId)
- throws IOException, InterruptedException {
- executeWithBackOff(
- client.tables().delete(projectId, datasetId, tableId),
- "Error when trying to delete the temporary table " + datasetId + " in dataset " + datasetId
- + " of project " + projectId + ". Manual deletion may be required. Error message : {}");
- }
-
- // Delete the given dataset. This will fail if the given dataset has any tables.
- private void deleteDataset(String datasetId) throws IOException, InterruptedException {
- executeWithBackOff(
- client.datasets().delete(projectId, datasetId),
- "Error when trying to delete the temporary dataset " + datasetId + " in project "
- + projectId + ". Manual deletion may be required. Error message : {}");
- }
-
- /**
- * Executes the specified query and returns a reference to the temporary BigQuery table created
- * to hold the results.
- *
- * @throws IOException if the query fails.
- */
- private TableReference executeQueryAndWaitForCompletion()
- throws IOException, InterruptedException {
- // Create a temporary dataset to store results.
- // Starting dataset name with an "_" so that it is hidden.
- Random rnd = new Random(System.currentTimeMillis());
- temporaryDatasetId = "_dataflow_temporary_dataset_" + rnd.nextInt(1000000);
- temporaryTableId = "dataflow_temporary_table_" + rnd.nextInt(1000000);
-
- createDataset(temporaryDatasetId);
- Job job = new Job();
- JobConfiguration config = new JobConfiguration();
- JobConfigurationQuery queryConfig = new JobConfigurationQuery();
- config.setQuery(queryConfig);
- job.setConfiguration(config);
- queryConfig.setQuery(query);
- queryConfig.setAllowLargeResults(true);
- queryConfig.setFlattenResults(flattenResults);
-
- TableReference destinationTable = new TableReference();
- destinationTable.setProjectId(projectId);
- destinationTable.setDatasetId(temporaryDatasetId);
- destinationTable.setTableId(temporaryTableId);
- queryConfig.setDestinationTable(destinationTable);
-
- Insert insert = client.jobs().insert(projectId, job);
- Job queryJob = executeWithBackOff(
- insert, "Error when trying to execute the job for query " + query + " :{}");
- JobReference jobId = queryJob.getJobReference();
-
- while (true) {
- Job pollJob = executeWithBackOff(
- client.jobs().get(projectId, jobId.getJobId()),
- "Error when trying to get status of the job for query " + query + " :{}");
- JobStatus status = pollJob.getStatus();
- if (status.getState().equals("DONE")) {
- // Job is DONE, but did not necessarily succeed.
- ErrorProto error = status.getErrorResult();
- if (error == null) {
- return pollJob.getConfiguration().getQuery().getDestinationTable();
- } else {
- // There will be no temporary table to delete, so null out the reference.
- temporaryTableId = null;
- throw new IOException("Executing query " + query + " failed: " + error.getMessage());
- }
- }
- try {
- Thread.sleep(QUERY_COMPLETION_POLL_TIME.getMillis());
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- }
- }
-
- // Execute a BQ request with exponential backoff and return the result.
- // client - BQ request to be executed
- // error - Formatted message to log if when a request fails. Takes exception message as a
- // formatter parameter.
- public static <T> T executeWithBackOff(AbstractGoogleClientRequest<T> client, String error,
- Object... errorArgs) throws IOException, InterruptedException {
- Sleeper sleeper = Sleeper.DEFAULT;
- BackOff backOff =
- new AttemptBoundedExponentialBackOff(MAX_RETRIES, INITIAL_BACKOFF_TIME.getMillis());
-
- T result = null;
- while (true) {
- try {
- result = client.execute();
- break;
- } catch (IOException e) {
- LOG.error(String.format(error, errorArgs), e.getMessage());
- if (!BackOffUtils.next(sleeper, backOff)) {
- LOG.error(
- String.format(error, errorArgs), "Failing after retrying " + MAX_RETRIES + " times.");
- throw e;
- }
- }
- }
-
- return result;
- }
-
- @Override
- public void close() {
- // Prevent any further requests.
- lastPage = true;
-
- try {
- // Deleting temporary table and dataset that gets generated when executing a query.
- if (temporaryDatasetId != null) {
- if (temporaryTableId != null) {
- deleteTable(temporaryDatasetId, temporaryTableId);
- }
- deleteDataset(temporaryDatasetId);
- }
- } catch (IOException | InterruptedException e) {
- throw new RuntimeException(e);
- }
-
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BitSetCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BitSetCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BitSetCoder.java
deleted file mode 100644
index f3a039a..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BitSetCoder.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.coders.AtomicCoder;
-import com.google.cloud.dataflow.sdk.coders.ByteArrayCoder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.BitSet;
-
-/**
- * Coder for the BitSet used to track child-trigger finished states.
- */
-class BitSetCoder extends AtomicCoder<BitSet> {
-
- private static final BitSetCoder INSTANCE = new BitSetCoder();
- private transient ByteArrayCoder byteArrayCoder = ByteArrayCoder.of();
-
- private BitSetCoder() {}
-
- public static BitSetCoder of() {
- return INSTANCE;
- }
-
- @Override
- public void encode(BitSet value, OutputStream outStream, Context context)
- throws CoderException, IOException {
- byteArrayCoder.encodeAndOwn(value.toByteArray(), outStream, context);
- }
-
- @Override
- public BitSet decode(InputStream inStream, Context context)
- throws CoderException, IOException {
- return BitSet.valueOf(byteArrayCoder.decode(inStream, context));
- }
-
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- verifyDeterministic(
- "BitSetCoder requires its byteArrayCoder to be deterministic.",
- byteArrayCoder);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BufferedElementCountingOutputStream.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BufferedElementCountingOutputStream.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BufferedElementCountingOutputStream.java
deleted file mode 100644
index e8e693a..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BufferedElementCountingOutputStream.java
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.coders.Coder.Context;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-
-import javax.annotation.concurrent.NotThreadSafe;
-
-/**
- * Provides an efficient encoding for {@link Iterable}s containing small values by
- * buffering up to {@code bufferSize} bytes of data before prefixing the count.
- * Note that each element needs to be encoded in a nested context. See
- * {@link Context Coder.Context} for more details.
- *
- * <p>To use this stream:
- * <pre><code>
- * BufferedElementCountingOutputStream os = ...
- * for (Element E : elements) {
- * os.markElementStart();
- * // write an element to os
- * }
- * os.finish();
- * </code></pre>
- *
- * <p>The resulting output stream is:
- * <pre>
- * countA element(0) element(1) ... element(countA - 1)
- * countB element(0) element(1) ... element(countB - 1)
- * ...
- * countX element(0) element(1) ... element(countX - 1)
- * countY
- * </pre>
- *
- * <p>To read this stream:
- * <pre><code>
- * InputStream is = ...
- * long count;
- * do {
- * count = VarInt.decodeLong(is);
- * for (int i = 0; i < count; ++i) {
- * // read an element from is
- * }
- * } while(count > 0);
- * </code></pre>
- *
- * <p>The counts are encoded as variable length longs. See {@link VarInt#encode(long, OutputStream)}
- * for more details. The end of the iterable is detected by reading a count of 0.
- */
-@NotThreadSafe
-public class BufferedElementCountingOutputStream extends OutputStream {
- public static final int DEFAULT_BUFFER_SIZE = 64 * 1024;
- private final ByteBuffer buffer;
- private final OutputStream os;
- private boolean finished;
- private long count;
-
- /**
- * Creates an output stream which encodes the number of elements output to it in a streaming
- * manner.
- */
- public BufferedElementCountingOutputStream(OutputStream os) {
- this(os, DEFAULT_BUFFER_SIZE);
- }
-
- /**
- * Creates an output stream which encodes the number of elements output to it in a streaming
- * manner with the given {@code bufferSize}.
- */
- BufferedElementCountingOutputStream(OutputStream os, int bufferSize) {
- this.buffer = ByteBuffer.allocate(bufferSize);
- this.os = os;
- this.finished = false;
- this.count = 0;
- }
-
- /**
- * Finishes the encoding by flushing any buffered data,
- * and outputting a final count of 0.
- */
- public void finish() throws IOException {
- if (finished) {
- return;
- }
- flush();
- // Finish the stream by stating that there are 0 elements that follow.
- VarInt.encode(0, os);
- finished = true;
- }
-
- /**
- * Marks that a new element is being output. This allows this output stream
- * to use the buffer if it had previously overflowed marking the start of a new
- * block of elements.
- */
- public void markElementStart() throws IOException {
- if (finished) {
- throw new IOException("Stream has been finished. Can not add any more elements.");
- }
- count++;
- }
-
- @Override
- public void write(int b) throws IOException {
- if (finished) {
- throw new IOException("Stream has been finished. Can not write any more data.");
- }
- if (count == 0) {
- os.write(b);
- return;
- }
-
- if (buffer.hasRemaining()) {
- buffer.put((byte) b);
- } else {
- outputBuffer();
- os.write(b);
- }
- }
-
- @Override
- public void write(byte[] b, int off, int len) throws IOException {
- if (finished) {
- throw new IOException("Stream has been finished. Can not write any more data.");
- }
- if (count == 0) {
- os.write(b, off, len);
- return;
- }
-
- if (buffer.remaining() >= len) {
- buffer.put(b, off, len);
- } else {
- outputBuffer();
- os.write(b, off, len);
- }
- }
-
- @Override
- public void flush() throws IOException {
- if (finished) {
- return;
- }
- outputBuffer();
- os.flush();
- }
-
- @Override
- public void close() throws IOException {
- finish();
- os.close();
- }
-
- // Output the buffer if it contains any data.
- private void outputBuffer() throws IOException {
- if (count > 0) {
- VarInt.encode(count, os);
- // We are using a heap based buffer and not a direct buffer so it is safe to access
- // the underlying array.
- os.write(buffer.array(), buffer.arrayOffset(), buffer.position());
- buffer.clear();
- // The buffer has been flushed so we must write to the underlying stream until
- // we learn of the next element. We reset the count to zero marking that we should
- // not use the buffer.
- count = 0;
- }
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CloudKnownType.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CloudKnownType.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CloudKnownType.java
deleted file mode 100644
index 8b41eb8..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CloudKnownType.java
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
-
-import javax.annotation.Nullable;
-
-/** A utility for manipulating well-known cloud types. */
-enum CloudKnownType {
- TEXT("http://schema.org/Text", String.class) {
- @Override
- public <T> T parse(Object value, Class<T> clazz) {
- return clazz.cast(value);
- }
- },
- BOOLEAN("http://schema.org/Boolean", Boolean.class) {
- @Override
- public <T> T parse(Object value, Class<T> clazz) {
- return clazz.cast(value);
- }
- },
- INTEGER("http://schema.org/Integer", Long.class, Integer.class) {
- @Override
- public <T> T parse(Object value, Class<T> clazz) {
- Object result = null;
- if (value.getClass() == clazz) {
- result = value;
- } else if (clazz == Long.class) {
- if (value instanceof Integer) {
- result = ((Integer) value).longValue();
- } else if (value instanceof String) {
- result = Long.valueOf((String) value);
- }
- } else if (clazz == Integer.class) {
- if (value instanceof Long) {
- result = ((Long) value).intValue();
- } else if (value instanceof String) {
- result = Integer.valueOf((String) value);
- }
- }
- return clazz.cast(result);
- }
- },
- FLOAT("http://schema.org/Float", Double.class, Float.class) {
- @Override
- public <T> T parse(Object value, Class<T> clazz) {
- Object result = null;
- if (value.getClass() == clazz) {
- result = value;
- } else if (clazz == Double.class) {
- if (value instanceof Float) {
- result = ((Float) value).doubleValue();
- } else if (value instanceof String) {
- result = Double.valueOf((String) value);
- }
- } else if (clazz == Float.class) {
- if (value instanceof Double) {
- result = ((Double) value).floatValue();
- } else if (value instanceof String) {
- result = Float.valueOf((String) value);
- }
- }
- return clazz.cast(result);
- }
- };
-
- private final String uri;
- private final Class<?>[] classes;
-
- private CloudKnownType(String uri, Class<?>... classes) {
- this.uri = uri;
- this.classes = classes;
- }
-
- public String getUri() {
- return uri;
- }
-
- public abstract <T> T parse(Object value, Class<T> clazz);
-
- public Class<?> defaultClass() {
- return classes[0];
- }
-
- private static final Map<String, CloudKnownType> typesByUri =
- Collections.unmodifiableMap(buildTypesByUri());
-
- private static Map<String, CloudKnownType> buildTypesByUri() {
- Map<String, CloudKnownType> result = new HashMap<>();
- for (CloudKnownType ty : CloudKnownType.values()) {
- result.put(ty.getUri(), ty);
- }
- return result;
- }
-
- @Nullable
- public static CloudKnownType forUri(@Nullable String uri) {
- if (uri == null) {
- return null;
- }
- return typesByUri.get(uri);
- }
-
- private static final Map<Class<?>, CloudKnownType> typesByClass =
- Collections.unmodifiableMap(buildTypesByClass());
-
- private static Map<Class<?>, CloudKnownType> buildTypesByClass() {
- Map<Class<?>, CloudKnownType> result = new HashMap<>();
- for (CloudKnownType ty : CloudKnownType.values()) {
- for (Class<?> clazz : ty.classes) {
- result.put(clazz, ty);
- }
- }
- return result;
- }
-
- @Nullable
- public static CloudKnownType forClass(Class<?> clazz) {
- return typesByClass.get(clazz);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CloudObject.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CloudObject.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CloudObject.java
deleted file mode 100644
index 8c704bf..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CloudObject.java
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import static com.google.api.client.util.Preconditions.checkNotNull;
-
-import com.google.api.client.json.GenericJson;
-import com.google.api.client.util.Key;
-
-import java.util.Map;
-
-import javax.annotation.Nullable;
-
-/**
- * A representation of an arbitrary Java object to be instantiated by Dataflow
- * workers.
- *
- * <p>Typically, an object to be written by the SDK to the Dataflow service will
- * implement a method (typically called {@code asCloudObject()}) that returns a
- * {@code CloudObject} to represent the object in the protocol. Once the
- * {@code CloudObject} is constructed, the method should explicitly add
- * additional properties to be presented during deserialization, representing
- * child objects by building additional {@code CloudObject}s.
- */
-public final class CloudObject extends GenericJson {
- /**
- * Constructs a {@code CloudObject} by copying the supplied serialized object
- * spec, which must represent an SDK object serialized for transport via the
- * Dataflow API.
- *
- * <p>The most common use of this method is during deserialization on the worker,
- * where it's used as a binding type during instance construction.
- *
- * @param spec supplies the serialized form of the object as a nested map
- * @throws RuntimeException if the supplied map does not represent an SDK object
- */
- public static CloudObject fromSpec(Map<String, Object> spec) {
- CloudObject result = new CloudObject();
- result.putAll(spec);
- if (result.className == null) {
- throw new RuntimeException("Unable to create an SDK object from " + spec
- + ": Object class not specified (missing \""
- + PropertyNames.OBJECT_TYPE_NAME + "\" field)");
- }
- return result;
- }
-
- /**
- * Constructs a {@code CloudObject} to be used for serializing an instance of
- * the supplied class for transport via the Dataflow API. The instance
- * parameters to be serialized must be supplied explicitly after the
- * {@code CloudObject} is created, by using {@link CloudObject#put}.
- *
- * @param cls the class to use when deserializing the object on the worker
- */
- public static CloudObject forClass(Class<?> cls) {
- CloudObject result = new CloudObject();
- result.className = checkNotNull(cls).getName();
- return result;
- }
-
- /**
- * Constructs a {@code CloudObject} to be used for serializing data to be
- * deserialized using the supplied class name the supplied class name for
- * transport via the Dataflow API. The instance parameters to be serialized
- * must be supplied explicitly after the {@code CloudObject} is created, by
- * using {@link CloudObject#put}.
- *
- * @param className the class to use when deserializing the object on the worker
- */
- public static CloudObject forClassName(String className) {
- CloudObject result = new CloudObject();
- result.className = checkNotNull(className);
- return result;
- }
-
- /**
- * Constructs a {@code CloudObject} representing the given value.
- * @param value the scalar value to represent.
- */
- public static CloudObject forString(String value) {
- CloudObject result = forClassName(CloudKnownType.TEXT.getUri());
- result.put(PropertyNames.SCALAR_FIELD_NAME, value);
- return result;
- }
-
- /**
- * Constructs a {@code CloudObject} representing the given value.
- * @param value the scalar value to represent.
- */
- public static CloudObject forBoolean(Boolean value) {
- CloudObject result = forClassName(CloudKnownType.BOOLEAN.getUri());
- result.put(PropertyNames.SCALAR_FIELD_NAME, value);
- return result;
- }
-
- /**
- * Constructs a {@code CloudObject} representing the given value.
- * @param value the scalar value to represent.
- */
- public static CloudObject forInteger(Long value) {
- CloudObject result = forClassName(CloudKnownType.INTEGER.getUri());
- result.put(PropertyNames.SCALAR_FIELD_NAME, value);
- return result;
- }
-
- /**
- * Constructs a {@code CloudObject} representing the given value.
- * @param value the scalar value to represent.
- */
- public static CloudObject forInteger(Integer value) {
- CloudObject result = forClassName(CloudKnownType.INTEGER.getUri());
- result.put(PropertyNames.SCALAR_FIELD_NAME, value);
- return result;
- }
-
- /**
- * Constructs a {@code CloudObject} representing the given value.
- * @param value the scalar value to represent.
- */
- public static CloudObject forFloat(Float value) {
- CloudObject result = forClassName(CloudKnownType.FLOAT.getUri());
- result.put(PropertyNames.SCALAR_FIELD_NAME, value);
- return result;
- }
-
- /**
- * Constructs a {@code CloudObject} representing the given value.
- * @param value the scalar value to represent.
- */
- public static CloudObject forFloat(Double value) {
- CloudObject result = forClassName(CloudKnownType.FLOAT.getUri());
- result.put(PropertyNames.SCALAR_FIELD_NAME, value);
- return result;
- }
-
- /**
- * Constructs a {@code CloudObject} representing the given value of a
- * well-known cloud object type.
- * @param value the scalar value to represent.
- * @throws RuntimeException if the value does not have a
- * {@link CloudKnownType} mapping
- */
- public static CloudObject forKnownType(Object value) {
- @Nullable CloudKnownType ty = CloudKnownType.forClass(value.getClass());
- if (ty == null) {
- throw new RuntimeException("Unable to represent value via the Dataflow API: " + value);
- }
- CloudObject result = forClassName(ty.getUri());
- result.put(PropertyNames.SCALAR_FIELD_NAME, value);
- return result;
- }
-
- @Key(PropertyNames.OBJECT_TYPE_NAME)
- private String className;
-
- private CloudObject() {}
-
- /**
- * Gets the name of the Java class that this CloudObject represents.
- */
- public String getClassName() {
- return className;
- }
-
- @Override
- public CloudObject clone() {
- return (CloudObject) super.clone();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CoderUtils.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CoderUtils.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CoderUtils.java
deleted file mode 100644
index ddab933..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CoderUtils.java
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import static com.google.cloud.dataflow.sdk.util.Structs.addList;
-
-import com.google.api.client.util.Base64;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.IterableCoder;
-import com.google.cloud.dataflow.sdk.coders.KvCoder;
-import com.google.cloud.dataflow.sdk.coders.KvCoderBase;
-import com.google.cloud.dataflow.sdk.coders.MapCoder;
-import com.google.cloud.dataflow.sdk.coders.MapCoderBase;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-import com.google.common.base.Throwables;
-
-import com.fasterxml.jackson.annotation.JsonTypeInfo;
-import com.fasterxml.jackson.annotation.JsonTypeInfo.As;
-import com.fasterxml.jackson.annotation.JsonTypeInfo.Id;
-import com.fasterxml.jackson.databind.DatabindContext;
-import com.fasterxml.jackson.databind.JavaType;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.annotation.JsonTypeIdResolver;
-import com.fasterxml.jackson.databind.jsontype.impl.TypeIdResolverBase;
-import com.fasterxml.jackson.databind.module.SimpleModule;
-import com.fasterxml.jackson.databind.type.TypeFactory;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.lang.ref.SoftReference;
-import java.lang.reflect.ParameterizedType;
-import java.lang.reflect.TypeVariable;
-
-/**
- * Utilities for working with Coders.
- */
-public final class CoderUtils {
- private CoderUtils() {} // Non-instantiable
-
- /**
- * Coder class-name alias for a key-value type.
- */
- public static final String KIND_PAIR = "kind:pair";
-
- /**
- * Coder class-name alias for a stream type.
- */
- public static final String KIND_STREAM = "kind:stream";
-
- private static ThreadLocal<SoftReference<ExposedByteArrayOutputStream>> threadLocalOutputStream
- = new ThreadLocal<>();
-
- /**
- * If true, a call to {@code encodeToByteArray} is already on the call stack.
- */
- private static ThreadLocal<Boolean> threadLocalOutputStreamInUse = new ThreadLocal<Boolean>() {
- @Override
- protected Boolean initialValue() {
- return false;
- }
- };
-
- /**
- * Encodes the given value using the specified Coder, and returns
- * the encoded bytes.
- *
- * <p>This function is not reentrant; it should not be called from methods of the provided
- * {@link Coder}.
- */
- public static <T> byte[] encodeToByteArray(Coder<T> coder, T value) throws CoderException {
- return encodeToByteArray(coder, value, Coder.Context.OUTER);
- }
-
- public static <T> byte[] encodeToByteArray(Coder<T> coder, T value, Coder.Context context)
- throws CoderException {
- if (threadLocalOutputStreamInUse.get()) {
- // encodeToByteArray() is called recursively and the thread local stream is in use,
- // allocating a new one.
- ByteArrayOutputStream stream = new ExposedByteArrayOutputStream();
- encodeToSafeStream(coder, value, stream, context);
- return stream.toByteArray();
- } else {
- threadLocalOutputStreamInUse.set(true);
- try {
- ByteArrayOutputStream stream = getThreadLocalOutputStream();
- encodeToSafeStream(coder, value, stream, context);
- return stream.toByteArray();
- } finally {
- threadLocalOutputStreamInUse.set(false);
- }
- }
- }
-
- /**
- * Encodes {@code value} to the given {@code stream}, which should be a stream that never throws
- * {@code IOException}, such as {@code ByteArrayOutputStream} or
- * {@link ExposedByteArrayOutputStream}.
- */
- private static <T> void encodeToSafeStream(
- Coder<T> coder, T value, OutputStream stream, Coder.Context context) throws CoderException {
- try {
- coder.encode(value, new UnownedOutputStream(stream), context);
- } catch (IOException exn) {
- Throwables.propagateIfPossible(exn, CoderException.class);
- throw new IllegalArgumentException(
- "Forbidden IOException when writing to OutputStream", exn);
- }
- }
-
- /**
- * Decodes the given bytes using the specified Coder, and returns
- * the resulting decoded value.
- */
- public static <T> T decodeFromByteArray(Coder<T> coder, byte[] encodedValue)
- throws CoderException {
- return decodeFromByteArray(coder, encodedValue, Coder.Context.OUTER);
- }
-
- public static <T> T decodeFromByteArray(
- Coder<T> coder, byte[] encodedValue, Coder.Context context) throws CoderException {
- try (ExposedByteArrayInputStream stream = new ExposedByteArrayInputStream(encodedValue)) {
- T result = decodeFromSafeStream(coder, stream, context);
- if (stream.available() != 0) {
- throw new CoderException(
- stream.available() + " unexpected extra bytes after decoding " + result);
- }
- return result;
- }
- }
-
- /**
- * Decodes a value from the given {@code stream}, which should be a stream that never throws
- * {@code IOException}, such as {@code ByteArrayInputStream} or
- * {@link ExposedByteArrayInputStream}.
- */
- private static <T> T decodeFromSafeStream(
- Coder<T> coder, InputStream stream, Coder.Context context) throws CoderException {
- try {
- return coder.decode(new UnownedInputStream(stream), context);
- } catch (IOException exn) {
- Throwables.propagateIfPossible(exn, CoderException.class);
- throw new IllegalArgumentException(
- "Forbidden IOException when reading from InputStream", exn);
- }
- }
-
- private static ByteArrayOutputStream getThreadLocalOutputStream() {
- SoftReference<ExposedByteArrayOutputStream> refStream = threadLocalOutputStream.get();
- ExposedByteArrayOutputStream stream = refStream == null ? null : refStream.get();
- if (stream == null) {
- stream = new ExposedByteArrayOutputStream();
- threadLocalOutputStream.set(new SoftReference<>(stream));
- }
- stream.reset();
- return stream;
- }
-
- /**
- * Clones the given value by encoding and then decoding it with the specified Coder.
- *
- * <p>This function is not reentrant; it should not be called from methods of the provided
- * {@link Coder}.
- */
- public static <T> T clone(Coder<T> coder, T value) throws CoderException {
- return decodeFromByteArray(coder, encodeToByteArray(coder, value, Coder.Context.OUTER));
- }
-
- /**
- * Encodes the given value using the specified Coder, and returns the Base64 encoding of the
- * encoded bytes.
- *
- * @throws CoderException if there are errors during encoding.
- */
- public static <T> String encodeToBase64(Coder<T> coder, T value)
- throws CoderException {
- byte[] rawValue = encodeToByteArray(coder, value);
- return Base64.encodeBase64URLSafeString(rawValue);
- }
-
- /**
- * Parses a value from a base64-encoded String using the given coder.
- */
- public static <T> T decodeFromBase64(Coder<T> coder, String encodedValue) throws CoderException {
- return decodeFromSafeStream(
- coder, new ByteArrayInputStream(Base64.decodeBase64(encodedValue)), Coder.Context.OUTER);
- }
-
- /**
- * If {@code coderType} is a subclass of {@code Coder<T>} for a specific
- * type {@code T}, returns {@code T.class}.
- */
- @SuppressWarnings({"rawtypes", "unchecked"})
- public static TypeDescriptor getCodedType(TypeDescriptor coderDescriptor) {
- ParameterizedType coderType =
- (ParameterizedType) coderDescriptor.getSupertype(Coder.class).getType();
- TypeDescriptor codedType = TypeDescriptor.of(coderType.getActualTypeArguments()[0]);
- return codedType;
- }
-
- public static CloudObject makeCloudEncoding(
- String type,
- CloudObject... componentSpecs) {
- CloudObject encoding = CloudObject.forClassName(type);
- if (componentSpecs.length > 0) {
- addList(encoding, PropertyNames.COMPONENT_ENCODINGS, componentSpecs);
- }
- return encoding;
- }
-
- /**
- * A {@link com.fasterxml.jackson.databind.Module} that adds the type
- * resolver needed for Coder definitions created by the Dataflow service.
- */
- static final class Jackson2Module extends SimpleModule {
- /**
- * The Coder custom type resolver.
- *
- * <p>This resolver resolves coders. If the Coder ID is a particular
- * well-known identifier supplied by the Dataflow service, it's replaced
- * with the corresponding class. All other Coder instances are resolved
- * by class name, using the package com.google.cloud.dataflow.sdk.coders
- * if there are no "."s in the ID.
- */
- private static final class Resolver extends TypeIdResolverBase {
- @SuppressWarnings("unused") // Used via @JsonTypeIdResolver annotation on Mixin
- public Resolver() {
- super(TypeFactory.defaultInstance().constructType(Coder.class),
- TypeFactory.defaultInstance());
- }
-
- @Deprecated
- @Override
- public JavaType typeFromId(String id) {
- return typeFromId(null, id);
- }
-
- @Override
- public JavaType typeFromId(DatabindContext context, String id) {
- Class<?> clazz = getClassForId(id);
- if (clazz == KvCoder.class) {
- clazz = KvCoderBase.class;
- }
- if (clazz == MapCoder.class) {
- clazz = MapCoderBase.class;
- }
- @SuppressWarnings("rawtypes")
- TypeVariable[] tvs = clazz.getTypeParameters();
- JavaType[] types = new JavaType[tvs.length];
- for (int lupe = 0; lupe < tvs.length; lupe++) {
- types[lupe] = TypeFactory.unknownType();
- }
- return _typeFactory.constructSimpleType(clazz, types);
- }
-
- private Class<?> getClassForId(String id) {
- try {
- if (id.contains(".")) {
- return Class.forName(id);
- }
-
- if (id.equals(KIND_STREAM)) {
- return IterableCoder.class;
- } else if (id.equals(KIND_PAIR)) {
- return KvCoder.class;
- }
-
- // Otherwise, see if the ID is the name of a class in
- // com.google.cloud.dataflow.sdk.coders. We do this via creating
- // the class object so that class loaders have a chance to get
- // involved -- and since we need the class object anyway.
- return Class.forName(Coder.class.getPackage().getName() + "." + id);
- } catch (ClassNotFoundException e) {
- throw new RuntimeException("Unable to convert coder ID " + id + " to class", e);
- }
- }
-
- @Override
- public String idFromValueAndType(Object o, Class<?> clazz) {
- return clazz.getName();
- }
-
- @Override
- public String idFromValue(Object o) {
- return o.getClass().getName();
- }
-
- @Override
- public JsonTypeInfo.Id getMechanism() {
- return JsonTypeInfo.Id.CUSTOM;
- }
- }
-
- /**
- * The mixin class defining how Coders are handled by the deserialization
- * {@link ObjectMapper}.
- *
- * <p>This is done via a mixin so that this resolver is <i>only</i> used
- * during deserialization requested by the Dataflow SDK.
- */
- @JsonTypeIdResolver(Resolver.class)
- @JsonTypeInfo(use = Id.CUSTOM, include = As.PROPERTY, property = PropertyNames.OBJECT_TYPE_NAME)
- private static final class Mixin {}
-
- public Jackson2Module() {
- super("DataflowCoders");
- setMixInAnnotation(Coder.class, Mixin.class);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CombineContextFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CombineContextFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CombineContextFactory.java
deleted file mode 100644
index 6f2b89b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CombineContextFactory.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.Context;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.state.StateContext;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-
-/**
- * Factory that produces {@code Combine.Context} based on different inputs.
- */
-public class CombineContextFactory {
-
- private static final Context NULL_CONTEXT = new Context() {
- @Override
- public PipelineOptions getPipelineOptions() {
- throw new IllegalArgumentException("cannot call getPipelineOptions() in a null context");
- }
-
- @Override
- public <T> T sideInput(PCollectionView<T> view) {
- throw new IllegalArgumentException("cannot call sideInput() in a null context");
- }
- };
-
- /**
- * Returns a fake {@code Combine.Context} for tests.
- */
- public static Context nullContext() {
- return NULL_CONTEXT;
- }
-
- /**
- * Returns a {@code Combine.Context} that wraps a {@code DoFn.ProcessContext}.
- */
- public static Context createFromProcessContext(final DoFn<?, ?>.ProcessContext c) {
- return new Context() {
- @Override
- public PipelineOptions getPipelineOptions() {
- return c.getPipelineOptions();
- }
-
- @Override
- public <T> T sideInput(PCollectionView<T> view) {
- return c.sideInput(view);
- }
- };
- }
-
- /**
- * Returns a {@code Combine.Context} that wraps a {@link StateContext}.
- */
- public static Context createFromStateContext(final StateContext<?> c) {
- return new Context() {
- @Override
- public PipelineOptions getPipelineOptions() {
- return c.getPipelineOptions();
- }
-
- @Override
- public <T> T sideInput(PCollectionView<T> view) {
- return c.sideInput(view);
- }
- };
- }
-
- /**
- * Returns a {@code Combine.Context} from {@code PipelineOptions}, {@code SideInputReader},
- * and the main input window.
- */
- public static Context createFromComponents(final PipelineOptions options,
- final SideInputReader sideInputReader, final BoundedWindow mainInputWindow) {
- return new Context() {
- @Override
- public PipelineOptions getPipelineOptions() {
- return options;
- }
-
- @Override
- public <T> T sideInput(PCollectionView<T> view) {
- if (!sideInputReader.contains(view)) {
- throw new IllegalArgumentException("calling sideInput() with unknown view");
- }
-
- BoundedWindow sideInputWindow =
- view.getWindowingStrategyInternal().getWindowFn().getSideInputWindow(mainInputWindow);
- return sideInputReader.get(view, sideInputWindow);
- }
- };
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CombineFnUtil.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CombineFnUtil.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CombineFnUtil.java
deleted file mode 100644
index d974480..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/CombineFnUtil.java
+++ /dev/null
@@ -1,154 +0,0 @@
-
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.transforms.Combine.KeyedCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineFnBase.GlobalCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.CombineFnWithContext;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.Context;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.KeyedCombineFnWithContext;
-import com.google.cloud.dataflow.sdk.util.state.StateContext;
-
-import java.io.IOException;
-import java.io.NotSerializableException;
-import java.io.ObjectOutputStream;
-
-/**
- * Static utility methods that create combine function instances.
- */
-public class CombineFnUtil {
- /**
- * Returns the partial application of the {@link KeyedCombineFnWithContext} to a specific
- * context to produce a {@link KeyedCombineFn}.
- *
- * <p>The returned {@link KeyedCombineFn} cannot be serialized.
- */
- public static <K, InputT, AccumT, OutputT> KeyedCombineFn<K, InputT, AccumT, OutputT>
- bindContext(
- KeyedCombineFnWithContext<K, InputT, AccumT, OutputT> combineFn,
- StateContext<?> stateContext) {
- Context context = CombineContextFactory.createFromStateContext(stateContext);
- return new NonSerializableBoundedKeyedCombineFn<>(combineFn, context);
- }
-
- /**
- * Return a {@link CombineFnWithContext} from the given {@link GlobalCombineFn}.
- */
- public static <InputT, AccumT, OutputT>
- CombineFnWithContext<InputT, AccumT, OutputT> toFnWithContext(
- GlobalCombineFn<InputT, AccumT, OutputT> globalCombineFn) {
- if (globalCombineFn instanceof CombineFnWithContext) {
- @SuppressWarnings("unchecked")
- CombineFnWithContext<InputT, AccumT, OutputT> combineFnWithContext =
- (CombineFnWithContext<InputT, AccumT, OutputT>) globalCombineFn;
- return combineFnWithContext;
- } else {
- @SuppressWarnings("unchecked")
- final CombineFn<InputT, AccumT, OutputT> combineFn =
- (CombineFn<InputT, AccumT, OutputT>) globalCombineFn;
- return new CombineFnWithContext<InputT, AccumT, OutputT>() {
- @Override
- public AccumT createAccumulator(Context c) {
- return combineFn.createAccumulator();
- }
- @Override
- public AccumT addInput(AccumT accumulator, InputT input, Context c) {
- return combineFn.addInput(accumulator, input);
- }
- @Override
- public AccumT mergeAccumulators(Iterable<AccumT> accumulators, Context c) {
- return combineFn.mergeAccumulators(accumulators);
- }
- @Override
- public OutputT extractOutput(AccumT accumulator, Context c) {
- return combineFn.extractOutput(accumulator);
- }
- @Override
- public AccumT compact(AccumT accumulator, Context c) {
- return combineFn.compact(accumulator);
- }
- @Override
- public OutputT defaultValue() {
- return combineFn.defaultValue();
- }
- @Override
- public Coder<AccumT> getAccumulatorCoder(CoderRegistry registry, Coder<InputT> inputCoder)
- throws CannotProvideCoderException {
- return combineFn.getAccumulatorCoder(registry, inputCoder);
- }
- @Override
- public Coder<OutputT> getDefaultOutputCoder(
- CoderRegistry registry, Coder<InputT> inputCoder) throws CannotProvideCoderException {
- return combineFn.getDefaultOutputCoder(registry, inputCoder);
- }
- };
- }
- }
-
- private static class NonSerializableBoundedKeyedCombineFn<K, InputT, AccumT, OutputT>
- extends KeyedCombineFn<K, InputT, AccumT, OutputT> {
- private final KeyedCombineFnWithContext<K, InputT, AccumT, OutputT> combineFn;
- private final Context context;
-
- private NonSerializableBoundedKeyedCombineFn(
- KeyedCombineFnWithContext<K, InputT, AccumT, OutputT> combineFn,
- Context context) {
- this.combineFn = combineFn;
- this.context = context;
- }
- @Override
- public AccumT createAccumulator(K key) {
- return combineFn.createAccumulator(key, context);
- }
- @Override
- public AccumT addInput(K key, AccumT accumulator, InputT value) {
- return combineFn.addInput(key, accumulator, value, context);
- }
- @Override
- public AccumT mergeAccumulators(K key, Iterable<AccumT> accumulators) {
- return combineFn.mergeAccumulators(key, accumulators, context);
- }
- @Override
- public OutputT extractOutput(K key, AccumT accumulator) {
- return combineFn.extractOutput(key, accumulator, context);
- }
- @Override
- public AccumT compact(K key, AccumT accumulator) {
- return combineFn.compact(key, accumulator, context);
- }
- @Override
- public Coder<AccumT> getAccumulatorCoder(CoderRegistry registry, Coder<K> keyCoder,
- Coder<InputT> inputCoder) throws CannotProvideCoderException {
- return combineFn.getAccumulatorCoder(registry, keyCoder, inputCoder);
- }
- @Override
- public Coder<OutputT> getDefaultOutputCoder(CoderRegistry registry, Coder<K> keyCoder,
- Coder<InputT> inputCoder) throws CannotProvideCoderException {
- return combineFn.getDefaultOutputCoder(registry, keyCoder, inputCoder);
- }
-
- private void writeObject(@SuppressWarnings("unused") ObjectOutputStream out)
- throws IOException {
- throw new NotSerializableException(
- "Cannot serialize the CombineFn resulting from CombineFnUtil.bindContext.");
- }
- }
-}
[53/67] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/TriggerExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/TriggerExample.java b/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/TriggerExample.java
deleted file mode 100644
index ce5e08e..0000000
--- a/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/TriggerExample.java
+++ /dev/null
@@ -1,564 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.cookbook;
-
-import com.google.api.services.bigquery.model.TableFieldSchema;
-import com.google.api.services.bigquery.model.TableReference;
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.cloud.dataflow.examples.common.DataflowExampleOptions;
-import com.google.cloud.dataflow.examples.common.DataflowExampleUtils;
-import com.google.cloud.dataflow.examples.common.ExampleBigQueryTableOptions;
-import com.google.cloud.dataflow.examples.common.ExamplePubsubTopicOptions;
-import com.google.cloud.dataflow.examples.common.PubsubFileInjector;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.PipelineResult;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.io.PubsubIO;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.Default;
-import com.google.cloud.dataflow.sdk.options.Description;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.DoFn.RequiresWindowAccess;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
-import com.google.cloud.dataflow.sdk.transforms.IntraBundleParallelization;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.windowing.AfterEach;
-import com.google.cloud.dataflow.sdk.transforms.windowing.AfterProcessingTime;
-import com.google.cloud.dataflow.sdk.transforms.windowing.AfterWatermark;
-import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Repeatedly;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionList;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.concurrent.TimeUnit;
-
-/**
- * This example illustrates the basic concepts behind triggering. It shows how to use different
- * trigger definitions to produce partial (speculative) results before all the data is processed and
- * to control when updated results are produced for late data. The example performs a streaming
- * analysis of the data coming in from PubSub and writes the results to BigQuery. It divides the
- * data into {@link Window windows} to be processed, and demonstrates using various kinds of {@link
- * Trigger triggers} to control when the results for each window are emitted.
- *
- * <p> This example uses a portion of real traffic data from San Diego freeways. It contains
- * readings from sensor stations set up along each freeway. Each sensor reading includes a
- * calculation of the 'total flow' across all lanes in that freeway direction.
- *
- * <p> Concepts:
- * <pre>
- * 1. The default triggering behavior
- * 2. Late data with the default trigger
- * 3. How to get speculative estimates
- * 4. Combining late data and speculative estimates
- * </pre>
- *
- * <p> Before running this example, it will be useful to familiarize yourself with Dataflow triggers
- * and understand the concept of 'late data'.
- * See: <a href="https://cloud.google.com/dataflow/model/triggers">
- * https://cloud.google.com/dataflow/model/triggers </a> and
- * <a href="https://cloud.google.com/dataflow/model/windowing#Advanced">
- * https://cloud.google.com/dataflow/model/windowing#Advanced </a>
- *
- * <p> The example pipeline reads data from a Pub/Sub topic. By default, running the example will
- * also run an auxiliary pipeline to inject data from the default {@code --input} file to the
- * {@code --pubsubTopic}. The auxiliary pipeline puts a timestamp on the injected data so that the
- * example pipeline can operate on <i>event time</i> (rather than arrival time). The auxiliary
- * pipeline also randomly simulates late data, by setting the timestamps of some of the data
- * elements to be in the past. You may override the default {@code --input} with the file of your
- * choosing or set {@code --input=""} which will disable the automatic Pub/Sub injection, and allow
- * you to use a separate tool to publish to the given topic.
- *
- * <p> The example is configured to use the default Pub/Sub topic and the default BigQuery table
- * from the example common package (there are no defaults for a general Dataflow pipeline).
- * You can override them by using the {@code --pubsubTopic}, {@code --bigQueryDataset}, and
- * {@code --bigQueryTable} options. If the Pub/Sub topic or the BigQuery table do not exist,
- * the example will try to create them.
- *
- * <p> The pipeline outputs its results to a BigQuery table.
- * Here are some queries you can use to see interesting results:
- * Replace {@code <enter_table_name>} in the query below with the name of the BigQuery table.
- * Replace {@code <enter_window_interval>} in the query below with the window interval.
- *
- * <p> To see the results of the default trigger,
- * Note: When you start up your pipeline, you'll initially see results from 'late' data. Wait after
- * the window duration, until the first pane of non-late data has been emitted, to see more
- * interesting results.
- * {@code SELECT * FROM <enter_table_name> WHERE trigger_type = "default" ORDER BY window DESC}
- *
- * <p> To see the late data, i.e. the data dropped by the default trigger,
- * {@code SELECT * FROM <enter_table_name> WHERE trigger_type = "withAllowedLateness" and
- * (timing = "LATE" or timing = "ON_TIME") and freeway = "5" ORDER BY window DESC, processing_time}
- *
- * <p>To see the difference between accumulation mode and discarding mode,
- * {@code SELECT * FROM <enter_table_name> WHERE (timing = "LATE" or timing = "ON_TIME") AND
- * (trigger_type = "withAllowedLateness" or trigger_type = "sequential") and freeway = "5" ORDER BY
- * window DESC, processing_time}
- *
- * <p> To see speculative results every minute,
- * {@code SELECT * FROM <enter_table_name> WHERE trigger_type = "speculative" and freeway = "5"
- * ORDER BY window DESC, processing_time}
- *
- * <p> To see speculative results every five minutes after the end of the window
- * {@code SELECT * FROM <enter_table_name> WHERE trigger_type = "sequential" and timing != "EARLY"
- * and freeway = "5" ORDER BY window DESC, processing_time}
- *
- * <p> To see the first and the last pane for a freeway in a window for all the trigger types,
- * {@code SELECT * FROM <enter_table_name> WHERE (isFirst = true or isLast = true) ORDER BY window}
- *
- * <p> To reduce the number of results for each query we can add additional where clauses.
- * For example, to see the results of the default trigger,
- * {@code SELECT * FROM <enter_table_name> WHERE trigger_type = "default" AND freeway = "5" AND
- * window = "<enter_window_interval>"}
- *
- * <p> The example will try to cancel the pipelines on the signal to terminate the process (CTRL-C)
- * and then exit.
- */
-
-public class TriggerExample {
- //Numeric value of fixed window duration, in minutes
- public static final int WINDOW_DURATION = 30;
- // Constants used in triggers.
- // Speeding up ONE_MINUTE or FIVE_MINUTES helps you get an early approximation of results.
- // ONE_MINUTE is used only with processing time before the end of the window
- public static final Duration ONE_MINUTE = Duration.standardMinutes(1);
- // FIVE_MINUTES is used only with processing time after the end of the window
- public static final Duration FIVE_MINUTES = Duration.standardMinutes(5);
- // ONE_DAY is used to specify the amount of lateness allowed for the data elements.
- public static final Duration ONE_DAY = Duration.standardDays(1);
-
- /**
- * This transform demonstrates using triggers to control when data is produced for each window.
- * Consider an example to understand the results generated by each type of trigger.
- * The example uses "freeway" as the key. Event time is the timestamp associated with the data
- * element and processing time is the time when the data element gets processed in the pipeline.
- * For freeway 5, suppose there are 10 elements in the [10:00:00, 10:30:00) window.
- * Key (freeway) | Value (total_flow) | event time | processing time
- * 5 | 50 | 10:00:03 | 10:00:47
- * 5 | 30 | 10:01:00 | 10:01:03
- * 5 | 30 | 10:02:00 | 11:07:00
- * 5 | 20 | 10:04:10 | 10:05:15
- * 5 | 60 | 10:05:00 | 11:03:00
- * 5 | 20 | 10:05:01 | 11:07:30
- * 5 | 60 | 10:15:00 | 10:27:15
- * 5 | 40 | 10:26:40 | 10:26:43
- * 5 | 60 | 10:27:20 | 10:27:25
- * 5 | 60 | 10:29:00 | 11:11:00
- *
- * <p> Dataflow tracks a watermark which records up to what point in event time the data is
- * complete. For the purposes of the example, we'll assume the watermark is approximately 15m
- * behind the current processing time. In practice, the actual value would vary over time based
- * on the system's knowledge of the current PubSub delay and contents of the backlog (data
- * that has not yet been processed).
- *
- * <p> If the watermark is 15m behind, then the window [10:00:00, 10:30:00) (in event time) would
- * close at 10:44:59, when the watermark passes 10:30:00.
- */
- static class CalculateTotalFlow
- extends PTransform <PCollection<KV<String, Integer>>, PCollectionList<TableRow>> {
- private int windowDuration;
-
- CalculateTotalFlow(int windowDuration) {
- this.windowDuration = windowDuration;
- }
-
- @Override
- public PCollectionList<TableRow> apply(PCollection<KV<String, Integer>> flowInfo) {
-
- // Concept #1: The default triggering behavior
- // By default Dataflow uses a trigger which fires when the watermark has passed the end of the
- // window. This would be written {@code Repeatedly.forever(AfterWatermark.pastEndOfWindow())}.
-
- // The system also defaults to dropping late data -- data which arrives after the watermark
- // has passed the event timestamp of the arriving element. This means that the default trigger
- // will only fire once.
-
- // Each pane produced by the default trigger with no allowed lateness will be the first and
- // last pane in the window, and will be ON_TIME.
-
- // The results for the example above with the default trigger and zero allowed lateness
- // would be:
- // Key (freeway) | Value (total_flow) | number_of_records | isFirst | isLast | timing
- // 5 | 260 | 6 | true | true | ON_TIME
-
- // At 11:03:00 (processing time) the system watermark may have advanced to 10:54:00. As a
- // result, when the data record with event time 10:05:00 arrives at 11:03:00, it is considered
- // late, and dropped.
-
- PCollection<TableRow> defaultTriggerResults = flowInfo
- .apply("Default", Window
- // The default window duration values work well if you're running the default input
- // file. You may want to adjust the window duration otherwise.
- .<KV<String, Integer>>into(FixedWindows.of(Duration.standardMinutes(windowDuration)))
- // The default trigger first emits output when the system's watermark passes the end
- // of the window.
- .triggering(Repeatedly.forever(AfterWatermark.pastEndOfWindow()))
- // Late data is dropped
- .withAllowedLateness(Duration.ZERO)
- // Discard elements after emitting each pane.
- // With no allowed lateness and the specified trigger there will only be a single
- // pane, so this doesn't have a noticeable effect. See concept 2 for more details.
- .discardingFiredPanes())
- .apply(new TotalFlow("default"));
-
- // Concept #2: Late data with the default trigger
- // This uses the same trigger as concept #1, but allows data that is up to ONE_DAY late. This
- // leads to each window staying open for ONE_DAY after the watermark has passed the end of the
- // window. Any late data will result in an additional pane being fired for that same window.
-
- // The first pane produced will be ON_TIME and the remaining panes will be LATE.
- // To definitely get the last pane when the window closes, use
- // .withAllowedLateness(ONE_DAY, ClosingBehavior.FIRE_ALWAYS).
-
- // The results for the example above with the default trigger and ONE_DAY allowed lateness
- // would be:
- // Key (freeway) | Value (total_flow) | number_of_records | isFirst | isLast | timing
- // 5 | 260 | 6 | true | false | ON_TIME
- // 5 | 60 | 1 | false | false | LATE
- // 5 | 30 | 1 | false | false | LATE
- // 5 | 20 | 1 | false | false | LATE
- // 5 | 60 | 1 | false | false | LATE
- PCollection<TableRow> withAllowedLatenessResults = flowInfo
- .apply("WithLateData", Window
- .<KV<String, Integer>>into(FixedWindows.of(Duration.standardMinutes(windowDuration)))
- // Late data is emitted as it arrives
- .triggering(Repeatedly.forever(AfterWatermark.pastEndOfWindow()))
- // Once the output is produced, the pane is dropped and we start preparing the next
- // pane for the window
- .discardingFiredPanes()
- // Late data is handled up to one day
- .withAllowedLateness(ONE_DAY))
- .apply(new TotalFlow("withAllowedLateness"));
-
- // Concept #3: How to get speculative estimates
- // We can specify a trigger that fires independent of the watermark, for instance after
- // ONE_MINUTE of processing time. This allows us to produce speculative estimates before
- // all the data is available. Since we don't have any triggers that depend on the watermark
- // we don't get an ON_TIME firing. Instead, all panes are either EARLY or LATE.
-
- // We also use accumulatingFiredPanes to build up the results across each pane firing.
-
- // The results for the example above for this trigger would be:
- // Key (freeway) | Value (total_flow) | number_of_records | isFirst | isLast | timing
- // 5 | 80 | 2 | true | false | EARLY
- // 5 | 100 | 3 | false | false | EARLY
- // 5 | 260 | 6 | false | false | EARLY
- // 5 | 320 | 7 | false | false | LATE
- // 5 | 370 | 9 | false | false | LATE
- // 5 | 430 | 10 | false | false | LATE
- PCollection<TableRow> speculativeResults = flowInfo
- .apply("Speculative" , Window
- .<KV<String, Integer>>into(FixedWindows.of(Duration.standardMinutes(windowDuration)))
- // Trigger fires every minute.
- .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()
- // Speculative every ONE_MINUTE
- .plusDelayOf(ONE_MINUTE)))
- // After emitting each pane, it will continue accumulating the elements so that each
- // approximation includes all of the previous data in addition to the newly arrived
- // data.
- .accumulatingFiredPanes()
- .withAllowedLateness(ONE_DAY))
- .apply(new TotalFlow("speculative"));
-
- // Concept #4: Combining late data and speculative estimates
- // We can put the previous concepts together to get EARLY estimates, an ON_TIME result,
- // and LATE updates based on late data.
-
- // Each time a triggering condition is satisfied it advances to the next trigger.
- // If there are new elements this trigger emits a window under following condition:
- // > Early approximations every minute till the end of the window.
- // > An on-time firing when the watermark has passed the end of the window
- // > Every five minutes of late data.
-
- // Every pane produced will either be EARLY, ON_TIME or LATE.
-
- // The results for the example above for this trigger would be:
- // Key (freeway) | Value (total_flow) | number_of_records | isFirst | isLast | timing
- // 5 | 80 | 2 | true | false | EARLY
- // 5 | 100 | 3 | false | false | EARLY
- // 5 | 260 | 6 | false | false | EARLY
- // [First pane fired after the end of the window]
- // 5 | 320 | 7 | false | false | ON_TIME
- // 5 | 430 | 10 | false | false | LATE
-
- // For more possibilities of how to build advanced triggers, see {@link Trigger}.
- PCollection<TableRow> sequentialResults = flowInfo
- .apply("Sequential", Window
- .<KV<String, Integer>>into(FixedWindows.of(Duration.standardMinutes(windowDuration)))
- .triggering(AfterEach.inOrder(
- Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()
- // Speculative every ONE_MINUTE
- .plusDelayOf(ONE_MINUTE)).orFinally(AfterWatermark.pastEndOfWindow()),
- Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()
- // Late data every FIVE_MINUTES
- .plusDelayOf(FIVE_MINUTES))))
- .accumulatingFiredPanes()
- // For up to ONE_DAY
- .withAllowedLateness(ONE_DAY))
- .apply(new TotalFlow("sequential"));
-
- // Adds the results generated by each trigger type to a PCollectionList.
- PCollectionList<TableRow> resultsList = PCollectionList.of(defaultTriggerResults)
- .and(withAllowedLatenessResults)
- .and(speculativeResults)
- .and(sequentialResults);
-
- return resultsList;
- }
- }
-
- //////////////////////////////////////////////////////////////////////////////////////////////////
- // The remaining parts of the pipeline are needed to produce the output for each
- // concept above. Not directly relevant to understanding the trigger examples.
-
- /**
- * Calculate total flow and number of records for each freeway and format the results to TableRow
- * objects, to save to BigQuery.
- */
- static class TotalFlow extends
- PTransform <PCollection<KV<String, Integer>>, PCollection<TableRow>> {
- private String triggerType;
-
- public TotalFlow(String triggerType) {
- this.triggerType = triggerType;
- }
-
- @Override
- public PCollection<TableRow> apply(PCollection<KV<String, Integer>> flowInfo) {
- PCollection<KV<String, Iterable<Integer>>> flowPerFreeway = flowInfo
- .apply(GroupByKey.<String, Integer>create());
-
- PCollection<KV<String, String>> results = flowPerFreeway.apply(ParDo.of(
- new DoFn <KV<String, Iterable<Integer>>, KV<String, String>>() {
-
- @Override
- public void processElement(ProcessContext c) throws Exception {
- Iterable<Integer> flows = c.element().getValue();
- Integer sum = 0;
- Long numberOfRecords = 0L;
- for (Integer value : flows) {
- sum += value;
- numberOfRecords++;
- }
- c.output(KV.of(c.element().getKey(), sum + "," + numberOfRecords));
- }
- }));
- PCollection<TableRow> output = results.apply(ParDo.of(new FormatTotalFlow(triggerType)));
- return output;
- }
- }
-
- /**
- * Format the results of the Total flow calculation to a TableRow, to save to BigQuery.
- * Adds the triggerType, pane information, processing time and the window timestamp.
- * */
- static class FormatTotalFlow extends DoFn<KV<String, String>, TableRow>
- implements RequiresWindowAccess {
- private String triggerType;
-
- public FormatTotalFlow(String triggerType) {
- this.triggerType = triggerType;
- }
- @Override
- public void processElement(ProcessContext c) throws Exception {
- String[] values = c.element().getValue().split(",");
- TableRow row = new TableRow()
- .set("trigger_type", triggerType)
- .set("freeway", c.element().getKey())
- .set("total_flow", Integer.parseInt(values[0]))
- .set("number_of_records", Long.parseLong(values[1]))
- .set("window", c.window().toString())
- .set("isFirst", c.pane().isFirst())
- .set("isLast", c.pane().isLast())
- .set("timing", c.pane().getTiming().toString())
- .set("event_time", c.timestamp().toString())
- .set("processing_time", Instant.now().toString());
- c.output(row);
- }
- }
-
- /**
- * Extract the freeway and total flow in a reading.
- * Freeway is used as key since we are calculating the total flow for each freeway.
- */
- static class ExtractFlowInfo extends DoFn<String, KV<String, Integer>> {
- @Override
- public void processElement(ProcessContext c) throws Exception {
- String[] laneInfo = c.element().split(",");
- if (laneInfo[0].equals("timestamp")) {
- // Header row
- return;
- }
- if (laneInfo.length < 48) {
- //Skip the invalid input.
- return;
- }
- String freeway = laneInfo[2];
- Integer totalFlow = tryIntegerParse(laneInfo[7]);
- // Ignore the records with total flow 0 to easily understand the working of triggers.
- // Skip the records with total flow -1 since they are invalid input.
- if (totalFlow == null || totalFlow <= 0) {
- return;
- }
- c.output(KV.of(freeway, totalFlow));
- }
- }
-
- /**
- * Inherits standard configuration options.
- */
- public interface TrafficFlowOptions
- extends ExamplePubsubTopicOptions, ExampleBigQueryTableOptions, DataflowExampleOptions {
-
- @Description("Input file to inject to Pub/Sub topic")
- @Default.String("gs://dataflow-samples/traffic_sensor/"
- + "Freeways-5Minaa2010-01-01_to_2010-02-15.csv")
- String getInput();
- void setInput(String value);
-
- @Description("Numeric value of window duration for fixed windows, in minutes")
- @Default.Integer(WINDOW_DURATION)
- Integer getWindowDuration();
- void setWindowDuration(Integer value);
- }
-
- private static final String PUBSUB_TIMESTAMP_LABEL_KEY = "timestamp_ms";
-
- public static void main(String[] args) throws Exception {
- TrafficFlowOptions options = PipelineOptionsFactory.fromArgs(args)
- .withValidation()
- .as(TrafficFlowOptions.class);
- options.setStreaming(true);
-
- // In order to cancel the pipelines automatically,
- // {@code DataflowPipelineRunner} is forced to be used.
- options.setRunner(DataflowPipelineRunner.class);
- options.setBigQuerySchema(getSchema());
-
- DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options);
- dataflowUtils.setup();
-
- Pipeline pipeline = Pipeline.create(options);
-
- TableReference tableRef = getTableReference(options.getProject(),
- options.getBigQueryDataset(), options.getBigQueryTable());
-
- PCollectionList<TableRow> resultList = pipeline.apply(PubsubIO.Read.named("ReadPubsubInput")
- .timestampLabel(PUBSUB_TIMESTAMP_LABEL_KEY)
- .topic(options.getPubsubTopic()))
- .apply(ParDo.of(new ExtractFlowInfo()))
- .apply(new CalculateTotalFlow(options.getWindowDuration()));
-
- for (int i = 0; i < resultList.size(); i++){
- resultList.get(i).apply(BigQueryIO.Write.to(tableRef).withSchema(getSchema()));
- }
-
- PipelineResult result = pipeline.run();
- if (!options.getInput().isEmpty()){
- //Inject the data into the pubsub topic
- dataflowUtils.runInjectorPipeline(runInjector(options));
- }
- // dataflowUtils will try to cancel the pipeline and the injector before the program exits.
- dataflowUtils.waitToFinish(result);
- }
-
- private static Pipeline runInjector(TrafficFlowOptions options){
- DataflowPipelineOptions copiedOptions = options.cloneAs(DataflowPipelineOptions.class);
- copiedOptions.setStreaming(false);
- copiedOptions.setNumWorkers(options.as(DataflowExampleOptions.class).getInjectorNumWorkers());
- copiedOptions.setJobName(options.getJobName() + "-injector");
- Pipeline injectorPipeline = Pipeline.create(copiedOptions);
- injectorPipeline
- .apply(TextIO.Read.named("ReadMyFile").from(options.getInput()))
- .apply(ParDo.named("InsertRandomDelays").of(new InsertDelays()))
- .apply(IntraBundleParallelization.of(PubsubFileInjector
- .withTimestampLabelKey(PUBSUB_TIMESTAMP_LABEL_KEY)
- .publish(options.getPubsubTopic()))
- .withMaxParallelism(20));
-
- return injectorPipeline;
- }
-
- /**
- * Add current time to each record.
- * Also insert a delay at random to demo the triggers.
- */
- public static class InsertDelays extends DoFn<String, String> {
- private static final double THRESHOLD = 0.001;
- // MIN_DELAY and MAX_DELAY in minutes.
- private static final int MIN_DELAY = 1;
- private static final int MAX_DELAY = 100;
-
- @Override
- public void processElement(ProcessContext c) throws Exception {
- Instant timestamp = Instant.now();
- if (Math.random() < THRESHOLD){
- int range = MAX_DELAY - MIN_DELAY;
- int delayInMinutes = (int) (Math.random() * range) + MIN_DELAY;
- long delayInMillis = TimeUnit.MINUTES.toMillis(delayInMinutes);
- timestamp = new Instant(timestamp.getMillis() - delayInMillis);
- }
- c.outputWithTimestamp(c.element(), timestamp);
- }
- }
-
-
- /**Sets the table reference. **/
- private static TableReference getTableReference(String project, String dataset, String table){
- TableReference tableRef = new TableReference();
- tableRef.setProjectId(project);
- tableRef.setDatasetId(dataset);
- tableRef.setTableId(table);
- return tableRef;
- }
-
- /** Defines the BigQuery schema used for the output. */
- private static TableSchema getSchema() {
- List<TableFieldSchema> fields = new ArrayList<>();
- fields.add(new TableFieldSchema().setName("trigger_type").setType("STRING"));
- fields.add(new TableFieldSchema().setName("freeway").setType("STRING"));
- fields.add(new TableFieldSchema().setName("total_flow").setType("INTEGER"));
- fields.add(new TableFieldSchema().setName("number_of_records").setType("INTEGER"));
- fields.add(new TableFieldSchema().setName("window").setType("STRING"));
- fields.add(new TableFieldSchema().setName("isFirst").setType("BOOLEAN"));
- fields.add(new TableFieldSchema().setName("isLast").setType("BOOLEAN"));
- fields.add(new TableFieldSchema().setName("timing").setType("STRING"));
- fields.add(new TableFieldSchema().setName("event_time").setType("TIMESTAMP"));
- fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"));
- TableSchema schema = new TableSchema().setFields(fields);
- return schema;
- }
-
- private static Integer tryIntegerParse(String number) {
- try {
- return Integer.parseInt(number);
- } catch (NumberFormatException e) {
- return null;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/test/java/com/google/cloud/dataflow/examples/DebuggingWordCountTest.java
----------------------------------------------------------------------
diff --git a/examples/src/test/java/com/google/cloud/dataflow/examples/DebuggingWordCountTest.java b/examples/src/test/java/com/google/cloud/dataflow/examples/DebuggingWordCountTest.java
deleted file mode 100644
index 77d7bc8..0000000
--- a/examples/src/test/java/com/google/cloud/dataflow/examples/DebuggingWordCountTest.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples;
-
-import com.google.common.io.Files;
-
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TemporaryFolder;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.io.File;
-import java.nio.charset.StandardCharsets;
-
-/**
- * Tests for {@link DebuggingWordCount}.
- */
-@RunWith(JUnit4.class)
-public class DebuggingWordCountTest {
- @Rule public TemporaryFolder tmpFolder = new TemporaryFolder();
-
- @Test
- public void testDebuggingWordCount() throws Exception {
- File file = tmpFolder.newFile();
- Files.write("stomach secret Flourish message Flourish here Flourish", file,
- StandardCharsets.UTF_8);
- DebuggingWordCount.main(new String[]{"--inputFile=" + file.getAbsolutePath()});
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/test/java/com/google/cloud/dataflow/examples/WordCountTest.java
----------------------------------------------------------------------
diff --git a/examples/src/test/java/com/google/cloud/dataflow/examples/WordCountTest.java b/examples/src/test/java/com/google/cloud/dataflow/examples/WordCountTest.java
deleted file mode 100644
index 4542c48..0000000
--- a/examples/src/test/java/com/google/cloud/dataflow/examples/WordCountTest.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples;
-
-import com.google.cloud.dataflow.examples.WordCount.CountWords;
-import com.google.cloud.dataflow.examples.WordCount.ExtractWordsFn;
-import com.google.cloud.dataflow.examples.WordCount.FormatAsTextFn;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
-import com.google.cloud.dataflow.sdk.transforms.MapElements;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.hamcrest.CoreMatchers;
-import org.junit.Assert;
-import org.junit.Test;
-import org.junit.experimental.categories.Category;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * Tests of WordCount.
- */
-@RunWith(JUnit4.class)
-public class WordCountTest {
-
- /** Example test that tests a specific DoFn. */
- @Test
- public void testExtractWordsFn() {
- DoFnTester<String, String> extractWordsFn =
- DoFnTester.of(new ExtractWordsFn());
-
- Assert.assertThat(extractWordsFn.processBatch(" some input words "),
- CoreMatchers.hasItems("some", "input", "words"));
- Assert.assertThat(extractWordsFn.processBatch(" "),
- CoreMatchers.<String>hasItems());
- Assert.assertThat(extractWordsFn.processBatch(" some ", " input", " words"),
- CoreMatchers.hasItems("some", "input", "words"));
- }
-
- static final String[] WORDS_ARRAY = new String[] {
- "hi there", "hi", "hi sue bob",
- "hi sue", "", "bob hi"};
-
- static final List<String> WORDS = Arrays.asList(WORDS_ARRAY);
-
- static final String[] COUNTS_ARRAY = new String[] {
- "hi: 5", "there: 1", "sue: 2", "bob: 2"};
-
- /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */
- @Test
- @Category(RunnableOnService.class)
- public void testCountWords() throws Exception {
- Pipeline p = TestPipeline.create();
-
- PCollection<String> input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));
-
- PCollection<String> output = input.apply(new CountWords())
- .apply(MapElements.via(new FormatAsTextFn()));
-
- DataflowAssert.that(output).containsInAnyOrder(COUNTS_ARRAY);
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/test/java/com/google/cloud/dataflow/examples/complete/AutoCompleteTest.java
----------------------------------------------------------------------
diff --git a/examples/src/test/java/com/google/cloud/dataflow/examples/complete/AutoCompleteTest.java b/examples/src/test/java/com/google/cloud/dataflow/examples/complete/AutoCompleteTest.java
deleted file mode 100644
index aec1557..0000000
--- a/examples/src/test/java/com/google/cloud/dataflow/examples/complete/AutoCompleteTest.java
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete;
-
-import com.google.cloud.dataflow.examples.complete.AutoComplete.CompletionCandidate;
-import com.google.cloud.dataflow.examples.complete.AutoComplete.ComputeTopCompletions;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.Filter;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
-import com.google.cloud.dataflow.sdk.transforms.windowing.SlidingWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TimestampedValue;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-
-/**
- * Tests of AutoComplete.
- */
-@RunWith(Parameterized.class)
-public class AutoCompleteTest implements Serializable {
- private boolean recursive;
-
- public AutoCompleteTest(Boolean recursive) {
- this.recursive = recursive;
- }
-
- @Parameterized.Parameters
- public static Collection<Object[]> testRecursive() {
- return Arrays.asList(new Object[][] {
- { true },
- { false }
- });
- }
-
- @Test
- public void testAutoComplete() {
- List<String> words = Arrays.asList(
- "apple",
- "apple",
- "apricot",
- "banana",
- "blackberry",
- "blackberry",
- "blackberry",
- "blueberry",
- "blueberry",
- "cherry");
-
- Pipeline p = TestPipeline.create();
-
- PCollection<String> input = p.apply(Create.of(words));
-
- PCollection<KV<String, List<CompletionCandidate>>> output =
- input.apply(new ComputeTopCompletions(2, recursive))
- .apply(Filter.byPredicate(
- new SerializableFunction<KV<String, List<CompletionCandidate>>, Boolean>() {
- @Override
- public Boolean apply(KV<String, List<CompletionCandidate>> element) {
- return element.getKey().length() <= 2;
- }
- }));
-
- DataflowAssert.that(output).containsInAnyOrder(
- KV.of("a", parseList("apple:2", "apricot:1")),
- KV.of("ap", parseList("apple:2", "apricot:1")),
- KV.of("b", parseList("blackberry:3", "blueberry:2")),
- KV.of("ba", parseList("banana:1")),
- KV.of("bl", parseList("blackberry:3", "blueberry:2")),
- KV.of("c", parseList("cherry:1")),
- KV.of("ch", parseList("cherry:1")));
- p.run();
- }
-
- @Test
- public void testTinyAutoComplete() {
- List<String> words = Arrays.asList("x", "x", "x", "xy", "xy", "xyz");
-
- Pipeline p = TestPipeline.create();
-
- PCollection<String> input = p.apply(Create.of(words));
-
- PCollection<KV<String, List<CompletionCandidate>>> output =
- input.apply(new ComputeTopCompletions(2, recursive));
-
- DataflowAssert.that(output).containsInAnyOrder(
- KV.of("x", parseList("x:3", "xy:2")),
- KV.of("xy", parseList("xy:2", "xyz:1")),
- KV.of("xyz", parseList("xyz:1")));
- p.run();
- }
-
- @Test
- public void testWindowedAutoComplete() {
- List<TimestampedValue<String>> words = Arrays.asList(
- TimestampedValue.of("xA", new Instant(1)),
- TimestampedValue.of("xA", new Instant(1)),
- TimestampedValue.of("xB", new Instant(1)),
- TimestampedValue.of("xB", new Instant(2)),
- TimestampedValue.of("xB", new Instant(2)));
-
- Pipeline p = TestPipeline.create();
-
- PCollection<String> input = p
- .apply(Create.of(words))
- .apply(new ReifyTimestamps<String>());
-
- PCollection<KV<String, List<CompletionCandidate>>> output =
- input.apply(Window.<String>into(SlidingWindows.of(new Duration(2))))
- .apply(new ComputeTopCompletions(2, recursive));
-
- DataflowAssert.that(output).containsInAnyOrder(
- // Window [0, 2)
- KV.of("x", parseList("xA:2", "xB:1")),
- KV.of("xA", parseList("xA:2")),
- KV.of("xB", parseList("xB:1")),
-
- // Window [1, 3)
- KV.of("x", parseList("xB:3", "xA:2")),
- KV.of("xA", parseList("xA:2")),
- KV.of("xB", parseList("xB:3")),
-
- // Window [2, 3)
- KV.of("x", parseList("xB:2")),
- KV.of("xB", parseList("xB:2")));
- p.run();
- }
-
- private static List<CompletionCandidate> parseList(String... entries) {
- List<CompletionCandidate> all = new ArrayList<>();
- for (String s : entries) {
- String[] countValue = s.split(":");
- all.add(new CompletionCandidate(countValue[0], Integer.valueOf(countValue[1])));
- }
- return all;
- }
-
- private static class ReifyTimestamps<T>
- extends PTransform<PCollection<TimestampedValue<T>>, PCollection<T>> {
- @Override
- public PCollection<T> apply(PCollection<TimestampedValue<T>> input) {
- return input.apply(ParDo.of(new DoFn<TimestampedValue<T>, T>() {
- @Override
- public void processElement(ProcessContext c) {
- c.outputWithTimestamp(c.element().getValue(), c.element().getTimestamp());
- }
- }));
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/test/java/com/google/cloud/dataflow/examples/complete/TfIdfTest.java
----------------------------------------------------------------------
diff --git a/examples/src/test/java/com/google/cloud/dataflow/examples/complete/TfIdfTest.java b/examples/src/test/java/com/google/cloud/dataflow/examples/complete/TfIdfTest.java
deleted file mode 100644
index 5ee136c..0000000
--- a/examples/src/test/java/com/google/cloud/dataflow/examples/complete/TfIdfTest.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.StringDelegateCoder;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.Keys;
-import com.google.cloud.dataflow.sdk.transforms.RemoveDuplicates;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.junit.Test;
-import org.junit.experimental.categories.Category;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.net.URI;
-import java.util.Arrays;
-
-/**
- * Tests of {@link TfIdf}.
- */
-@RunWith(JUnit4.class)
-public class TfIdfTest {
-
- /** Test that the example runs. */
- @Test
- @Category(RunnableOnService.class)
- public void testTfIdf() throws Exception {
- Pipeline pipeline = TestPipeline.create();
-
- pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));
-
- PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf = pipeline
- .apply(Create.of(
- KV.of(new URI("x"), "a b c d"),
- KV.of(new URI("y"), "a b c"),
- KV.of(new URI("z"), "a m n")))
- .apply(new TfIdf.ComputeTfIdf());
-
- PCollection<String> words = wordToUriAndTfIdf
- .apply(Keys.<String>create())
- .apply(RemoveDuplicates.<String>create());
-
- DataflowAssert.that(words).containsInAnyOrder(Arrays.asList("a", "m", "n", "b", "c", "d"));
-
- pipeline.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/test/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessionsTest.java
----------------------------------------------------------------------
diff --git a/examples/src/test/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessionsTest.java b/examples/src/test/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessionsTest.java
deleted file mode 100644
index ce9de51..0000000
--- a/examples/src/test/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessionsTest.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.complete;
-
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.junit.Test;
-import org.junit.experimental.categories.Category;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.util.Arrays;
-
-/** Unit tests for {@link TopWikipediaSessions}. */
-@RunWith(JUnit4.class)
-public class TopWikipediaSessionsTest {
- @Test
- @Category(RunnableOnService.class)
- public void testComputeTopUsers() {
- Pipeline p = TestPipeline.create();
-
- PCollection<String> output =
- p.apply(Create.of(Arrays.asList(
- new TableRow().set("timestamp", 0).set("contributor_username", "user1"),
- new TableRow().set("timestamp", 1).set("contributor_username", "user1"),
- new TableRow().set("timestamp", 2).set("contributor_username", "user1"),
- new TableRow().set("timestamp", 0).set("contributor_username", "user2"),
- new TableRow().set("timestamp", 1).set("contributor_username", "user2"),
- new TableRow().set("timestamp", 3601).set("contributor_username", "user2"),
- new TableRow().set("timestamp", 3602).set("contributor_username", "user2"),
- new TableRow().set("timestamp", 35 * 24 * 3600).set("contributor_username", "user3"))))
- .apply(new TopWikipediaSessions.ComputeTopSessions(1.0));
-
- DataflowAssert.that(output).containsInAnyOrder(Arrays.asList(
- "user1 : [1970-01-01T00:00:00.000Z..1970-01-01T01:00:02.000Z)"
- + " : 3 : 1970-01-01T00:00:00.000Z",
- "user3 : [1970-02-05T00:00:00.000Z..1970-02-05T01:00:00.000Z)"
- + " : 1 : 1970-02-01T00:00:00.000Z"));
-
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoesTest.java
----------------------------------------------------------------------
diff --git a/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoesTest.java b/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoesTest.java
deleted file mode 100644
index 6dce4ed..0000000
--- a/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoesTest.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.cookbook;
-
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.cloud.dataflow.examples.cookbook.BigQueryTornadoes.ExtractTornadoesFn;
-import com.google.cloud.dataflow.examples.cookbook.BigQueryTornadoes.FormatCountsFn;
-import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
-import com.google.cloud.dataflow.sdk.values.KV;
-
-import org.hamcrest.CoreMatchers;
-import org.junit.Assert;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.util.List;
-
-/**
- * Test case for {@link BigQueryTornadoes}.
- */
-@RunWith(JUnit4.class)
-public class BigQueryTornadoesTest {
-
- @Test
- public void testExtractTornadoes() throws Exception {
- TableRow row = new TableRow()
- .set("month", "6")
- .set("tornado", true);
- DoFnTester<TableRow, Integer> extractWordsFn =
- DoFnTester.of(new ExtractTornadoesFn());
- Assert.assertThat(extractWordsFn.processBatch(row),
- CoreMatchers.hasItems(6));
- }
-
- @Test
- public void testNoTornadoes() throws Exception {
- TableRow row = new TableRow()
- .set("month", 6)
- .set("tornado", false);
- DoFnTester<TableRow, Integer> extractWordsFn =
- DoFnTester.of(new ExtractTornadoesFn());
- Assert.assertTrue(extractWordsFn.processBatch(row).isEmpty());
- }
-
- @Test
- @SuppressWarnings({"rawtypes", "unchecked"})
- public void testFormatCounts() throws Exception {
- DoFnTester<KV<Integer, Long>, TableRow> formatCountsFn =
- DoFnTester.of(new FormatCountsFn());
- KV empty[] = {};
- List<TableRow> results = formatCountsFn.processBatch(empty);
- Assert.assertTrue(results.size() == 0);
- KV input[] = { KV.of(3, 0L),
- KV.of(4, Long.MAX_VALUE),
- KV.of(5, Long.MIN_VALUE) };
- results = formatCountsFn.processBatch(input);
- Assert.assertEquals(results.size(), 3);
- Assert.assertEquals(results.get(0).get("month"), 3);
- Assert.assertEquals(results.get(0).get("tornado_count"), 0L);
- Assert.assertEquals(results.get(1).get("month"), 4);
- Assert.assertEquals(results.get(1).get("tornado_count"), Long.MAX_VALUE);
- Assert.assertEquals(results.get(2).get("month"), 5);
- Assert.assertEquals(results.get(2).get("tornado_count"), Long.MIN_VALUE);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamplesTest.java
----------------------------------------------------------------------
diff --git a/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamplesTest.java b/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamplesTest.java
deleted file mode 100644
index fe4823d..0000000
--- a/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamplesTest.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.cookbook;
-
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.cloud.dataflow.examples.cookbook.CombinePerKeyExamples.ExtractLargeWordsFn;
-import com.google.cloud.dataflow.examples.cookbook.CombinePerKeyExamples.FormatShakespeareOutputFn;
-import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
-import com.google.cloud.dataflow.sdk.values.KV;
-
-import org.hamcrest.CoreMatchers;
-import org.junit.Assert;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.util.List;
-
-/** Unit tests for {@link CombinePerKeyExamples}. */
-@RunWith(JUnit4.class)
-public class CombinePerKeyExamplesTest {
-
- private static final TableRow row1 = new TableRow()
- .set("corpus", "king_lear").set("word", "snuffleupaguses");
- private static final TableRow row2 = new TableRow()
- .set("corpus", "macbeth").set("word", "antidisestablishmentarianism");
- private static final TableRow row3 = new TableRow()
- .set("corpus", "king_lear").set("word", "antidisestablishmentarianism");
- private static final TableRow row4 = new TableRow()
- .set("corpus", "macbeth").set("word", "bob");
- private static final TableRow row5 = new TableRow()
- .set("corpus", "king_lear").set("word", "hi");
-
- static final TableRow[] ROWS_ARRAY = new TableRow[] {
- row1, row2, row3, row4, row5
- };
-
- private static final KV<String, String> tuple1 = KV.of("snuffleupaguses", "king_lear");
- private static final KV<String, String> tuple2 = KV.of("antidisestablishmentarianism", "macbeth");
- private static final KV<String, String> tuple3 = KV.of("antidisestablishmentarianism",
- "king_lear");
-
- private static final KV<String, String> combinedTuple1 = KV.of("antidisestablishmentarianism",
- "king_lear,macbeth");
- private static final KV<String, String> combinedTuple2 = KV.of("snuffleupaguses", "king_lear");
-
- @SuppressWarnings({"unchecked", "rawtypes"})
- static final KV<String, String>[] COMBINED_TUPLES_ARRAY = new KV[] {
- combinedTuple1, combinedTuple2
- };
-
- private static final TableRow resultRow1 = new TableRow()
- .set("word", "snuffleupaguses").set("all_plays", "king_lear");
- private static final TableRow resultRow2 = new TableRow()
- .set("word", "antidisestablishmentarianism")
- .set("all_plays", "king_lear,macbeth");
-
- @Test
- public void testExtractLargeWordsFn() {
- DoFnTester<TableRow, KV<String, String>> extractLargeWordsFn =
- DoFnTester.of(new ExtractLargeWordsFn());
- List<KV<String, String>> results = extractLargeWordsFn.processBatch(ROWS_ARRAY);
- Assert.assertThat(results, CoreMatchers.hasItem(tuple1));
- Assert.assertThat(results, CoreMatchers.hasItem(tuple2));
- Assert.assertThat(results, CoreMatchers.hasItem(tuple3));
- }
-
- @Test
- public void testFormatShakespeareOutputFn() {
- DoFnTester<KV<String, String>, TableRow> formatShakespeareOutputFn =
- DoFnTester.of(new FormatShakespeareOutputFn());
- List<TableRow> results = formatShakespeareOutputFn.processBatch(COMBINED_TUPLES_ARRAY);
- Assert.assertThat(results, CoreMatchers.hasItem(resultRow1));
- Assert.assertThat(results, CoreMatchers.hasItem(resultRow2));
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/DeDupExampleTest.java
----------------------------------------------------------------------
diff --git a/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/DeDupExampleTest.java b/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/DeDupExampleTest.java
deleted file mode 100644
index bce6b11..0000000
--- a/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/DeDupExampleTest.java
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.cookbook;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.RemoveDuplicates;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.junit.Test;
-import org.junit.experimental.categories.Category;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.util.Arrays;
-import java.util.List;
-
-/** Unit tests for {@link DeDupExample}. */
-@RunWith(JUnit4.class)
-public class DeDupExampleTest {
-
- @Test
- @Category(RunnableOnService.class)
- public void testRemoveDuplicates() {
- List<String> strings = Arrays.asList(
- "k1",
- "k5",
- "k5",
- "k2",
- "k1",
- "k2",
- "k3");
-
- Pipeline p = TestPipeline.create();
-
- PCollection<String> input =
- p.apply(Create.of(strings)
- .withCoder(StringUtf8Coder.of()));
-
- PCollection<String> output =
- input.apply(RemoveDuplicates.<String>create());
-
- DataflowAssert.that(output)
- .containsInAnyOrder("k1", "k5", "k2", "k3");
- p.run();
- }
-
- @Test
- @Category(RunnableOnService.class)
- public void testRemoveDuplicatesEmpty() {
- List<String> strings = Arrays.asList();
-
- Pipeline p = TestPipeline.create();
-
- PCollection<String> input =
- p.apply(Create.of(strings)
- .withCoder(StringUtf8Coder.of()));
-
- PCollection<String> output =
- input.apply(RemoveDuplicates.<String>create());
-
- DataflowAssert.that(output).empty();
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/FilterExamplesTest.java
----------------------------------------------------------------------
diff --git a/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/FilterExamplesTest.java b/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/FilterExamplesTest.java
deleted file mode 100644
index 6d822f9..0000000
--- a/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/FilterExamplesTest.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.cookbook;
-
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.cloud.dataflow.examples.cookbook.FilterExamples.FilterSingleMonthDataFn;
-import com.google.cloud.dataflow.examples.cookbook.FilterExamples.ProjectionFn;
-import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
-
-import org.hamcrest.CoreMatchers;
-import org.junit.Assert;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.util.Arrays;
-import java.util.List;
-
-/** Unit tests for {@link FilterExamples}. */
-@RunWith(JUnit4.class)
-public class FilterExamplesTest {
-
- private static final TableRow row1 = new TableRow()
- .set("month", "6").set("day", "21")
- .set("year", "2014").set("mean_temp", "85.3")
- .set("tornado", true);
- private static final TableRow row2 = new TableRow()
- .set("month", "7").set("day", "20")
- .set("year", "2014").set("mean_temp", "75.4")
- .set("tornado", false);
- private static final TableRow row3 = new TableRow()
- .set("month", "6").set("day", "18")
- .set("year", "2014").set("mean_temp", "45.3")
- .set("tornado", true);
- static final TableRow[] ROWS_ARRAY = new TableRow[] {
- row1, row2, row3
- };
- static final List<TableRow> ROWS = Arrays.asList(ROWS_ARRAY);
-
- private static final TableRow outRow1 = new TableRow()
- .set("year", 2014).set("month", 6)
- .set("day", 21).set("mean_temp", 85.3);
- private static final TableRow outRow2 = new TableRow()
- .set("year", 2014).set("month", 7)
- .set("day", 20).set("mean_temp", 75.4);
- private static final TableRow outRow3 = new TableRow()
- .set("year", 2014).set("month", 6)
- .set("day", 18).set("mean_temp", 45.3);
- private static final TableRow[] PROJROWS_ARRAY = new TableRow[] {
- outRow1, outRow2, outRow3
- };
-
-
- @Test
- public void testProjectionFn() {
- DoFnTester<TableRow, TableRow> projectionFn =
- DoFnTester.of(new ProjectionFn());
- List<TableRow> results = projectionFn.processBatch(ROWS_ARRAY);
- Assert.assertThat(results, CoreMatchers.hasItem(outRow1));
- Assert.assertThat(results, CoreMatchers.hasItem(outRow2));
- Assert.assertThat(results, CoreMatchers.hasItem(outRow3));
- }
-
- @Test
- public void testFilterSingleMonthDataFn() {
- DoFnTester<TableRow, TableRow> filterSingleMonthDataFn =
- DoFnTester.of(new FilterSingleMonthDataFn(7));
- List<TableRow> results = filterSingleMonthDataFn.processBatch(PROJROWS_ARRAY);
- Assert.assertThat(results, CoreMatchers.hasItem(outRow2));
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/JoinExamplesTest.java
----------------------------------------------------------------------
diff --git a/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/JoinExamplesTest.java b/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/JoinExamplesTest.java
deleted file mode 100644
index db3ae34..0000000
--- a/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/JoinExamplesTest.java
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.cookbook;
-
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.cloud.dataflow.examples.cookbook.JoinExamples.ExtractCountryInfoFn;
-import com.google.cloud.dataflow.examples.cookbook.JoinExamples.ExtractEventDataFn;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.hamcrest.CoreMatchers;
-import org.junit.Assert;
-import org.junit.Test;
-import org.junit.experimental.categories.Category;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.util.Arrays;
-import java.util.List;
-
-/** Unit tests for {@link JoinExamples}. */
-@RunWith(JUnit4.class)
-public class JoinExamplesTest {
-
- private static final TableRow row1 = new TableRow()
- .set("ActionGeo_CountryCode", "VM").set("SQLDATE", "20141212")
- .set("Actor1Name", "BANGKOK").set("SOURCEURL", "http://cnn.com");
- private static final TableRow row2 = new TableRow()
- .set("ActionGeo_CountryCode", "VM").set("SQLDATE", "20141212")
- .set("Actor1Name", "LAOS").set("SOURCEURL", "http://www.chicagotribune.com");
- private static final TableRow row3 = new TableRow()
- .set("ActionGeo_CountryCode", "BE").set("SQLDATE", "20141213")
- .set("Actor1Name", "AFGHANISTAN").set("SOURCEURL", "http://cnn.com");
- static final TableRow[] EVENTS = new TableRow[] {
- row1, row2, row3
- };
- static final List<TableRow> EVENT_ARRAY = Arrays.asList(EVENTS);
-
- private static final KV<String, String> kv1 = KV.of("VM",
- "Date: 20141212, Actor1: LAOS, url: http://www.chicagotribune.com");
- private static final KV<String, String> kv2 = KV.of("BE",
- "Date: 20141213, Actor1: AFGHANISTAN, url: http://cnn.com");
- private static final KV<String, String> kv3 = KV.of("BE", "Belgium");
- private static final KV<String, String> kv4 = KV.of("VM", "Vietnam");
-
- private static final TableRow cc1 = new TableRow()
- .set("FIPSCC", "VM").set("HumanName", "Vietnam");
- private static final TableRow cc2 = new TableRow()
- .set("FIPSCC", "BE").set("HumanName", "Belgium");
- static final TableRow[] CCS = new TableRow[] {
- cc1, cc2
- };
- static final List<TableRow> CC_ARRAY = Arrays.asList(CCS);
-
- static final String[] JOINED_EVENTS = new String[] {
- "Country code: VM, Country name: Vietnam, Event info: Date: 20141212, Actor1: LAOS, "
- + "url: http://www.chicagotribune.com",
- "Country code: VM, Country name: Vietnam, Event info: Date: 20141212, Actor1: BANGKOK, "
- + "url: http://cnn.com",
- "Country code: BE, Country name: Belgium, Event info: Date: 20141213, Actor1: AFGHANISTAN, "
- + "url: http://cnn.com"
- };
-
- @Test
- public void testExtractEventDataFn() {
- DoFnTester<TableRow, KV<String, String>> extractEventDataFn =
- DoFnTester.of(new ExtractEventDataFn());
- List<KV<String, String>> results = extractEventDataFn.processBatch(EVENTS);
- Assert.assertThat(results, CoreMatchers.hasItem(kv1));
- Assert.assertThat(results, CoreMatchers.hasItem(kv2));
- }
-
- @Test
- public void testExtractCountryInfoFn() {
- DoFnTester<TableRow, KV<String, String>> extractCountryInfoFn =
- DoFnTester.of(new ExtractCountryInfoFn());
- List<KV<String, String>> results = extractCountryInfoFn.processBatch(CCS);
- Assert.assertThat(results, CoreMatchers.hasItem(kv3));
- Assert.assertThat(results, CoreMatchers.hasItem(kv4));
- }
-
-
- @Test
- @Category(RunnableOnService.class)
- public void testJoin() throws java.lang.Exception {
- Pipeline p = TestPipeline.create();
- PCollection<TableRow> input1 = p.apply("CreateEvent", Create.of(EVENT_ARRAY));
- PCollection<TableRow> input2 = p.apply("CreateCC", Create.of(CC_ARRAY));
-
- PCollection<String> output = JoinExamples.joinEvents(input1, input2);
- DataflowAssert.that(output).containsInAnyOrder(JOINED_EVENTS);
- p.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamplesTest.java
----------------------------------------------------------------------
diff --git a/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamplesTest.java b/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamplesTest.java
deleted file mode 100644
index 3deff2a..0000000
--- a/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamplesTest.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.cookbook;
-
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.cloud.dataflow.examples.cookbook.MaxPerKeyExamples.ExtractTempFn;
-import com.google.cloud.dataflow.examples.cookbook.MaxPerKeyExamples.FormatMaxesFn;
-import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.common.collect.ImmutableList;
-
-import org.hamcrest.CoreMatchers;
-import org.junit.Assert;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.util.List;
-
-/** Unit tests for {@link MaxPerKeyExamples}. */
-@RunWith(JUnit4.class)
-public class MaxPerKeyExamplesTest {
-
- private static final TableRow row1 = new TableRow()
- .set("month", "6").set("day", "21")
- .set("year", "2014").set("mean_temp", "85.3")
- .set("tornado", true);
- private static final TableRow row2 = new TableRow()
- .set("month", "7").set("day", "20")
- .set("year", "2014").set("mean_temp", "75.4")
- .set("tornado", false);
- private static final TableRow row3 = new TableRow()
- .set("month", "6").set("day", "18")
- .set("year", "2014").set("mean_temp", "45.3")
- .set("tornado", true);
- private static final List<TableRow> TEST_ROWS = ImmutableList.of(row1, row2, row3);
-
- private static final KV<Integer, Double> kv1 = KV.of(6, 85.3);
- private static final KV<Integer, Double> kv2 = KV.of(6, 45.3);
- private static final KV<Integer, Double> kv3 = KV.of(7, 75.4);
-
- private static final List<KV<Integer, Double>> TEST_KVS = ImmutableList.of(kv1, kv2, kv3);
-
- private static final TableRow resultRow1 = new TableRow()
- .set("month", 6)
- .set("max_mean_temp", 85.3);
- private static final TableRow resultRow2 = new TableRow()
- .set("month", 7)
- .set("max_mean_temp", 75.4);
-
-
- @Test
- public void testExtractTempFn() {
- DoFnTester<TableRow, KV<Integer, Double>> extractTempFn =
- DoFnTester.of(new ExtractTempFn());
- List<KV<Integer, Double>> results = extractTempFn.processBatch(TEST_ROWS);
- Assert.assertThat(results, CoreMatchers.hasItem(kv1));
- Assert.assertThat(results, CoreMatchers.hasItem(kv2));
- Assert.assertThat(results, CoreMatchers.hasItem(kv3));
- }
-
- @Test
- public void testFormatMaxesFn() {
- DoFnTester<KV<Integer, Double>, TableRow> formatMaxesFnFn =
- DoFnTester.of(new FormatMaxesFn());
- List<TableRow> results = formatMaxesFnFn.processBatch(TEST_KVS);
- Assert.assertThat(results, CoreMatchers.hasItem(resultRow1));
- Assert.assertThat(results, CoreMatchers.hasItem(resultRow2));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/TriggerExampleTest.java
----------------------------------------------------------------------
diff --git a/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/TriggerExampleTest.java b/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/TriggerExampleTest.java
deleted file mode 100644
index 209ea52..0000000
--- a/examples/src/test/java/com/google/cloud/dataflow/examples/cookbook/TriggerExampleTest.java
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.examples.cookbook;
-
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.cloud.dataflow.examples.cookbook.TriggerExample.ExtractFlowInfo;
-import com.google.cloud.dataflow.examples.cookbook.TriggerExample.TotalFlow;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.TimestampedValue;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-import org.junit.Assert;
-import org.junit.Test;
-import org.junit.experimental.categories.Category;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * Unit Tests for {@link TriggerExample}.
- * The results generated by triggers are by definition non-deterministic and hence hard to test.
- * The unit test does not test all aspects of the example.
- */
-@RunWith(JUnit4.class)
-public class TriggerExampleTest {
-
- private static final String[] INPUT =
- {"01/01/2010 00:00:00,1108302,94,E,ML,36,100,29,0.0065,66,9,1,0.001,74.8,1,9,3,0.0028,71,1,9,"
- + "12,0.0099,67.4,1,9,13,0.0121,99.0,1,,,,,0,,,,,0,,,,,0,,,,,0", "01/01/2010 00:00:00,"
- + "1100333,5,N,FR,9,0,39,,,9,,,,0,,,,,0,,,,,0,,,,,0,,,,,0,,,,,0,,,,,0,,,,"};
-
- private static final List<TimestampedValue<String>> TIME_STAMPED_INPUT = Arrays.asList(
- TimestampedValue.of("01/01/2010 00:00:00,1108302,5,W,ML,36,100,30,0.0065,66,9,1,0.001,"
- + "74.8,1,9,3,0.0028,71,1,9,12,0.0099,87.4,1,9,13,0.0121,99.0,1,,,,,0,,,,,0,,,,,0,,,"
- + ",,0", new Instant(60000)),
- TimestampedValue.of("01/01/2010 00:00:00,1108302,110,E,ML,36,100,40,0.0065,66,9,1,0.001,"
- + "74.8,1,9,3,0.0028,71,1,9,12,0.0099,67.4,1,9,13,0.0121,99.0,1,,,,,0,,,,,0,,,,,0,,,"
- + ",,0", new Instant(1)),
- TimestampedValue.of("01/01/2010 00:00:00,1108302,110,E,ML,36,100,50,0.0065,66,9,1,"
- + "0.001,74.8,1,9,3,0.0028,71,1,9,12,0.0099,97.4,1,9,13,0.0121,50.0,1,,,,,0,,,,,0"
- + ",,,,,0,,,,,0", new Instant(1)));
-
- private static final TableRow OUT_ROW_1 = new TableRow()
- .set("trigger_type", "default")
- .set("freeway", "5").set("total_flow", 30)
- .set("number_of_records", 1)
- .set("isFirst", true).set("isLast", true)
- .set("timing", "ON_TIME")
- .set("window", "[1970-01-01T00:01:00.000Z..1970-01-01T00:02:00.000Z)");
-
- private static final TableRow OUT_ROW_2 = new TableRow()
- .set("trigger_type", "default")
- .set("freeway", "110").set("total_flow", 90)
- .set("number_of_records", 2)
- .set("isFirst", true).set("isLast", true)
- .set("timing", "ON_TIME")
- .set("window", "[1970-01-01T00:00:00.000Z..1970-01-01T00:01:00.000Z)");
-
- @Test
- public void testExtractTotalFlow() {
- DoFnTester<String, KV<String, Integer>> extractFlowInfow = DoFnTester
- .of(new ExtractFlowInfo());
-
- List<KV<String, Integer>> results = extractFlowInfow.processBatch(INPUT);
- Assert.assertEquals(results.size(), 1);
- Assert.assertEquals(results.get(0).getKey(), "94");
- Assert.assertEquals(results.get(0).getValue(), new Integer(29));
-
- List<KV<String, Integer>> output = extractFlowInfow.processBatch("");
- Assert.assertEquals(output.size(), 0);
- }
-
- @Test
- @Category(RunnableOnService.class)
- public void testTotalFlow () {
- Pipeline pipeline = TestPipeline.create();
- PCollection<KV<String, Integer>> flow = pipeline
- .apply(Create.timestamped(TIME_STAMPED_INPUT))
- .apply(ParDo.of(new ExtractFlowInfo()));
-
- PCollection<TableRow> totalFlow = flow
- .apply(Window.<KV<String, Integer>>into(FixedWindows.of(Duration.standardMinutes(1))))
- .apply(new TotalFlow("default"));
-
- PCollection<TableRow> results = totalFlow.apply(ParDo.of(new FormatResults()));
-
-
- DataflowAssert.that(results).containsInAnyOrder(OUT_ROW_1, OUT_ROW_2);
- pipeline.run();
-
- }
-
- static class FormatResults extends DoFn<TableRow, TableRow> {
- @Override
- public void processElement(ProcessContext c) throws Exception {
- TableRow element = c.element();
- TableRow row = new TableRow()
- .set("trigger_type", element.get("trigger_type"))
- .set("freeway", element.get("freeway"))
- .set("total_flow", element.get("total_flow"))
- .set("number_of_records", element.get("number_of_records"))
- .set("isFirst", element.get("isFirst"))
- .set("isLast", element.get("isLast"))
- .set("timing", element.get("timing"))
- .set("window", element.get("window"));
- c.output(row);
- }
- }
-}
-
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 3145c40..2d0a3e1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -127,7 +127,7 @@
<modules>
<module>sdks/java/core</module>
<module>runners</module>
- <module>examples</module>
+ <module>examples/java</module>
<module>maven-archetypes</module>
</modules>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/travis/test_wordcount.sh
----------------------------------------------------------------------
diff --git a/travis/test_wordcount.sh b/travis/test_wordcount.sh
index fdb9d10..fdd878d 100755
--- a/travis/test_wordcount.sh
+++ b/travis/test_wordcount.sh
@@ -19,7 +19,7 @@ set -o pipefail
PASS=1
VERSION=$(mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -Dexpression=project.version | grep -v '\[')
-JAR_FILE=examples/target/google-cloud-dataflow-java-examples-all-bundled-${VERSION}.jar
+JAR_FILE=examples/java/target/java-examples-all-bundled-${VERSION}.jar
function check_result_hash {
local name=$1
@@ -52,7 +52,7 @@ function run_via_mvn {
local expected_hash=$3
local outfile_prefix="$(get_outfile_prefix "$name")" || exit 2
- local cmd='mvn exec:java -f pom.xml -pl examples \
+ local cmd='mvn exec:java -f pom.xml -pl examples/java \
-Dexec.mainClass=com.google.cloud.dataflow.examples.WordCount \
-Dexec.args="--runner=DirectPipelineRunner --inputFile='"$input"' --output='"$outfile_prefix"'"'
echo "$name: Running $cmd" >&2
[29/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformExecutorServices.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformExecutorServices.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformExecutorServices.java
deleted file mode 100644
index 34efdf6..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/TransformExecutorServices.java
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.common.base.MoreObjects;
-
-import java.util.Map;
-import java.util.Queue;
-import java.util.concurrent.ConcurrentLinkedQueue;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.atomic.AtomicReference;
-
-/**
- * Static factory methods for constructing instances of {@link TransformExecutorService}.
- */
-final class TransformExecutorServices {
- private TransformExecutorServices() {
- // Do not instantiate
- }
-
- /**
- * Returns an EvaluationState that evaluates {@link TransformExecutor TransformExecutors} in
- * parallel.
- */
- public static TransformExecutorService parallel(
- ExecutorService executor, Map<TransformExecutor<?>, Boolean> scheduled) {
- return new ParallelEvaluationState(executor, scheduled);
- }
-
- /**
- * Returns an EvaluationState that evaluates {@link TransformExecutor TransformExecutors} in
- * serial.
- */
- public static TransformExecutorService serial(
- ExecutorService executor, Map<TransformExecutor<?>, Boolean> scheduled) {
- return new SerialEvaluationState(executor, scheduled);
- }
-
- /**
- * A {@link TransformExecutorService} with unlimited parallelism. Any {@link TransformExecutor}
- * scheduled will be immediately submitted to the {@link ExecutorService}.
- *
- * <p>A principal use of this is for the evaluation of an unkeyed Step. Unkeyed computations are
- * processed in parallel.
- */
- private static class ParallelEvaluationState implements TransformExecutorService {
- private final ExecutorService executor;
- private final Map<TransformExecutor<?>, Boolean> scheduled;
-
- private ParallelEvaluationState(
- ExecutorService executor, Map<TransformExecutor<?>, Boolean> scheduled) {
- this.executor = executor;
- this.scheduled = scheduled;
- }
-
- @Override
- public void schedule(TransformExecutor<?> work) {
- executor.submit(work);
- scheduled.put(work, true);
- }
-
- @Override
- public void complete(TransformExecutor<?> completed) {
- scheduled.remove(completed);
- }
- }
-
- /**
- * A {@link TransformExecutorService} with a single work queue. Any {@link TransformExecutor}
- * scheduled will be placed on the work queue. Only one item of work will be submitted to the
- * {@link ExecutorService} at any time.
- *
- * <p>A principal use of this is for the serial evaluation of a (Step, Key) pair.
- * Keyed computations are processed serially per step.
- */
- private static class SerialEvaluationState implements TransformExecutorService {
- private final ExecutorService executor;
- private final Map<TransformExecutor<?>, Boolean> scheduled;
-
- private AtomicReference<TransformExecutor<?>> currentlyEvaluating;
- private final Queue<TransformExecutor<?>> workQueue;
-
- private SerialEvaluationState(
- ExecutorService executor, Map<TransformExecutor<?>, Boolean> scheduled) {
- this.scheduled = scheduled;
- this.executor = executor;
- this.currentlyEvaluating = new AtomicReference<>();
- this.workQueue = new ConcurrentLinkedQueue<>();
- }
-
- /**
- * Schedules the work, adding it to the work queue if there is a bundle currently being
- * evaluated and scheduling it immediately otherwise.
- */
- @Override
- public void schedule(TransformExecutor<?> work) {
- workQueue.offer(work);
- updateCurrentlyEvaluating();
- }
-
- @Override
- public void complete(TransformExecutor<?> completed) {
- if (!currentlyEvaluating.compareAndSet(completed, null)) {
- throw new IllegalStateException(
- "Finished work "
- + completed
- + " but could not complete due to unexpected currently executing "
- + currentlyEvaluating.get());
- }
- scheduled.remove(completed);
- updateCurrentlyEvaluating();
- }
-
- private void updateCurrentlyEvaluating() {
- if (currentlyEvaluating.get() == null) {
- // Only synchronize if we need to update what's currently evaluating
- synchronized (this) {
- TransformExecutor<?> newWork = workQueue.poll();
- if (newWork != null) {
- if (currentlyEvaluating.compareAndSet(null, newWork)) {
- scheduled.put(newWork, true);
- executor.submit(newWork);
- } else {
- workQueue.offer(newWork);
- }
- }
- }
- }
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(SerialEvaluationState.class)
- .add("currentlyEvaluating", currentlyEvaluating)
- .add("workQueue", workQueue)
- .toString();
- }
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/UnboundedReadEvaluatorFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/UnboundedReadEvaluatorFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/UnboundedReadEvaluatorFactory.java
deleted file mode 100644
index 549afab..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/UnboundedReadEvaluatorFactory.java
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.io.Read.Unbounded;
-import com.google.cloud.dataflow.sdk.io.UnboundedSource;
-import com.google.cloud.dataflow.sdk.io.UnboundedSource.CheckpointMark;
-import com.google.cloud.dataflow.sdk.io.UnboundedSource.UnboundedReader;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.UncommittedBundle;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import java.io.IOException;
-import java.util.Queue;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ConcurrentLinkedQueue;
-import java.util.concurrent.ConcurrentMap;
-
-import javax.annotation.Nullable;
-
-/**
- * A {@link TransformEvaluatorFactory} that produces {@link TransformEvaluator TransformEvaluators}
- * for the {@link Unbounded Read.Unbounded} primitive {@link PTransform}.
- */
-class UnboundedReadEvaluatorFactory implements TransformEvaluatorFactory {
- /*
- * An evaluator for a Source is stateful, to ensure the CheckpointMark is properly persisted.
- * Evaluators are cached here to ensure that the checkpoint mark is appropriately reused
- * and any splits are honored.
- */
- private final ConcurrentMap<EvaluatorKey, Queue<? extends UnboundedReadEvaluator<?>>>
- sourceEvaluators = new ConcurrentHashMap<>();
-
- @SuppressWarnings({"unchecked", "rawtypes"})
- @Override
- public <InputT> TransformEvaluator<InputT> forApplication(AppliedPTransform<?, ?, ?> application,
- @Nullable CommittedBundle<?> inputBundle, InProcessEvaluationContext evaluationContext) {
- return getTransformEvaluator((AppliedPTransform) application, evaluationContext);
- }
-
- private <OutputT> TransformEvaluator<?> getTransformEvaluator(
- final AppliedPTransform<?, PCollection<OutputT>, Unbounded<OutputT>> transform,
- final InProcessEvaluationContext evaluationContext) {
- UnboundedReadEvaluator<?> currentEvaluator =
- getTransformEvaluatorQueue(transform, evaluationContext).poll();
- if (currentEvaluator == null) {
- return EmptyTransformEvaluator.create(transform);
- }
- return currentEvaluator;
- }
-
- /**
- * Get the queue of {@link TransformEvaluator TransformEvaluators} that produce elements for the
- * provided application of {@link Unbounded Read.Unbounded}, initializing it if required.
- *
- * <p>This method is thread-safe, and will only produce new evaluators if no other invocation has
- * already done so.
- */
- @SuppressWarnings("unchecked")
- private <OutputT> Queue<UnboundedReadEvaluator<OutputT>> getTransformEvaluatorQueue(
- final AppliedPTransform<?, PCollection<OutputT>, Unbounded<OutputT>> transform,
- final InProcessEvaluationContext evaluationContext) {
- // Key by the application and the context the evaluation is occurring in (which call to
- // Pipeline#run).
- EvaluatorKey key = new EvaluatorKey(transform, evaluationContext);
- @SuppressWarnings("unchecked")
- Queue<UnboundedReadEvaluator<OutputT>> evaluatorQueue =
- (Queue<UnboundedReadEvaluator<OutputT>>) sourceEvaluators.get(key);
- if (evaluatorQueue == null) {
- evaluatorQueue = new ConcurrentLinkedQueue<>();
- if (sourceEvaluators.putIfAbsent(key, evaluatorQueue) == null) {
- // If no queue existed in the evaluators, add an evaluator to initialize the evaluator
- // factory for this transform
- UnboundedReadEvaluator<OutputT> evaluator =
- new UnboundedReadEvaluator<OutputT>(transform, evaluationContext, evaluatorQueue);
- evaluatorQueue.offer(evaluator);
- } else {
- // otherwise return the existing Queue that arrived before us
- evaluatorQueue = (Queue<UnboundedReadEvaluator<OutputT>>) sourceEvaluators.get(key);
- }
- }
- return evaluatorQueue;
- }
-
- /**
- * A {@link UnboundedReadEvaluator} produces elements from an underlying {@link UnboundedSource},
- * discarding all input elements. Within the call to {@link #finishBundle()}, the evaluator
- * creates the {@link UnboundedReader} and consumes some currently available input.
- *
- * <p>Calls to {@link UnboundedReadEvaluator} are not internally thread-safe, and should only be
- * used by a single thread at a time. Each {@link UnboundedReadEvaluator} maintains its own
- * checkpoint, and constructs its reader from the current checkpoint in each call to
- * {@link #finishBundle()}.
- */
- private static class UnboundedReadEvaluator<OutputT> implements TransformEvaluator<Object> {
- private static final int ARBITRARY_MAX_ELEMENTS = 10;
- private final AppliedPTransform<?, PCollection<OutputT>, Unbounded<OutputT>> transform;
- private final InProcessEvaluationContext evaluationContext;
- private final Queue<UnboundedReadEvaluator<OutputT>> evaluatorQueue;
- private CheckpointMark checkpointMark;
-
- public UnboundedReadEvaluator(
- AppliedPTransform<?, PCollection<OutputT>, Unbounded<OutputT>> transform,
- InProcessEvaluationContext evaluationContext,
- Queue<UnboundedReadEvaluator<OutputT>> evaluatorQueue) {
- this.transform = transform;
- this.evaluationContext = evaluationContext;
- this.evaluatorQueue = evaluatorQueue;
- this.checkpointMark = null;
- }
-
- @Override
- public void processElement(WindowedValue<Object> element) {}
-
- @Override
- public InProcessTransformResult finishBundle() throws IOException {
- UncommittedBundle<OutputT> output = evaluationContext.createRootBundle(transform.getOutput());
- try (UnboundedReader<OutputT> reader =
- createReader(
- transform.getTransform().getSource(), evaluationContext.getPipelineOptions());) {
- int numElements = 0;
- if (reader.start()) {
- do {
- output.add(
- WindowedValue.timestampedValueInGlobalWindow(
- reader.getCurrent(), reader.getCurrentTimestamp()));
- numElements++;
- } while (numElements < ARBITRARY_MAX_ELEMENTS && reader.advance());
- }
- checkpointMark = reader.getCheckpointMark();
- checkpointMark.finalizeCheckpoint();
- // TODO: When exercising create initial splits, make this the minimum watermark across all
- // existing readers
- StepTransformResult result =
- StepTransformResult.withHold(transform, reader.getWatermark())
- .addOutput(output)
- .build();
- evaluatorQueue.offer(this);
- return result;
- }
- }
-
- private <CheckpointMarkT extends CheckpointMark> UnboundedReader<OutputT> createReader(
- UnboundedSource<OutputT, CheckpointMarkT> source, PipelineOptions options) {
- @SuppressWarnings("unchecked")
- CheckpointMarkT mark = (CheckpointMarkT) checkpointMark;
- return source.createReader(options, mark);
- }
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ViewEvaluatorFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ViewEvaluatorFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ViewEvaluatorFactory.java
deleted file mode 100644
index dd2bfb1..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/ViewEvaluatorFactory.java
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.coders.KvCoder;
-import com.google.cloud.dataflow.sdk.coders.VoidCoder;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.PCollectionViewWriter;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.Values;
-import com.google.cloud.dataflow.sdk.transforms.View.CreatePCollectionView;
-import com.google.cloud.dataflow.sdk.transforms.WithKeys;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * The {@link InProcessPipelineRunner} {@link TransformEvaluatorFactory} for the
- * {@link CreatePCollectionView} primitive {@link PTransform}.
- *
- * <p>The {@link ViewEvaluatorFactory} produces {@link TransformEvaluator TransformEvaluators} for
- * the {@link WriteView} {@link PTransform}, which is part of the
- * {@link InProcessCreatePCollectionView} composite transform. This transform is an override for the
- * {@link CreatePCollectionView} transform that applies windowing and triggers before the view is
- * written.
- */
-class ViewEvaluatorFactory implements TransformEvaluatorFactory {
- @Override
- public <T> TransformEvaluator<T> forApplication(
- AppliedPTransform<?, ?, ?> application,
- InProcessPipelineRunner.CommittedBundle<?> inputBundle,
- InProcessEvaluationContext evaluationContext) {
- @SuppressWarnings({"cast", "unchecked", "rawtypes"})
- TransformEvaluator<T> evaluator = (TransformEvaluator<T>) createEvaluator(
- (AppliedPTransform) application, evaluationContext);
- return evaluator;
- }
-
- private <InT, OuT> TransformEvaluator<Iterable<InT>> createEvaluator(
- final AppliedPTransform<PCollection<Iterable<InT>>, PCollectionView<OuT>, WriteView<InT, OuT>>
- application,
- InProcessEvaluationContext context) {
- PCollection<Iterable<InT>> input = application.getInput();
- final PCollectionViewWriter<InT, OuT> writer =
- context.createPCollectionViewWriter(input, application.getOutput());
- return new TransformEvaluator<Iterable<InT>>() {
- private final List<WindowedValue<InT>> elements = new ArrayList<>();
-
- @Override
- public void processElement(WindowedValue<Iterable<InT>> element) {
- for (InT input : element.getValue()) {
- elements.add(element.withValue(input));
- }
- }
-
- @Override
- public InProcessTransformResult finishBundle() {
- writer.add(elements);
- return StepTransformResult.withoutHold(application).build();
- }
- };
- }
-
- /**
- * An in-process override for {@link CreatePCollectionView}.
- */
- public static class InProcessCreatePCollectionView<ElemT, ViewT>
- extends PTransform<PCollection<ElemT>, PCollectionView<ViewT>> {
- private final CreatePCollectionView<ElemT, ViewT> og;
-
- private InProcessCreatePCollectionView(CreatePCollectionView<ElemT, ViewT> og) {
- this.og = og;
- }
-
- @Override
- public PCollectionView<ViewT> apply(PCollection<ElemT> input) {
- return input.apply(WithKeys.<Void, ElemT>of((Void) null))
- .setCoder(KvCoder.of(VoidCoder.of(), input.getCoder()))
- .apply(GroupByKey.<Void, ElemT>create())
- .apply(Values.<Iterable<ElemT>>create())
- .apply(new WriteView<ElemT, ViewT>(og));
- }
- }
-
- /**
- * An in-process implementation of the {@link CreatePCollectionView} primitive.
- *
- * This implementation requires the input {@link PCollection} to be an iterable, which is provided
- * to {@link PCollectionView#fromIterableInternal(Iterable)}.
- */
- public static final class WriteView<ElemT, ViewT>
- extends PTransform<PCollection<Iterable<ElemT>>, PCollectionView<ViewT>> {
- private final CreatePCollectionView<ElemT, ViewT> og;
-
- WriteView(CreatePCollectionView<ElemT, ViewT> og) {
- this.og = og;
- }
-
- @Override
- public PCollectionView<ViewT> apply(PCollection<Iterable<ElemT>> input) {
- return og.getView();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/WatermarkCallbackExecutor.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/WatermarkCallbackExecutor.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/WatermarkCallbackExecutor.java
deleted file mode 100644
index 27d59b9..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/WatermarkCallbackExecutor.java
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.common.collect.ComparisonChain;
-import com.google.common.collect.Ordering;
-
-import org.joda.time.Instant;
-
-import java.util.PriorityQueue;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-
-/**
- * Executes callbacks that occur based on the progression of the watermark per-step.
- *
- * <p>Callbacks are registered by calls to
- * {@link #callOnGuaranteedFiring(AppliedPTransform, BoundedWindow, WindowingStrategy, Runnable)},
- * and are executed after a call to {@link #fireForWatermark(AppliedPTransform, Instant)} with the
- * same {@link AppliedPTransform} and a watermark sufficient to guarantee that the trigger of the
- * windowing strategy has fired for the registered window.
- *
- * <p>NOTE: {@link WatermarkCallbackExecutor} does not track the latest observed watermark for any
- * {@link AppliedPTransform} - any call to
- * {@link #callOnGuaranteedFiring(AppliedPTransform, BoundedWindow, WindowingStrategy, Runnable)}
- * that could have potentially already fired should be followed by a call to
- * {@link #fireForWatermark(AppliedPTransform, Instant)} for the same transform with the current
- * value of the watermark.
- */
-class WatermarkCallbackExecutor {
- /**
- * Create a new {@link WatermarkCallbackExecutor}.
- */
- public static WatermarkCallbackExecutor create() {
- return new WatermarkCallbackExecutor();
- }
-
- private final ConcurrentMap<AppliedPTransform<?, ?, ?>, PriorityQueue<WatermarkCallback>>
- callbacks; // Pending callbacks per step, ordered by CallbackOrdering.
- private final ExecutorService executor; // Runs the callbacks released by fireForWatermark.
-
- private WatermarkCallbackExecutor() {
- this.callbacks = new ConcurrentHashMap<>();
- this.executor = Executors.newSingleThreadExecutor(); // NOTE(review): never shut down in this class; confirm who owns the thread's lifetime.
- }
-
- /**
- * Execute the provided {@link Runnable} after the next call to
- * {@link #fireForWatermark(AppliedPTransform, Instant)} where the window is guaranteed to have
- * produced output.
- */
- public void callOnGuaranteedFiring(
- AppliedPTransform<?, ?, ?> step,
- BoundedWindow window,
- WindowingStrategy<?, ?> windowingStrategy,
- Runnable runnable) {
- WatermarkCallback callback =
- WatermarkCallback.onGuaranteedFiring(window, windowingStrategy, runnable);
-
- PriorityQueue<WatermarkCallback> callbackQueue = callbacks.get(step);
- if (callbackQueue == null) {
- callbackQueue = new PriorityQueue<>(11, new CallbackOrdering()); // 11 matches PriorityQueue's documented default capacity.
- if (callbacks.putIfAbsent(step, callbackQueue) != null) {
- callbackQueue = callbacks.get(step); // Another thread installed a queue first; use that one.
- }
- }
-
- synchronized (callbackQueue) { // PriorityQueue is not thread-safe; every access holds the queue's monitor.
- callbackQueue.offer(callback);
- }
- }
-
- /**
- * Schedule all pending callbacks that must have produced output by the time of the provided
- * watermark.
- */
- public void fireForWatermark(AppliedPTransform<?, ?, ?> step, Instant watermark) {
- PriorityQueue<WatermarkCallback> callbackQueue = callbacks.get(step);
- if (callbackQueue == null) {
- return; // No callback was ever registered for this step.
- }
- synchronized (callbackQueue) {
- while (!callbackQueue.isEmpty() && callbackQueue.peek().shouldFire(watermark)) {
- executor.submit(callbackQueue.poll().getCallback()); // Runs asynchronously; the returned Future is dropped.
- }
- }
- }
- /** A callback paired with the watermark that must be passed before the callback may run. */
- private static class WatermarkCallback {
- public static <W extends BoundedWindow> WatermarkCallback onGuaranteedFiring(
- BoundedWindow window, WindowingStrategy<?, W> strategy, Runnable callback) {
- @SuppressWarnings("unchecked")
- Instant firingAfter =
- strategy.getTrigger().getSpec().getWatermarkThatGuaranteesFiring((W) window); // Unchecked cast of the window to W.
- return new WatermarkCallback(firingAfter, callback);
- }
-
- private final Instant fireAfter;
- private final Runnable callback;
-
- private WatermarkCallback(Instant fireAfter, Runnable callback) {
- this.fireAfter = fireAfter;
- this.callback = callback;
- }
- /** Returns true when the watermark is strictly after {@code fireAfter} or is the maximum timestamp. */
- public boolean shouldFire(Instant currentWatermark) {
- return currentWatermark.isAfter(fireAfter)
- || currentWatermark.equals(BoundedWindow.TIMESTAMP_MAX_VALUE);
- }
-
- public Runnable getCallback() {
- return callback;
- }
- }
- /** Orders callbacks by {@code fireAfter}, breaking ties with {@link Ordering#arbitrary()}. */
- private static class CallbackOrdering extends Ordering<WatermarkCallback> {
- @Override
- public int compare(WatermarkCallback left, WatermarkCallback right) {
- return ComparisonChain.start()
- .compare(left.fireAfter, right.fireAfter)
- .compare(left.callback, right.callback, Ordering.arbitrary())
- .result();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/package-info.java
deleted file mode 100644
index d1aa6af..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/package-info.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/**
- * Defines runners for executing Pipelines in different modes, including
- * {@link com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner} and
- * {@link com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner}.
- *
- * <p>{@link com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner} executes a {@code Pipeline}
- * locally, without contacting the Dataflow service.
- * {@link com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner} submits a
- * {@code Pipeline} to the Dataflow service, which executes it on Dataflow-managed Compute Engine
- * instances. {@code DataflowPipelineRunner} returns
- * as soon as the {@code Pipeline} has been submitted. Use
- * {@link com.google.cloud.dataflow.sdk.runners.BlockingDataflowPipelineRunner} to have execution
- * updates printed to the console.
- *
- * <p>The runner is specified as part of {@link com.google.cloud.dataflow.sdk.options.PipelineOptions}.
- */
-package com.google.cloud.dataflow.sdk.runners;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/worker/IsmFormat.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/worker/IsmFormat.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/worker/IsmFormat.java
deleted file mode 100644
index 318de9b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/worker/IsmFormat.java
+++ /dev/null
@@ -1,946 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.worker;
-
-import static com.google.cloud.dataflow.sdk.util.Structs.addLong;
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkNotNull;
-import static com.google.common.base.Preconditions.checkState;
-
-import com.google.cloud.dataflow.sdk.coders.AtomicCoder;
-import com.google.cloud.dataflow.sdk.coders.ByteArrayCoder;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.Coder.NonDeterministicException;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.ListCoder;
-import com.google.cloud.dataflow.sdk.coders.StandardCoder;
-import com.google.cloud.dataflow.sdk.coders.VarIntCoder;
-import com.google.cloud.dataflow.sdk.coders.VarLongCoder;
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.util.RandomAccessData;
-import com.google.cloud.dataflow.sdk.util.VarInt;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.common.base.MoreObjects;
-import com.google.common.base.MoreObjects.ToStringHelper;
-import com.google.common.base.Objects;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.ImmutableList;
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-import javax.annotation.Nullable;
-
-/**
- * An Ism file is a prefix encoded composite key value file broken into shards. Each composite
- * key is composed of a fixed number of component keys. A fixed number of those sub keys represent
- * the shard key portion; see {@link IsmRecord} and {@link IsmRecordCoder} for further details
- * around the data format. In addition to the data, there is a bloom filter,
- * and multiple indices to allow for efficient retrieval.
- *
- * <p>An Ism file is composed of these high level sections (in order):
- * <ul>
- * <li>shard block</li>
- * <li>bloom filter (See {@code ScalableBloomFilter} for details on encoding format)</li>
- * <li>shard index</li>
- * <li>footer (See {@link Footer} for details on encoding format)</li>
- * </ul>
- *
- * <p>The shard block is composed of multiple copies of the following:
- * <ul>
- * <li>data block</li>
- * <li>data index</li>
- * </ul>
- *
- * <p>The data block is composed of multiple copies of the following:
- * <ul>
- * <li>key prefix (See {@link KeyPrefix} for details on encoding format)</li>
- * <li>unshared key bytes</li>
- * <li>value bytes</li>
- * <li>optional 0x00 0x00 bytes followed by metadata bytes
- * (if the following 0x00 0x00 bytes are not present, then there are no metadata bytes)</li>
- * </ul>
- * Each key written into the data block must be in unsigned lexicographically increasing order
- * and also its shard portion of the key must hash to the same shard id as all other keys
- * within the same data block. The hashing function used is the
- * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
- * 32-bit murmur3 algorithm, x86 variant</a> (little-endian variant),
- * using {@code 1225801234} as the seed value.
- *
- * <p>The data index is composed of {@code N} copies of the following:
- * <ul>
- * <li>key prefix (See {@link KeyPrefix} for details on encoding format)</li>
- * <li>unshared key bytes</li>
- * <li>byte offset to key prefix in data block (variable length long coding)</li>
- * </ul>
- *
- * <p>The shard index is composed of a {@link VarInt variable length integer} encoding representing
- * the number of shard index records followed by that many shard index records.
- * See {@link IsmShardCoder} for further details as to its encoding scheme.
- */
-public class IsmFormat {
- private static final int HASH_SEED = 1225801234;
- private static final HashFunction HASH_FUNCTION = Hashing.murmur3_32(HASH_SEED);
- static final int SHARD_BITS = 0x7F; // [0-127] shards + [128-255] metadata shards
-
- /**
- * A record containing a composite key and either a value or metadata. The composite key
- * must not contain the metadata key component place holder if producing a value record, and must
- * contain the metadata component key place holder if producing a metadata record.
- *
- * <p>The composite key is a fixed number of component keys where the first {@code N} component
- * keys are used to create a shard id via hashing. See {@link IsmRecordCoder#hash(List)} for
- * further details.
- */
- public static class IsmRecord<V> {
- /** Returns an IsmRecord with the specified key components and value. */
- public static <V> IsmRecord<V> of(List<?> keyComponents, V value) {
- checkNotNull(keyComponents);
- checkArgument(!keyComponents.isEmpty(), "Expected non-empty list of key components.");
- checkArgument(!isMetadataKey(keyComponents),
- "Expected key components to not contain metadata key.");
- return new IsmRecord<>(keyComponents, value, null);
- }
-
- public static <V> IsmRecord<V> meta(List<?> keyComponents, byte[] metadata) {
- checkNotNull(keyComponents);
- checkNotNull(metadata);
- checkArgument(!keyComponents.isEmpty(), "Expected non-empty list of key components.");
- checkArgument(isMetadataKey(keyComponents),
- "Expected key components to contain metadata key.");
- return new IsmRecord<V>(keyComponents, null, metadata);
- }
-
- private final List<?> keyComponents;
- @Nullable
- private final V value;
- @Nullable
- private final byte[] metadata;
- private IsmRecord(List<?> keyComponents, V value, byte[] metadata) {
- this.keyComponents = keyComponents;
- this.value = value;
- this.metadata = metadata;
- }
-
- /** Returns the list of key components. */
- public List<?> getKeyComponents() {
- return keyComponents;
- }
-
- /** Returns the key component at the specified index. */
- public Object getKeyComponent(int index) {
- return keyComponents.get(index);
- }
-
- /**
- * Returns the value. Throws {@link IllegalStateException} if this is not a
- * value record.
- */
- public V getValue() {
- checkState(!isMetadataKey(keyComponents),
- "This is a metadata record and not a value record.");
- return value;
- }
-
- /**
- * Returns the metadata. Throws {@link IllegalStateException} if this is not a
- * metadata record.
- */
- public byte[] getMetadata() {
- checkState(isMetadataKey(keyComponents),
- "This is a value record and not a metadata record.");
- return metadata;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (!(obj instanceof IsmRecord)) {
- return false;
- }
- IsmRecord<?> other = (IsmRecord<?>) obj;
- return Objects.equal(keyComponents, other.keyComponents)
- && Objects.equal(value, other.value)
- && Arrays.equals(metadata, other.metadata);
- }
-
- @Override
- public int hashCode() {
- return Objects.hashCode(keyComponents, value, Arrays.hashCode(metadata));
- }
-
- @Override
- public String toString() {
- ToStringHelper builder = MoreObjects.toStringHelper(IsmRecord.class)
- .add("keyComponents", keyComponents);
- if (isMetadataKey(keyComponents)) {
- builder.add("metadata", metadata);
- } else {
- builder.add("value", value);
- }
- return builder.toString();
- }
- }
-
- /** A {@link Coder} for {@link IsmRecord}s.
- *
- * <p>Note that this coder standalone will not produce an Ism file. This coder can be used
- * to materialize a {@link PCollection} of {@link IsmRecord}s. Only when this coder
- * is combined with an {@link IsmSink} will one produce an Ism file.
- *
- * <p>The {@link IsmRecord} encoded format is:
- * <ul>
- * <li>encoded key component 1 using key component coder 1</li>
- * <li>...</li>
- * <li>encoded key component N using key component coder N</li>
- * <li>encoded value using value coder</li>
- * </ul>
- */
- public static class IsmRecordCoder<V>
- extends StandardCoder<IsmRecord<V>> {
- /** Returns an IsmRecordCoder with the specified shard key coder counts, key component coders, and value coder. */
- public static <V> IsmRecordCoder<V> of(
- int numberOfShardKeyCoders,
- int numberOfMetadataShardKeyCoders,
- List<Coder<?>> keyComponentCoders,
- Coder<V> valueCoder) {
- checkNotNull(keyComponentCoders);
- checkArgument(keyComponentCoders.size() > 0);
- checkArgument(numberOfShardKeyCoders > 0);
- checkArgument(numberOfShardKeyCoders <= keyComponentCoders.size());
- checkArgument(numberOfMetadataShardKeyCoders <= keyComponentCoders.size()); // NOTE(review): no lower bound is checked, unlike numberOfShardKeyCoders; confirm negative values cannot reach here.
- return new IsmRecordCoder<>(
- numberOfShardKeyCoders,
- numberOfMetadataShardKeyCoders,
- keyComponentCoders,
- valueCoder);
- }
-
- /**
- * Returns an IsmRecordCoder using the last of {@code components} as the value coder and the rest
- * as key component coders. Not meant to be called by users; used by Jackson to decode this coder.
- */
- @JsonCreator
- public static IsmRecordCoder<?> of(
- @JsonProperty(PropertyNames.NUM_SHARD_CODERS) int numberOfShardCoders,
- @JsonProperty(PropertyNames.NUM_METADATA_SHARD_CODERS) int numberOfMetadataShardCoders,
- @JsonProperty(PropertyNames.COMPONENT_ENCODINGS) List<Coder<?>> components) {
- Preconditions.checkArgument(components.size() >= 2,
- "Expecting at least 2 components, got " + components.size());
- return of(
- numberOfShardCoders,
- numberOfMetadataShardCoders,
- components.subList(0, components.size() - 1), // Key component coders.
- components.get(components.size() - 1)); // Value coder.
- }
-
- private final int numberOfShardKeyCoders;
- private final int numberOfMetadataShardKeyCoders;
- private final List<Coder<?>> keyComponentCoders;
- private final Coder<V> valueCoder;
-
- private IsmRecordCoder(
- int numberOfShardKeyCoders,
- int numberOfMetadataShardKeyCoders,
- List<Coder<?>> keyComponentCoders, Coder<V> valueCoder) {
- this.numberOfShardKeyCoders = numberOfShardKeyCoders;
- this.numberOfMetadataShardKeyCoders = numberOfMetadataShardKeyCoders;
- this.keyComponentCoders = keyComponentCoders;
- this.valueCoder = valueCoder;
- }
-
- /** Returns the list of key component coders. */
- public List<Coder<?>> getKeyComponentCoders() {
- return keyComponentCoders;
- }
-
- /** Returns the key coder at the specified index. */
- public Coder getKeyComponentCoder(int index) {
- return keyComponentCoders.get(index);
- }
-
- /** Returns the value coder. */
- public Coder<V> getValueCoder() {
- return valueCoder;
- }
-
- @Override
- public void encode(IsmRecord<V> value, OutputStream outStream,
- Coder.Context context) throws CoderException, IOException {
- if (value.getKeyComponents().size() != keyComponentCoders.size()) {
- throw new CoderException(String.format(
- "Expected %s key component(s) but received key component(s) %s.",
- keyComponentCoders.size(), value.getKeyComponents()));
- }
- for (int i = 0; i < keyComponentCoders.size(); ++i) {
- getKeyComponentCoder(i).encode(value.getKeyComponent(i), outStream, context.nested());
- }
- if (isMetadataKey(value.getKeyComponents())) {
- ByteArrayCoder.of().encode(value.getMetadata(), outStream, context.nested());
- } else {
- valueCoder.encode(value.getValue(), outStream, context.nested());
- }
- }
-
- @Override
- public IsmRecord<V> decode(InputStream inStream, Coder.Context context)
- throws CoderException, IOException {
- List<Object> keyComponents = new ArrayList<>(keyComponentCoders.size());
- for (Coder<?> keyCoder : keyComponentCoders) {
- keyComponents.add(keyCoder.decode(inStream, context.nested()));
- }
- if (isMetadataKey(keyComponents)) {
- return IsmRecord.<V>meta(
- keyComponents, ByteArrayCoder.of().decode(inStream, context.nested()));
- } else {
- return IsmRecord.<V>of(keyComponents, valueCoder.decode(inStream, context.nested()));
- }
- }
-
- int getNumberOfShardKeyCoders(List<?> keyComponents) {
- if (isMetadataKey(keyComponents)) {
- return numberOfMetadataShardKeyCoders;
- } else {
- return numberOfShardKeyCoders;
- }
- }
-
- /**
- * Computes the shard id for the given key component(s).
- *
- * The shard keys are encoded into their byte representations and hashed using the
- * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
- * 32-bit murmur3 algorithm, x86 variant</a> (little-endian variant),
- * using {@code 1225801234} as the seed value. We ensure that shard ids for
- * metadata keys and normal keys do not overlap.
- */
- public <V, T> int hash(List<?> keyComponents) {
- return encodeAndHash(keyComponents, new RandomAccessData(), new ArrayList<Integer>());
- }
-
- /**
- * Computes the shard id for the given key component(s).
- *
- * <p>Mutates {@code keyBytesToMutate} such that, when this method returns, it contains the
- * encoded version of the key components.
- */
- <V, T> int encodeAndHash(List<?> keyComponents, RandomAccessData keyBytesToMutate) {
- return encodeAndHash(keyComponents, keyBytesToMutate, new ArrayList<Integer>()); // The byte offsets are discarded.
- }
-
- /**
- * Computes the shard id for the given key component(s).
- *
- * <p>Mutates {@code keyBytesToMutate} such that, when this method returns, it contains the
- * encoded version of the key components. Also mutates {@code keyComponentByteOffsetsToMutate} to
- * store the location where each key component's encoded byte representation ends within
- * {@code keyBytesToMutate}.
- */
- <V, T> int encodeAndHash(
- List<?> keyComponents,
- RandomAccessData keyBytesToMutate,
- List<Integer> keyComponentByteOffsetsToMutate) {
- checkNotNull(keyComponents);
- checkArgument(keyComponents.size() <= keyComponentCoders.size(),
- "Expected at most %s key component(s) but received %s.",
- keyComponentCoders.size(), keyComponents);
-
- final int numberOfKeyCodersToUse;
- final int shardOffset;
- if (isMetadataKey(keyComponents)) {
- numberOfKeyCodersToUse = numberOfMetadataShardKeyCoders;
- shardOffset = SHARD_BITS + 1; // Metadata shard ids start right after the value shard id range.
- } else {
- numberOfKeyCodersToUse = numberOfShardKeyCoders;
- shardOffset = 0;
- }
-
- checkArgument(numberOfKeyCodersToUse <= keyComponents.size(),
- "Expected at least %s key component(s) but received %s.",
- numberOfShardKeyCoders, keyComponents); // NOTE(review): the check uses numberOfKeyCodersToUse, so for metadata keys this message may report the wrong count.
-
- try {
- // Encode the shard portion
- for (int i = 0; i < numberOfKeyCodersToUse; ++i) {
- getKeyComponentCoder(i).encode(
- keyComponents.get(i), keyBytesToMutate.asOutputStream(), Context.NESTED);
- keyComponentByteOffsetsToMutate.add(keyBytesToMutate.size());
- }
- int rval = HASH_FUNCTION.hashBytes( // Hashes all bytes currently held in keyBytesToMutate.
- keyBytesToMutate.array(), 0, keyBytesToMutate.size()).asInt() & SHARD_BITS;
- rval += shardOffset;
-
- // Encode the remainder
- for (int i = numberOfKeyCodersToUse; i < keyComponents.size(); ++i) {
- getKeyComponentCoder(i).encode(
- keyComponents.get(i), keyBytesToMutate.asOutputStream(), Context.NESTED);
- keyComponentByteOffsetsToMutate.add(keyBytesToMutate.size());
- }
- return rval;
- } catch (IOException e) {
- throw new IllegalStateException(
- String.format("Failed to hash %s with coder %s", keyComponents, this), e);
- }
- }
-
- @Override
- public List<Coder<?>> getCoderArguments() {
- return ImmutableList.<Coder<?>>builder()
- .addAll(keyComponentCoders)
- .add(valueCoder)
- .build();
- }
-
- @Override
- public CloudObject asCloudObject() {
- CloudObject cloudObject = super.asCloudObject();
- addLong(cloudObject, PropertyNames.NUM_SHARD_CODERS, numberOfShardKeyCoders);
- addLong(cloudObject, PropertyNames.NUM_METADATA_SHARD_CODERS, numberOfMetadataShardKeyCoders);
- return cloudObject;
- }
-
- @Override
- public void verifyDeterministic() throws Coder.NonDeterministicException {
- verifyDeterministic("Key component coders expected to be deterministic.", keyComponentCoders);
- verifyDeterministic("Value coder expected to be deterministic.", valueCoder);
- }
-
- @Override
- public boolean consistentWithEquals() {
- for (Coder<?> keyComponentCoder : keyComponentCoders) {
- if (!keyComponentCoder.consistentWithEquals()) {
- return false;
- }
- }
- return valueCoder.consistentWithEquals();
- }
-
- @Override
- public Object structuralValue(IsmRecord<V> record) throws Exception {
- checkState(record.getKeyComponents().size() == keyComponentCoders.size(),
- "Expected the number of key component coders %s "
- + "to match the number of key components %s.",
- keyComponentCoders.size(), record.getKeyComponents());
- // NOTE(review): record is dereferenced above, so the null test below can never be false.
- if (record != null && consistentWithEquals()) {
- ArrayList<Object> keyComponentStructuralValues = new ArrayList<>();
- for (int i = 0; i < keyComponentCoders.size(); ++i) {
- keyComponentStructuralValues.add(
- getKeyComponentCoder(i).structuralValue(record.getKeyComponent(i)));
- }
- if (isMetadataKey(record.getKeyComponents())) {
- return IsmRecord.meta(keyComponentStructuralValues, record.getMetadata()); // Metadata bytes are passed through unchanged.
- } else {
- return IsmRecord.of(keyComponentStructuralValues,
- valueCoder.structuralValue(record.getValue()));
- }
- }
- return super.structuralValue(record); // Some coder is not consistent with equals; defer to the superclass.
- }
- }
-
- /**
- * Validates that the key portion of the given coder is deterministic.
- */
- static void validateCoderIsCompatible(IsmRecordCoder<?> coder) {
- for (Coder<?> keyComponentCoder : coder.getKeyComponentCoders()) {
- try {
- keyComponentCoder.verifyDeterministic();
- } catch (NonDeterministicException e) {
- throw new IllegalArgumentException(
- String.format("Key component coder %s is expected to be deterministic.",
- keyComponentCoder), e);
- }
- }
- }
-
- /** Returns true if and only if any of the passed in key components is the metadata key marker. */
- public static boolean isMetadataKey(List<?> keyComponents) {
- for (Object keyComponent : keyComponents) {
- if (keyComponent == METADATA_KEY) { // Reference comparison; METADATA_KEY's equals is identity-based too.
- return true;
- }
- }
- return false;
- }
-
- /** A marker object representing the wildcard metadata key component. */
- private static final Object METADATA_KEY = new Object() {
- @Override
- public String toString() {
- return "META";
- }
-
- @Override
- public boolean equals(Object obj) {
- return this == obj;
- }
-
- @Override
- public int hashCode() {
- return -1248902349;
- }
- };
-
- /**
- * An object representing a wild card for a key component.
- * Encoded using {@link MetadataKeyCoder}.
- */
- public static Object getMetadataKey() {
- return METADATA_KEY;
- }
-
- /**
- * A coder for metadata key component. Can be used to wrap key component coder allowing for
- * the metadata key component to be used as a place holder instead of an actual key.
- */
- public static class MetadataKeyCoder<K> extends StandardCoder<K> {
- public static <K> MetadataKeyCoder<K> of(Coder<K> keyCoder) {
- checkNotNull(keyCoder);
- return new MetadataKeyCoder<>(keyCoder);
- }
-
- /**
- * Returns a MetadataKeyCoder wrapping the single coder in {@code components}. Note that this
- * method is not meant to be called by users but used by Jackson when decoding this coder.
- */
- @JsonCreator
- public static MetadataKeyCoder<?> of(
- @JsonProperty(PropertyNames.COMPONENT_ENCODINGS) List<Coder<?>> components) {
- Preconditions.checkArgument(components.size() == 1,
- "Expecting one component, got " + components.size());
- return of(components.get(0));
- }
-
- private final Coder<K> keyCoder;
-
- private MetadataKeyCoder(Coder<K> keyCoder) {
- this.keyCoder = keyCoder;
- }
-
- public Coder<K> getKeyCoder() {
- return keyCoder;
- }
-
- @Override
- public void encode(K value, OutputStream outStream, Coder.Context context)
- throws CoderException, IOException {
- if (value == METADATA_KEY) {
- outStream.write(0); // Marker byte 0: the metadata key placeholder; nothing else is written.
- } else {
- outStream.write(1); // Marker byte 1: a key encoded by the wrapped coder follows.
- keyCoder.encode(value, outStream, context.nested());
- }
- }
-
- @Override
- public K decode(InputStream inStream, Coder.Context context)
- throws CoderException, IOException {
- int marker = inStream.read(); // InputStream.read() yields -1 at end of stream, which reaches the error branch below.
- if (marker == 0) {
- return (K) getMetadataKey(); // Unchecked cast: the marker object is returned whatever K is.
- } else if (marker == 1) {
- return keyCoder.decode(inStream, context.nested());
- } else {
- throw new CoderException(String.format("Expected marker but got %s.", marker));
- }
- }
-
- @Override
- public List<Coder<?>> getCoderArguments() {
- return ImmutableList.<Coder<?>>of(keyCoder);
- }
-
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- verifyDeterministic("Expected key coder to be deterministic", keyCoder);
- }
- }
-
- /**
- * A shard descriptor containing shard id, the data block offset, and the index offset for the
- * given shard.
- */
- public static class IsmShard {
- private final int id;
- private final long blockOffset;
- private final long indexOffset;
-
- /** Returns an IsmShard with the given id, block offset and no index offset. */
- public static IsmShard of(int id, long blockOffset) {
- IsmShard ismShard = new IsmShard(id, blockOffset, -1); // -1 marks the index offset as unspecified; getIndexOffset() rejects it.
- checkState(id >= 0, // The instance is built before validation so it can appear in the failure message.
- "%s attempting to be written with negative shard id.",
- ismShard);
- checkState(blockOffset >= 0,
- "%s attempting to be written with negative block offset.",
- ismShard);
- return ismShard;
- }
-
- /** Returns an IsmShard with the given id, block offset, and index offset. */
- public static IsmShard of(int id, long blockOffset, long indexOffset) {
- IsmShard ismShard = new IsmShard(id, blockOffset, indexOffset);
- checkState(id >= 0,
- "%s attempting to be written with negative shard id.",
- ismShard);
- checkState(blockOffset >= 0,
- "%s attempting to be written with negative block offset.",
- ismShard);
- checkState(indexOffset >= 0,
- "%s attempting to be written with negative index offset.",
- ismShard);
- return ismShard;
- }
-
- private IsmShard(int id, long blockOffset, long indexOffset) {
- this.id = id;
- this.blockOffset = blockOffset;
- this.indexOffset = indexOffset;
- }
-
- /** Return the shard id. */
- public int getId() {
- return id;
- }
-
- /** Return the absolute position within the Ism file where the data block begins. */
- public long getBlockOffset() {
- return blockOffset;
- }
-
- /**
- * Return the absolute position within the Ism file where the index block begins.
- * Throws {@link IllegalStateException} if the index offset was never specified.
- */
- public long getIndexOffset() {
- checkState(indexOffset >= 0,
- "Unable to fetch index offset because it was never specified.");
- return indexOffset;
- }
-
- /** Returns a new IsmShard like this one with the specified index offset. */
- public IsmShard withIndexOffset(long indexOffset) {
- return of(id, blockOffset, indexOffset);
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(IsmShard.class)
- .add("id", id)
- .add("blockOffset", blockOffset)
- .add("indexOffset", indexOffset)
- .toString();
- }
-
- @Override
- public boolean equals(Object obj) {
- if (!(obj instanceof IsmShard)) {
- return false;
- }
- IsmShard other = (IsmShard) obj;
- return Objects.equal(id, other.id)
- && Objects.equal(blockOffset, other.blockOffset)
- && Objects.equal(indexOffset, other.indexOffset);
- }
-
- @Override
- public int hashCode() {
- return Objects.hashCode(id, blockOffset, indexOffset);
- }
- }
-
- /**
- * A {@link ListCoder} wrapping a {@link IsmShardCoder} used to encode the shard index.
- * See {@link ListCoder} for its encoding specification and {@link IsmShardCoder} for its
- * encoding specification.
- */
- public static final Coder<List<IsmShard>> ISM_SHARD_INDEX_CODER =
- ListCoder.of(IsmShardCoder.of());
-
- /**
- * A coder for {@link IsmShard}s.
- *
- * The shard descriptor is encoded as:
- * <ul>
- * <li>id (variable length integer encoding)</li>
- * <li>blockOffset (variable length long encoding)</li>
- * <li>indexOffset (variable length long encoding)</li>
- * </ul>
- */
- public static class IsmShardCoder extends AtomicCoder<IsmShard> {
- private static final IsmShardCoder INSTANCE = new IsmShardCoder();
-
- /** Returns an IsmShardCoder. */
- @JsonCreator
- public static IsmShardCoder of() {
- return INSTANCE;
- }
-
- private IsmShardCoder() {
- }
-
- @Override
- public void encode(IsmShard value, OutputStream outStream, Coder.Context context)
- throws CoderException, IOException {
- checkState(value.getIndexOffset() >= 0,
- "%s attempting to be written without index offset.",
- value);
- VarIntCoder.of().encode(value.getId(), outStream, context.nested());
- VarLongCoder.of().encode(value.getBlockOffset(), outStream, context.nested());
- VarLongCoder.of().encode(value.getIndexOffset(), outStream, context.nested());
- }
-
- @Override
- public IsmShard decode(
- InputStream inStream, Coder.Context context) throws CoderException, IOException {
- return IsmShard.of(
- VarIntCoder.of().decode(inStream, context),
- VarLongCoder.of().decode(inStream, context),
- VarLongCoder.of().decode(inStream, context));
- }
-
- @Override
- public boolean consistentWithEquals() {
- return true;
- }
- }
-
- /**
- * The prefix used before each key which contains the number of shared and unshared
- * bytes from the previous key that was read. The key prefix along with the previous key
- * and the unshared key bytes allows one to construct the current key by doing the following
- * {@code currentKey = previousKey[0 : sharedBytes] + read(unsharedBytes)}.
- *
- * <p>The key prefix is encoded as:
- * <ul>
- * <li>number of shared key bytes (variable length integer coding)</li>
- * <li>number of unshared key bytes (variable length integer coding)</li>
- * </ul>
- */
- static class KeyPrefix {
- private final int sharedKeySize;
- private final int unsharedKeySize;
-
- KeyPrefix(int sharedBytes, int unsharedBytes) {
- this.sharedKeySize = sharedBytes;
- this.unsharedKeySize = unsharedBytes;
- }
-
- public int getSharedKeySize() {
- return sharedKeySize;
- }
-
- public int getUnsharedKeySize() {
- return unsharedKeySize;
- }
-
- @Override
- public int hashCode() {
- return Objects.hashCode(sharedKeySize, unsharedKeySize);
- }
-
- @Override
- public boolean equals(Object other) {
- if (other == this) {
- return true;
- }
- if (!(other instanceof KeyPrefix)) {
- return false;
- }
- KeyPrefix keyPrefix = (KeyPrefix) other;
- return sharedKeySize == keyPrefix.sharedKeySize
- && unsharedKeySize == keyPrefix.unsharedKeySize;
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(this)
- .add("sharedKeySize", sharedKeySize)
- .add("unsharedKeySize", unsharedKeySize)
- .toString();
- }
- }
-
- /** A {@link Coder} for {@link KeyPrefix}. */
- static final class KeyPrefixCoder extends AtomicCoder<KeyPrefix> {
- private static final KeyPrefixCoder INSTANCE = new KeyPrefixCoder();
-
- @JsonCreator
- public static KeyPrefixCoder of() {
- return INSTANCE;
- }
-
- @Override
- public void encode(KeyPrefix value, OutputStream outStream, Coder.Context context)
- throws CoderException, IOException {
- VarInt.encode(value.sharedKeySize, outStream);
- VarInt.encode(value.unsharedKeySize, outStream);
- }
-
- @Override
- public KeyPrefix decode(InputStream inStream, Coder.Context context)
- throws CoderException, IOException {
- return new KeyPrefix(VarInt.decodeInt(inStream), VarInt.decodeInt(inStream));
- }
-
- @Override
- public boolean consistentWithEquals() {
- return true;
- }
-
- @Override
- public boolean isRegisterByteSizeObserverCheap(KeyPrefix value, Coder.Context context) {
- return true;
- }
-
- @Override
- protected long getEncodedElementByteSize(KeyPrefix value, Coder.Context context)
- throws Exception {
- Preconditions.checkNotNull(value);
- return VarInt.getLength(value.sharedKeySize) + VarInt.getLength(value.unsharedKeySize);
- }
- }
-
- /**
- * The footer stores the relevant information required to locate the index and bloom filter.
- * It also stores a version byte and the number of keys stored.
- *
- * <p>The footer is encoded as the value containing:
- * <ul>
- * <li>start of bloom filter offset (big endian long coding)</li>
- * <li>start of shard index position offset (big endian long coding)</li>
- * <li>number of keys in file (big endian long coding)</li>
- * <li>0x01 (version key as a single byte)</li>
- * </ul>
- */
- static class Footer {
- static final int LONG_BYTES = 8;
- static final int FIXED_LENGTH = 3 * LONG_BYTES + 1;
- static final byte VERSION = 2;
-
- private final long indexPosition;
- private final long bloomFilterPosition;
- private final long numberOfKeys;
-
- Footer(long indexPosition, long bloomFilterPosition, long numberOfKeys) {
- this.indexPosition = indexPosition;
- this.bloomFilterPosition = bloomFilterPosition;
- this.numberOfKeys = numberOfKeys;
- }
-
- public long getIndexPosition() {
- return indexPosition;
- }
-
- public long getBloomFilterPosition() {
- return bloomFilterPosition;
- }
-
- public long getNumberOfKeys() {
- return numberOfKeys;
- }
-
- @Override
- public boolean equals(Object other) {
- if (other == this) {
- return true;
- }
- if (!(other instanceof Footer)) {
- return false;
- }
- Footer footer = (Footer) other;
- return indexPosition == footer.indexPosition
- && bloomFilterPosition == footer.bloomFilterPosition
- && numberOfKeys == footer.numberOfKeys;
- }
-
- @Override
- public int hashCode() {
- return Objects.hashCode(indexPosition, bloomFilterPosition, numberOfKeys);
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(this)
- .add("version", Footer.VERSION)
- .add("indexPosition", indexPosition)
- .add("bloomFilterPosition", bloomFilterPosition)
- .add("numberOfKeys", numberOfKeys)
- .toString();
- }
- }
-
- /** A {@link Coder} for {@link Footer}. */
- static final class FooterCoder extends AtomicCoder<Footer> {
- private static final FooterCoder INSTANCE = new FooterCoder();
-
- @JsonCreator
- public static FooterCoder of() {
- return INSTANCE;
- }
-
- @Override
- public void encode(Footer value, OutputStream outStream, Coder.Context context)
- throws CoderException, IOException {
- DataOutputStream dataOut = new DataOutputStream(outStream);
- dataOut.writeLong(value.indexPosition);
- dataOut.writeLong(value.bloomFilterPosition);
- dataOut.writeLong(value.numberOfKeys);
- dataOut.write(Footer.VERSION);
- }
-
- @Override
- public Footer decode(InputStream inStream, Coder.Context context)
- throws CoderException, IOException {
- DataInputStream dataIn = new DataInputStream(inStream);
- Footer footer = new Footer(dataIn.readLong(), dataIn.readLong(), dataIn.readLong());
- int version = dataIn.read();
- if (version != Footer.VERSION) {
- throw new IOException("Unknown version " + version + ". "
- + "Only version 2 is currently supported.");
- }
- return footer;
- }
-
- @Override
- public boolean consistentWithEquals() {
- return true;
- }
-
- @Override
- public boolean isRegisterByteSizeObserverCheap(Footer value, Coder.Context context) {
- return true;
- }
-
- @Override
- protected long getEncodedElementByteSize(Footer value, Coder.Context context)
- throws Exception {
- return Footer.FIXED_LENGTH;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/worker/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/worker/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/worker/package-info.java
deleted file mode 100644
index af0a345..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/worker/package-info.java
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/**
- * Implementation of the harness that runs on each Google Compute Engine instance to coordinate
- * execution of Pipeline code.
- */
-@ParametersAreNonnullByDefault
-package com.google.cloud.dataflow.sdk.runners.worker;
-
-import javax.annotation.ParametersAreNonnullByDefault;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/CoderProperties.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/CoderProperties.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/CoderProperties.java
deleted file mode 100644
index 5705dc4..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/testing/CoderProperties.java
+++ /dev/null
@@ -1,349 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.testing;
-
-import static org.hamcrest.Matchers.contains;
-import static org.hamcrest.Matchers.containsInAnyOrder;
-import static org.hamcrest.Matchers.emptyIterable;
-import static org.hamcrest.Matchers.equalTo;
-import static org.hamcrest.Matchers.hasItem;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertThat;
-import static org.junit.Assert.fail;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.Coder.NonDeterministicException;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.util.CoderUtils;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.util.SerializableUtils;
-import com.google.cloud.dataflow.sdk.util.Serializer;
-import com.google.cloud.dataflow.sdk.util.Structs;
-import com.google.cloud.dataflow.sdk.util.UnownedInputStream;
-import com.google.cloud.dataflow.sdk.util.UnownedOutputStream;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.collect.Iterables;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-
-/**
- * Properties for use in {@link Coder} tests. These are implemented with junit assertions
- * rather than as predicates for the sake of error messages.
- *
- * <p>We serialize and deserialize the coder to make sure that any state information required by
- * the coder is preserved. This causes tests written such that coders that lose information during
- * serialization or change state during encoding/decoding will fail.
- */
-public class CoderProperties {
-
- /**
- * All the contexts, for use in test cases.
- */
- public static final List<Coder.Context> ALL_CONTEXTS = Arrays.asList(
- Coder.Context.OUTER, Coder.Context.NESTED);
-
- /**
- * Verifies that for the given {@code Coder<T>}, and values of
- * type {@code T}, if the values are equal then the encoded bytes are equal, in any
- * {@code Coder.Context}.
- */
- public static <T> void coderDeterministic(
- Coder<T> coder, T value1, T value2)
- throws Exception {
- for (Coder.Context context : ALL_CONTEXTS) {
- coderDeterministicInContext(coder, context, value1, value2);
- }
- }
-
- /**
- * Verifies that for the given {@code Coder<T>}, {@code Coder.Context}, and values of
- * type {@code T}, if the values are equal then the encoded bytes are equal.
- */
- public static <T> void coderDeterministicInContext(
- Coder<T> coder, Coder.Context context, T value1, T value2)
- throws Exception {
-
- try {
- coder.verifyDeterministic();
- } catch (NonDeterministicException e) {
- fail("Expected that the coder is deterministic");
- }
- assertThat("Expected that the passed in values are equal()", value1, equalTo(value2));
- assertThat(
- encode(coder, context, value1),
- equalTo(encode(coder, context, value2)));
- }
-
- /**
- * Verifies that for the given {@code Coder<T>},
- * and value of type {@code T}, encoding followed by decoding yields an
- * equal value of type {@code T}, in any {@code Coder.Context}.
- */
- public static <T> void coderDecodeEncodeEqual(
- Coder<T> coder, T value)
- throws Exception {
- for (Coder.Context context : ALL_CONTEXTS) {
- coderDecodeEncodeEqualInContext(coder, context, value);
- }
- }
-
- /**
- * Verifies that for the given {@code Coder<T>}, {@code Coder.Context},
- * and value of type {@code T}, encoding followed by decoding yields an
- * equal value of type {@code T}.
- */
- public static <T> void coderDecodeEncodeEqualInContext(
- Coder<T> coder, Coder.Context context, T value)
- throws Exception {
- assertThat(decodeEncode(coder, context, value), equalTo(value));
- }
-
- /**
- * Verifies that for the given {@code Coder<Collection<T>>},
- * and value of type {@code Collection<T>}, encoding followed by decoding yields an
- * equal value of type {@code Collection<T>}, in any {@code Coder.Context}.
- */
- public static <T, CollectionT extends Collection<T>> void coderDecodeEncodeContentsEqual(
- Coder<CollectionT> coder, CollectionT value)
- throws Exception {
- for (Coder.Context context : ALL_CONTEXTS) {
- coderDecodeEncodeContentsEqualInContext(coder, context, value);
- }
- }
-
- /**
- * Verifies that for the given {@code Coder<Collection<T>>},
- * and value of type {@code Collection<T>}, encoding followed by decoding yields an
- * equal value of type {@code Collection<T>}, in the given {@code Coder.Context}.
- */
- @SuppressWarnings("unchecked")
- public static <T, CollectionT extends Collection<T>> void coderDecodeEncodeContentsEqualInContext(
- Coder<CollectionT> coder, Coder.Context context, CollectionT value)
- throws Exception {
- // Matchers.containsInAnyOrder() requires at least one element
- Collection<T> result = decodeEncode(coder, context, value);
- if (value.isEmpty()) {
- assertThat(result, emptyIterable());
- } else {
- // This is the only Matchers.containInAnyOrder() overload that takes literal values
- assertThat(result, containsInAnyOrder((T[]) value.toArray()));
- }
- }
-
- /**
- * Verifies that for the given {@code Coder<Collection<T>>},
- * and value of type {@code Collection<T>}, encoding followed by decoding yields an
- * equal value of type {@code Collection<T>}, in any {@code Coder.Context}.
- */
- public static <T, IterableT extends Iterable<T>> void coderDecodeEncodeContentsInSameOrder(
- Coder<IterableT> coder, IterableT value)
- throws Exception {
- for (Coder.Context context : ALL_CONTEXTS) {
- CoderProperties.<T, IterableT>coderDecodeEncodeContentsInSameOrderInContext(
- coder, context, value);
- }
- }
-
- /**
- * Verifies that for the given {@code Coder<Iterable<T>>},
- * and value of type {@code Iterable<T>}, encoding followed by decoding yields an
- * equal value of type {@code Collection<T>}, in the given {@code Coder.Context}.
- */
- @SuppressWarnings("unchecked")
- public static <T, IterableT extends Iterable<T>> void
- coderDecodeEncodeContentsInSameOrderInContext(
- Coder<IterableT> coder, Coder.Context context, IterableT value)
- throws Exception {
- Iterable<T> result = decodeEncode(coder, context, value);
- // Matchers.contains() requires at least one element
- if (Iterables.isEmpty(value)) {
- assertThat(result, emptyIterable());
- } else {
- // This is the only Matchers.contains() overload that takes literal values
- assertThat(result, contains((T[]) Iterables.toArray(value, Object.class)));
- }
- }
-
- public static <T> void coderSerializable(Coder<T> coder) {
- SerializableUtils.ensureSerializable(coder);
- }
-
- public static <T> void coderConsistentWithEquals(
- Coder<T> coder, T value1, T value2)
- throws Exception {
-
- for (Coder.Context context : ALL_CONTEXTS) {
- CoderProperties.<T>coderConsistentWithEqualsInContext(coder, context, value1, value2);
- }
- }
-
- public static <T> void coderConsistentWithEqualsInContext(
- Coder<T> coder, Coder.Context context, T value1, T value2) throws Exception {
-
- assertEquals(
- value1.equals(value2),
- Arrays.equals(
- encode(coder, context, value1),
- encode(coder, context, value2)));
- }
-
- public static <T> void coderHasEncodingId(Coder<T> coder, String encodingId) throws Exception {
- assertThat(coder.getEncodingId(), equalTo(encodingId));
- assertThat(Structs.getString(coder.asCloudObject(), PropertyNames.ENCODING_ID, ""),
- equalTo(encodingId));
- }
-
- public static <T> void coderAllowsEncoding(Coder<T> coder, String encodingId) throws Exception {
- assertThat(coder.getAllowedEncodings(), hasItem(encodingId));
- assertThat(
- String.format("Expected to find \"%s\" in property \"%s\" of %s",
- encodingId, PropertyNames.ALLOWED_ENCODINGS, coder.asCloudObject()),
- Structs.getStrings(
- coder.asCloudObject(),
- PropertyNames.ALLOWED_ENCODINGS,
- Collections.<String>emptyList()),
- hasItem(encodingId));
- }
-
- public static <T> void structuralValueConsistentWithEquals(
- Coder<T> coder, T value1, T value2)
- throws Exception {
-
- for (Coder.Context context : ALL_CONTEXTS) {
- CoderProperties.<T>structuralValueConsistentWithEqualsInContext(
- coder, context, value1, value2);
- }
- }
-
- public static <T> void structuralValueConsistentWithEqualsInContext(
- Coder<T> coder, Coder.Context context, T value1, T value2) throws Exception {
-
- assertEquals(
- coder.structuralValue(value1).equals(coder.structuralValue(value2)),
- Arrays.equals(
- encode(coder, context, value1),
- encode(coder, context, value2)));
- }
-
-
- private static final String DECODING_WIRE_FORMAT_MESSAGE =
- "Decoded value from known wire format does not match expected value."
- + " This probably means that this Coder no longer correctly decodes"
- + " a prior wire format. Changing the wire formats this Coder can read"
- + " should be avoided, as it is likely to cause breakage."
- + " If you truly intend to change the backwards compatibility for this Coder "
- + " then you must remove any now-unsupported encodings from getAllowedEncodings().";
-
- public static <T> void coderDecodesBase64(Coder<T> coder, String base64Encoding, T value)
- throws Exception {
- assertThat(DECODING_WIRE_FORMAT_MESSAGE, CoderUtils.decodeFromBase64(coder, base64Encoding),
- equalTo(value));
- }
-
- public static <T> void coderDecodesBase64(
- Coder<T> coder, List<String> base64Encodings, List<T> values) throws Exception {
- assertThat("List of base64 encodings has different size than List of values",
- base64Encodings.size(), equalTo(values.size()));
-
- for (int i = 0; i < base64Encodings.size(); i++) {
- coderDecodesBase64(coder, base64Encodings.get(i), values.get(i));
- }
- }
-
- private static final String ENCODING_WIRE_FORMAT_MESSAGE =
- "Encoded value does not match expected wire format."
- + " Changing the wire format should be avoided, as it is likely to cause breakage."
- + " If you truly intend to change the wire format for this Coder "
- + " then you must update getEncodingId() to a new value and add any supported"
- + " prior formats to getAllowedEncodings()."
- + " See com.google.cloud.dataflow.sdk.coders.PrintBase64Encoding for how to generate"
- + " new test data.";
-
- public static <T> void coderEncodesBase64(Coder<T> coder, T value, String base64Encoding)
- throws Exception {
- assertThat(ENCODING_WIRE_FORMAT_MESSAGE, CoderUtils.encodeToBase64(coder, value),
- equalTo(base64Encoding));
- }
-
- public static <T> void coderEncodesBase64(
- Coder<T> coder, List<T> values, List<String> base64Encodings) throws Exception {
- assertThat("List of base64 encodings has different size than List of values",
- base64Encodings.size(), equalTo(values.size()));
-
- for (int i = 0; i < base64Encodings.size(); i++) {
- coderEncodesBase64(coder, values.get(i), base64Encodings.get(i));
- }
- }
-
- @SuppressWarnings("unchecked")
- public static <T, IterableT extends Iterable<T>> void coderDecodesBase64ContentsEqual(
- Coder<IterableT> coder, String base64Encoding, IterableT expected) throws Exception {
-
- IterableT result = CoderUtils.decodeFromBase64(coder, base64Encoding);
- if (Iterables.isEmpty(expected)) {
- assertThat(ENCODING_WIRE_FORMAT_MESSAGE, result, emptyIterable());
- } else {
- assertThat(ENCODING_WIRE_FORMAT_MESSAGE, result,
- containsInAnyOrder((T[]) Iterables.toArray(expected, Object.class)));
- }
- }
-
- public static <T, IterableT extends Iterable<T>> void coderDecodesBase64ContentsEqual(
- Coder<IterableT> coder, List<String> base64Encodings, List<IterableT> expected)
- throws Exception {
- assertThat("List of base64 encodings has different size than List of values",
- base64Encodings.size(), equalTo(expected.size()));
-
- for (int i = 0; i < base64Encodings.size(); i++) {
- coderDecodesBase64ContentsEqual(coder, base64Encodings.get(i), expected.get(i));
- }
- }
-
- //////////////////////////////////////////////////////////////////////////
-
- @VisibleForTesting
- static <T> byte[] encode(
- Coder<T> coder, Coder.Context context, T value) throws CoderException, IOException {
- @SuppressWarnings("unchecked")
- Coder<T> deserializedCoder = Serializer.deserialize(coder.asCloudObject(), Coder.class);
-
- ByteArrayOutputStream os = new ByteArrayOutputStream();
- deserializedCoder.encode(value, new UnownedOutputStream(os), context);
- return os.toByteArray();
- }
-
- @VisibleForTesting
- static <T> T decode(
- Coder<T> coder, Coder.Context context, byte[] bytes) throws CoderException, IOException {
- @SuppressWarnings("unchecked")
- Coder<T> deserializedCoder = Serializer.deserialize(coder.asCloudObject(), Coder.class);
-
- ByteArrayInputStream is = new ByteArrayInputStream(bytes);
- return deserializedCoder.decode(new UnownedInputStream(is), context);
- }
-
- private static <T> T decodeEncode(Coder<T> coder, Coder.Context context, T value)
- throws CoderException, IOException {
- return decode(coder, context, encode(coder, context, value));
- }
-}
[52/67] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
Directory reorganization
Move Java SDK-specific Javadoc information from "javadoc/" into "sdks/java/javadoc".
Project: http://git-wip-us.apache.org/repos/asf/incubator-beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-beam/commit/75cfa4ac
Tree: http://git-wip-us.apache.org/repos/asf/incubator-beam/tree/75cfa4ac
Diff: http://git-wip-us.apache.org/repos/asf/incubator-beam/diff/75cfa4ac
Branch: refs/heads/master
Commit: 75cfa4ac4114610d4296f0cef57d2148ff62055d
Parents: 7bef2b7
Author: Davor Bonaci <da...@google.com>
Authored: Wed Mar 23 16:55:05 2016 -0700
Committer: Davor Bonaci <da...@google.com>
Committed: Wed Mar 23 18:12:26 2016 -0700
----------------------------------------------------------------------
javadoc/README.md | 4 ---
javadoc/apiclient-docs/package-list | 34 --------------------
javadoc/avro-docs/package-list | 30 -----------------
javadoc/bq-docs/package-list | 2 --
javadoc/dataflow-sdk-docs/package-list | 11 -------
javadoc/datastore-docs/package-list | 2 --
javadoc/guava-docs/package-list | 15 ---------
javadoc/hamcrest-docs/package-list | 10 ------
javadoc/jackson-annotations-docs/package-list | 1 -
javadoc/jackson-databind-docs/package-list | 20 ------------
javadoc/joda-docs/package-list | 7 ----
javadoc/junit-docs/package-list | 7 ----
javadoc/oauth-docs/package-list | 11 -------
javadoc/overview.html | 31 ------------------
sdks/java/core/pom.xml | 3 +-
sdks/java/javadoc/README.md | 4 +++
sdks/java/javadoc/apiclient-docs/package-list | 34 ++++++++++++++++++++
sdks/java/javadoc/avro-docs/package-list | 30 +++++++++++++++++
sdks/java/javadoc/bq-docs/package-list | 2 ++
.../java/javadoc/dataflow-sdk-docs/package-list | 11 +++++++
sdks/java/javadoc/datastore-docs/package-list | 2 ++
sdks/java/javadoc/guava-docs/package-list | 15 +++++++++
sdks/java/javadoc/hamcrest-docs/package-list | 10 ++++++
.../jackson-annotations-docs/package-list | 1 +
.../javadoc/jackson-databind-docs/package-list | 20 ++++++++++++
sdks/java/javadoc/joda-docs/package-list | 7 ++++
sdks/java/javadoc/junit-docs/package-list | 7 ++++
sdks/java/javadoc/oauth-docs/package-list | 11 +++++++
sdks/java/javadoc/overview.html | 31 ++++++++++++++++++
29 files changed, 187 insertions(+), 186 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/javadoc/README.md
----------------------------------------------------------------------
diff --git a/javadoc/README.md b/javadoc/README.md
deleted file mode 100644
index 8240d3c..0000000
--- a/javadoc/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# SDK Javadoc
-
-This directory contains package-info files for external javadoc we would like
-our javadoc to link to using `-linkoffline`.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/javadoc/apiclient-docs/package-list
----------------------------------------------------------------------
diff --git a/javadoc/apiclient-docs/package-list b/javadoc/apiclient-docs/package-list
deleted file mode 100644
index 3ec1471..0000000
--- a/javadoc/apiclient-docs/package-list
+++ /dev/null
@@ -1,34 +0,0 @@
-com.google.api.client.googleapis
-com.google.api.client.googleapis.apache
-com.google.api.client.googleapis.auth.clientlogin
-com.google.api.client.googleapis.auth.oauth2
-com.google.api.client.googleapis.batch
-com.google.api.client.googleapis.batch.json
-com.google.api.client.googleapis.compute
-com.google.api.client.googleapis.extensions.android.accounts
-com.google.api.client.googleapis.extensions.android.gms.auth
-com.google.api.client.googleapis.extensions.appengine.auth.oauth2
-com.google.api.client.googleapis.extensions.appengine.notifications
-com.google.api.client.googleapis.extensions.appengine.testing.auth.oauth2
-com.google.api.client.googleapis.extensions.java6.auth.oauth2
-com.google.api.client.googleapis.extensions.servlet.notifications
-com.google.api.client.googleapis.javanet
-com.google.api.client.googleapis.json
-com.google.api.client.googleapis.media
-com.google.api.client.googleapis.notifications
-com.google.api.client.googleapis.notifications.json
-com.google.api.client.googleapis.notifications.json.gson
-com.google.api.client.googleapis.notifications.json.jackson2
-com.google.api.client.googleapis.services
-com.google.api.client.googleapis.services.json
-com.google.api.client.googleapis.services.protobuf
-com.google.api.client.googleapis.testing
-com.google.api.client.googleapis.testing.auth.oauth2
-com.google.api.client.googleapis.testing.compute
-com.google.api.client.googleapis.testing.json
-com.google.api.client.googleapis.testing.notifications
-com.google.api.client.googleapis.testing.services
-com.google.api.client.googleapis.testing.services.json
-com.google.api.client.googleapis.testing.services.protobuf
-com.google.api.client.googleapis.util
-com.google.api.client.googleapis.xml.atom
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/javadoc/avro-docs/package-list
----------------------------------------------------------------------
diff --git a/javadoc/avro-docs/package-list b/javadoc/avro-docs/package-list
deleted file mode 100644
index 319ff01..0000000
--- a/javadoc/avro-docs/package-list
+++ /dev/null
@@ -1,30 +0,0 @@
-org.apache.avro
-org.apache.avro.compiler.idl
-org.apache.avro.compiler.specific
-org.apache.avro.data
-org.apache.avro.file
-org.apache.avro.generic
-org.apache.avro.hadoop.file
-org.apache.avro.hadoop.io
-org.apache.avro.hadoop.util
-org.apache.avro.io
-org.apache.avro.io.parsing
-org.apache.avro.ipc
-org.apache.avro.ipc.generic
-org.apache.avro.ipc.reflect
-org.apache.avro.ipc.specific
-org.apache.avro.ipc.stats
-org.apache.avro.ipc.trace
-org.apache.avro.mapred
-org.apache.avro.mapred.tether
-org.apache.avro.mapreduce
-org.apache.avro.mojo
-org.apache.avro.protobuf
-org.apache.avro.reflect
-org.apache.avro.specific
-org.apache.avro.thrift
-org.apache.avro.tool
-org.apache.avro.util
-org.apache.trevni
-org.apache.trevni.avro
-org.apache.trevni.avro.mapreduce
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/javadoc/bq-docs/package-list
----------------------------------------------------------------------
diff --git a/javadoc/bq-docs/package-list b/javadoc/bq-docs/package-list
deleted file mode 100644
index 384b3fc..0000000
--- a/javadoc/bq-docs/package-list
+++ /dev/null
@@ -1,2 +0,0 @@
-com.google.api.services.bigquery
-com.google.api.services.bigquery.model
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/javadoc/dataflow-sdk-docs/package-list
----------------------------------------------------------------------
diff --git a/javadoc/dataflow-sdk-docs/package-list b/javadoc/dataflow-sdk-docs/package-list
deleted file mode 100644
index a26f5a3..0000000
--- a/javadoc/dataflow-sdk-docs/package-list
+++ /dev/null
@@ -1,11 +0,0 @@
-com.google.cloud.dataflow.sdk
-com.google.cloud.dataflow.sdk.annotations
-com.google.cloud.dataflow.sdk.coders
-com.google.cloud.dataflow.sdk.io
-com.google.cloud.dataflow.sdk.options
-com.google.cloud.dataflow.sdk.runners
-com.google.cloud.dataflow.sdk.testing
-com.google.cloud.dataflow.sdk.transforms
-com.google.cloud.dataflow.sdk.transforms.join
-com.google.cloud.dataflow.sdk.transforms.windowing
-com.google.cloud.dataflow.sdk.values
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/javadoc/datastore-docs/package-list
----------------------------------------------------------------------
diff --git a/javadoc/datastore-docs/package-list b/javadoc/datastore-docs/package-list
deleted file mode 100644
index ebbafd8..0000000
--- a/javadoc/datastore-docs/package-list
+++ /dev/null
@@ -1,2 +0,0 @@
-com.google.api.services.datastore
-com.google.api.services.datastore.client
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/javadoc/guava-docs/package-list
----------------------------------------------------------------------
diff --git a/javadoc/guava-docs/package-list b/javadoc/guava-docs/package-list
deleted file mode 100644
index f855178..0000000
--- a/javadoc/guava-docs/package-list
+++ /dev/null
@@ -1,15 +0,0 @@
-com.google.common.annotations
-com.google.common.base
-com.google.common.cache
-com.google.common.collect
-com.google.common.escape
-com.google.common.eventbus
-com.google.common.hash
-com.google.common.html
-com.google.common.io
-com.google.common.math
-com.google.common.net
-com.google.common.primitives
-com.google.common.reflect
-com.google.common.util.concurrent
-com.google.common.xml
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/javadoc/hamcrest-docs/package-list
----------------------------------------------------------------------
diff --git a/javadoc/hamcrest-docs/package-list b/javadoc/hamcrest-docs/package-list
deleted file mode 100644
index 3f5e945..0000000
--- a/javadoc/hamcrest-docs/package-list
+++ /dev/null
@@ -1,10 +0,0 @@
-org.hamcrest
-org.hamcrest.beans
-org.hamcrest.collection
-org.hamcrest.core
-org.hamcrest.integration
-org.hamcrest.internal
-org.hamcrest.number
-org.hamcrest.object
-org.hamcrest.text
-org.hamcrest.xml
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/javadoc/jackson-annotations-docs/package-list
----------------------------------------------------------------------
diff --git a/javadoc/jackson-annotations-docs/package-list b/javadoc/jackson-annotations-docs/package-list
deleted file mode 100644
index 768b3ba..0000000
--- a/javadoc/jackson-annotations-docs/package-list
+++ /dev/null
@@ -1 +0,0 @@
-com.fasterxml.jackson.annotation
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/javadoc/jackson-databind-docs/package-list
----------------------------------------------------------------------
diff --git a/javadoc/jackson-databind-docs/package-list b/javadoc/jackson-databind-docs/package-list
deleted file mode 100644
index 8a2cd8b..0000000
--- a/javadoc/jackson-databind-docs/package-list
+++ /dev/null
@@ -1,20 +0,0 @@
-com.fasterxml.jackson.databind
-com.fasterxml.jackson.databind.annotation
-com.fasterxml.jackson.databind.cfg
-com.fasterxml.jackson.databind.deser
-com.fasterxml.jackson.databind.deser.impl
-com.fasterxml.jackson.databind.deser.std
-com.fasterxml.jackson.databind.exc
-com.fasterxml.jackson.databind.ext
-com.fasterxml.jackson.databind.introspect
-com.fasterxml.jackson.databind.jsonFormatVisitors
-com.fasterxml.jackson.databind.jsonschema
-com.fasterxml.jackson.databind.jsontype
-com.fasterxml.jackson.databind.jsontype.impl
-com.fasterxml.jackson.databind.module
-com.fasterxml.jackson.databind.node
-com.fasterxml.jackson.databind.ser
-com.fasterxml.jackson.databind.ser.impl
-com.fasterxml.jackson.databind.ser.std
-com.fasterxml.jackson.databind.type
-com.fasterxml.jackson.databind.util
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/javadoc/joda-docs/package-list
----------------------------------------------------------------------
diff --git a/javadoc/joda-docs/package-list b/javadoc/joda-docs/package-list
deleted file mode 100644
index 2ab05aa..0000000
--- a/javadoc/joda-docs/package-list
+++ /dev/null
@@ -1,7 +0,0 @@
-org.joda.time
-org.joda.time.base
-org.joda.time.chrono
-org.joda.time.convert
-org.joda.time.field
-org.joda.time.format
-org.joda.time.tz
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/javadoc/junit-docs/package-list
----------------------------------------------------------------------
diff --git a/javadoc/junit-docs/package-list b/javadoc/junit-docs/package-list
deleted file mode 100644
index 0735177..0000000
--- a/javadoc/junit-docs/package-list
+++ /dev/null
@@ -1,7 +0,0 @@
-org.hamcrest.core
-org.junit
-org.junit.matchers
-org.junit.runner
-org.junit.runner.manipulation
-org.junit.runner.notification
-org.junit.runners
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/javadoc/oauth-docs/package-list
----------------------------------------------------------------------
diff --git a/javadoc/oauth-docs/package-list b/javadoc/oauth-docs/package-list
deleted file mode 100644
index 38fc046..0000000
--- a/javadoc/oauth-docs/package-list
+++ /dev/null
@@ -1,11 +0,0 @@
-com.google.api.client.auth.oauth
-com.google.api.client.auth.oauth2
-com.google.api.client.auth.openidconnect
-com.google.api.client.extensions.appengine.auth
-com.google.api.client.extensions.appengine.auth.oauth2
-com.google.api.client.extensions.auth.helpers
-com.google.api.client.extensions.auth.helpers.oauth
-com.google.api.client.extensions.java6.auth.oauth2
-com.google.api.client.extensions.jetty.auth.oauth2
-com.google.api.client.extensions.servlet.auth
-com.google.api.client.extensions.servlet.auth.oauth2
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/javadoc/overview.html
----------------------------------------------------------------------
diff --git a/javadoc/overview.html b/javadoc/overview.html
deleted file mode 100644
index 4ffd33f..0000000
--- a/javadoc/overview.html
+++ /dev/null
@@ -1,31 +0,0 @@
-<!DOCTYPE html>
-<html>
- <head>
- <title>Google Cloud Dataflow Java SDK</title>
- </head>
- <body>
- <p>The Google Cloud Dataflow SDK for Java provides a simple and elegant
- programming model to express your data processing pipelines;
- see <a href="https://cloud.google.com/dataflow/">our product page</a>
- for more information and getting started instructions.</p>
-
- <p>The easiest way to use the Google Cloud Dataflow SDK for Java is via
- one of the released artifacts from the
- <a href="http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22com.google.cloud.dataflow%22">
- Maven Central Repository</a>.
- See our <a href="https://cloud.google.com/dataflow/release-notes/java">
- release notes</a> for more information about each released version.<p>
-
- <p>Version numbers use the form <i>major</i>.<i>minor</i>.<i>incremental</i>
- and are incremented as follows:<p>
- <ul>
- <li>major version for incompatible API changes</li>
- <li>minor version for new functionality added in a backward-compatible manner</li>
- <li>incremental version for forward-compatible bug fixes</li>
- </ul>
-
- <p>Please note that APIs marked
- {@link com.google.cloud.dataflow.sdk.annotations.Experimental @Experimental}
- may change at any point and are not guaranteed to remain compatible across versions.</p>
- </body>
-</html>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/sdks/java/core/pom.xml
----------------------------------------------------------------------
diff --git a/sdks/java/core/pom.xml b/sdks/java/core/pom.xml
index e90446c..2b9e4a9 100644
--- a/sdks/java/core/pom.xml
+++ b/sdks/java/core/pom.xml
@@ -157,7 +157,8 @@
</plugin>
<plugin>
- <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-javadoc-plugin</artifactId>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-javadoc-plugin</artifactId>
<configuration>
<windowtitle>Google Cloud Dataflow SDK ${project.version} API</windowtitle>
<doctitle>Google Cloud Dataflow SDK for Java, version ${project.version}</doctitle>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/sdks/java/javadoc/README.md
----------------------------------------------------------------------
diff --git a/sdks/java/javadoc/README.md b/sdks/java/javadoc/README.md
new file mode 100644
index 0000000..8240d3c
--- /dev/null
+++ b/sdks/java/javadoc/README.md
@@ -0,0 +1,4 @@
+# SDK Javadoc
+
+This directory contains package-info files for external javadoc we would like
+our javadoc to link to using `-linkoffline`.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/sdks/java/javadoc/apiclient-docs/package-list
----------------------------------------------------------------------
diff --git a/sdks/java/javadoc/apiclient-docs/package-list b/sdks/java/javadoc/apiclient-docs/package-list
new file mode 100644
index 0000000..3ec1471
--- /dev/null
+++ b/sdks/java/javadoc/apiclient-docs/package-list
@@ -0,0 +1,34 @@
+com.google.api.client.googleapis
+com.google.api.client.googleapis.apache
+com.google.api.client.googleapis.auth.clientlogin
+com.google.api.client.googleapis.auth.oauth2
+com.google.api.client.googleapis.batch
+com.google.api.client.googleapis.batch.json
+com.google.api.client.googleapis.compute
+com.google.api.client.googleapis.extensions.android.accounts
+com.google.api.client.googleapis.extensions.android.gms.auth
+com.google.api.client.googleapis.extensions.appengine.auth.oauth2
+com.google.api.client.googleapis.extensions.appengine.notifications
+com.google.api.client.googleapis.extensions.appengine.testing.auth.oauth2
+com.google.api.client.googleapis.extensions.java6.auth.oauth2
+com.google.api.client.googleapis.extensions.servlet.notifications
+com.google.api.client.googleapis.javanet
+com.google.api.client.googleapis.json
+com.google.api.client.googleapis.media
+com.google.api.client.googleapis.notifications
+com.google.api.client.googleapis.notifications.json
+com.google.api.client.googleapis.notifications.json.gson
+com.google.api.client.googleapis.notifications.json.jackson2
+com.google.api.client.googleapis.services
+com.google.api.client.googleapis.services.json
+com.google.api.client.googleapis.services.protobuf
+com.google.api.client.googleapis.testing
+com.google.api.client.googleapis.testing.auth.oauth2
+com.google.api.client.googleapis.testing.compute
+com.google.api.client.googleapis.testing.json
+com.google.api.client.googleapis.testing.notifications
+com.google.api.client.googleapis.testing.services
+com.google.api.client.googleapis.testing.services.json
+com.google.api.client.googleapis.testing.services.protobuf
+com.google.api.client.googleapis.util
+com.google.api.client.googleapis.xml.atom
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/sdks/java/javadoc/avro-docs/package-list
----------------------------------------------------------------------
diff --git a/sdks/java/javadoc/avro-docs/package-list b/sdks/java/javadoc/avro-docs/package-list
new file mode 100644
index 0000000..319ff01
--- /dev/null
+++ b/sdks/java/javadoc/avro-docs/package-list
@@ -0,0 +1,30 @@
+org.apache.avro
+org.apache.avro.compiler.idl
+org.apache.avro.compiler.specific
+org.apache.avro.data
+org.apache.avro.file
+org.apache.avro.generic
+org.apache.avro.hadoop.file
+org.apache.avro.hadoop.io
+org.apache.avro.hadoop.util
+org.apache.avro.io
+org.apache.avro.io.parsing
+org.apache.avro.ipc
+org.apache.avro.ipc.generic
+org.apache.avro.ipc.reflect
+org.apache.avro.ipc.specific
+org.apache.avro.ipc.stats
+org.apache.avro.ipc.trace
+org.apache.avro.mapred
+org.apache.avro.mapred.tether
+org.apache.avro.mapreduce
+org.apache.avro.mojo
+org.apache.avro.protobuf
+org.apache.avro.reflect
+org.apache.avro.specific
+org.apache.avro.thrift
+org.apache.avro.tool
+org.apache.avro.util
+org.apache.trevni
+org.apache.trevni.avro
+org.apache.trevni.avro.mapreduce
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/sdks/java/javadoc/bq-docs/package-list
----------------------------------------------------------------------
diff --git a/sdks/java/javadoc/bq-docs/package-list b/sdks/java/javadoc/bq-docs/package-list
new file mode 100644
index 0000000..384b3fc
--- /dev/null
+++ b/sdks/java/javadoc/bq-docs/package-list
@@ -0,0 +1,2 @@
+com.google.api.services.bigquery
+com.google.api.services.bigquery.model
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/sdks/java/javadoc/dataflow-sdk-docs/package-list
----------------------------------------------------------------------
diff --git a/sdks/java/javadoc/dataflow-sdk-docs/package-list b/sdks/java/javadoc/dataflow-sdk-docs/package-list
new file mode 100644
index 0000000..a26f5a3
--- /dev/null
+++ b/sdks/java/javadoc/dataflow-sdk-docs/package-list
@@ -0,0 +1,11 @@
+com.google.cloud.dataflow.sdk
+com.google.cloud.dataflow.sdk.annotations
+com.google.cloud.dataflow.sdk.coders
+com.google.cloud.dataflow.sdk.io
+com.google.cloud.dataflow.sdk.options
+com.google.cloud.dataflow.sdk.runners
+com.google.cloud.dataflow.sdk.testing
+com.google.cloud.dataflow.sdk.transforms
+com.google.cloud.dataflow.sdk.transforms.join
+com.google.cloud.dataflow.sdk.transforms.windowing
+com.google.cloud.dataflow.sdk.values
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/sdks/java/javadoc/datastore-docs/package-list
----------------------------------------------------------------------
diff --git a/sdks/java/javadoc/datastore-docs/package-list b/sdks/java/javadoc/datastore-docs/package-list
new file mode 100644
index 0000000..ebbafd8
--- /dev/null
+++ b/sdks/java/javadoc/datastore-docs/package-list
@@ -0,0 +1,2 @@
+com.google.api.services.datastore
+com.google.api.services.datastore.client
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/sdks/java/javadoc/guava-docs/package-list
----------------------------------------------------------------------
diff --git a/sdks/java/javadoc/guava-docs/package-list b/sdks/java/javadoc/guava-docs/package-list
new file mode 100644
index 0000000..f855178
--- /dev/null
+++ b/sdks/java/javadoc/guava-docs/package-list
@@ -0,0 +1,15 @@
+com.google.common.annotations
+com.google.common.base
+com.google.common.cache
+com.google.common.collect
+com.google.common.escape
+com.google.common.eventbus
+com.google.common.hash
+com.google.common.html
+com.google.common.io
+com.google.common.math
+com.google.common.net
+com.google.common.primitives
+com.google.common.reflect
+com.google.common.util.concurrent
+com.google.common.xml
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/sdks/java/javadoc/hamcrest-docs/package-list
----------------------------------------------------------------------
diff --git a/sdks/java/javadoc/hamcrest-docs/package-list b/sdks/java/javadoc/hamcrest-docs/package-list
new file mode 100644
index 0000000..3f5e945
--- /dev/null
+++ b/sdks/java/javadoc/hamcrest-docs/package-list
@@ -0,0 +1,10 @@
+org.hamcrest
+org.hamcrest.beans
+org.hamcrest.collection
+org.hamcrest.core
+org.hamcrest.integration
+org.hamcrest.internal
+org.hamcrest.number
+org.hamcrest.object
+org.hamcrest.text
+org.hamcrest.xml
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/sdks/java/javadoc/jackson-annotations-docs/package-list
----------------------------------------------------------------------
diff --git a/sdks/java/javadoc/jackson-annotations-docs/package-list b/sdks/java/javadoc/jackson-annotations-docs/package-list
new file mode 100644
index 0000000..768b3ba
--- /dev/null
+++ b/sdks/java/javadoc/jackson-annotations-docs/package-list
@@ -0,0 +1 @@
+com.fasterxml.jackson.annotation
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/sdks/java/javadoc/jackson-databind-docs/package-list
----------------------------------------------------------------------
diff --git a/sdks/java/javadoc/jackson-databind-docs/package-list b/sdks/java/javadoc/jackson-databind-docs/package-list
new file mode 100644
index 0000000..8a2cd8b
--- /dev/null
+++ b/sdks/java/javadoc/jackson-databind-docs/package-list
@@ -0,0 +1,20 @@
+com.fasterxml.jackson.databind
+com.fasterxml.jackson.databind.annotation
+com.fasterxml.jackson.databind.cfg
+com.fasterxml.jackson.databind.deser
+com.fasterxml.jackson.databind.deser.impl
+com.fasterxml.jackson.databind.deser.std
+com.fasterxml.jackson.databind.exc
+com.fasterxml.jackson.databind.ext
+com.fasterxml.jackson.databind.introspect
+com.fasterxml.jackson.databind.jsonFormatVisitors
+com.fasterxml.jackson.databind.jsonschema
+com.fasterxml.jackson.databind.jsontype
+com.fasterxml.jackson.databind.jsontype.impl
+com.fasterxml.jackson.databind.module
+com.fasterxml.jackson.databind.node
+com.fasterxml.jackson.databind.ser
+com.fasterxml.jackson.databind.ser.impl
+com.fasterxml.jackson.databind.ser.std
+com.fasterxml.jackson.databind.type
+com.fasterxml.jackson.databind.util
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/sdks/java/javadoc/joda-docs/package-list
----------------------------------------------------------------------
diff --git a/sdks/java/javadoc/joda-docs/package-list b/sdks/java/javadoc/joda-docs/package-list
new file mode 100644
index 0000000..2ab05aa
--- /dev/null
+++ b/sdks/java/javadoc/joda-docs/package-list
@@ -0,0 +1,7 @@
+org.joda.time
+org.joda.time.base
+org.joda.time.chrono
+org.joda.time.convert
+org.joda.time.field
+org.joda.time.format
+org.joda.time.tz
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/sdks/java/javadoc/junit-docs/package-list
----------------------------------------------------------------------
diff --git a/sdks/java/javadoc/junit-docs/package-list b/sdks/java/javadoc/junit-docs/package-list
new file mode 100644
index 0000000..0735177
--- /dev/null
+++ b/sdks/java/javadoc/junit-docs/package-list
@@ -0,0 +1,7 @@
+org.hamcrest.core
+org.junit
+org.junit.matchers
+org.junit.runner
+org.junit.runner.manipulation
+org.junit.runner.notification
+org.junit.runners
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/sdks/java/javadoc/oauth-docs/package-list
----------------------------------------------------------------------
diff --git a/sdks/java/javadoc/oauth-docs/package-list b/sdks/java/javadoc/oauth-docs/package-list
new file mode 100644
index 0000000..38fc046
--- /dev/null
+++ b/sdks/java/javadoc/oauth-docs/package-list
@@ -0,0 +1,11 @@
+com.google.api.client.auth.oauth
+com.google.api.client.auth.oauth2
+com.google.api.client.auth.openidconnect
+com.google.api.client.extensions.appengine.auth
+com.google.api.client.extensions.appengine.auth.oauth2
+com.google.api.client.extensions.auth.helpers
+com.google.api.client.extensions.auth.helpers.oauth
+com.google.api.client.extensions.java6.auth.oauth2
+com.google.api.client.extensions.jetty.auth.oauth2
+com.google.api.client.extensions.servlet.auth
+com.google.api.client.extensions.servlet.auth.oauth2
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/75cfa4ac/sdks/java/javadoc/overview.html
----------------------------------------------------------------------
diff --git a/sdks/java/javadoc/overview.html b/sdks/java/javadoc/overview.html
new file mode 100644
index 0000000..4ffd33f
--- /dev/null
+++ b/sdks/java/javadoc/overview.html
@@ -0,0 +1,31 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <title>Google Cloud Dataflow Java SDK</title>
+ </head>
+ <body>
+ <p>The Google Cloud Dataflow SDK for Java provides a simple and elegant
+ programming model to express your data processing pipelines;
+ see <a href="https://cloud.google.com/dataflow/">our product page</a>
+ for more information and getting started instructions.</p>
+
+ <p>The easiest way to use the Google Cloud Dataflow SDK for Java is via
+ one of the released artifacts from the
+ <a href="http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22com.google.cloud.dataflow%22">
+ Maven Central Repository</a>.
+ See our <a href="https://cloud.google.com/dataflow/release-notes/java">
+ release notes</a> for more information about each released version.<p>
+
+ <p>Version numbers use the form <i>major</i>.<i>minor</i>.<i>incremental</i>
+ and are incremented as follows:<p>
+ <ul>
+ <li>major version for incompatible API changes</li>
+ <li>minor version for new functionality added in a backward-compatible manner</li>
+ <li>incremental version for forward-compatible bug fixes</li>
+ </ul>
+
+ <p>Please note that APIs marked
+ {@link com.google.cloud.dataflow.sdk.annotations.Experimental @Experimental}
+ may change at any point and are not guaranteed to remain compatible across versions.</p>
+ </body>
+</html>
[39/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/BigtableServiceImpl.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/BigtableServiceImpl.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/BigtableServiceImpl.java
deleted file mode 100644
index 5ab8582..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/BigtableServiceImpl.java
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.io.bigtable;
-
-import com.google.bigtable.admin.table.v1.GetTableRequest;
-import com.google.bigtable.v1.MutateRowRequest;
-import com.google.bigtable.v1.Mutation;
-import com.google.bigtable.v1.ReadRowsRequest;
-import com.google.bigtable.v1.Row;
-import com.google.bigtable.v1.RowRange;
-import com.google.bigtable.v1.SampleRowKeysRequest;
-import com.google.bigtable.v1.SampleRowKeysResponse;
-import com.google.cloud.bigtable.config.BigtableOptions;
-import com.google.cloud.bigtable.grpc.BigtableSession;
-import com.google.cloud.bigtable.grpc.async.AsyncExecutor;
-import com.google.cloud.bigtable.grpc.async.HeapSizeManager;
-import com.google.cloud.bigtable.grpc.scanner.ResultScanner;
-import com.google.cloud.dataflow.sdk.io.bigtable.BigtableIO.BigtableSource;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.common.base.MoreObjects;
-import com.google.common.io.Closer;
-import com.google.common.util.concurrent.ListenableFuture;
-import com.google.protobuf.ByteString;
-import com.google.protobuf.Empty;
-
-import io.grpc.Status.Code;
-import io.grpc.StatusRuntimeException;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.NoSuchElementException;
-
-/**
- * An implementation of {@link BigtableService} that actually communicates with the Cloud Bigtable
- * service.
- */
-class BigtableServiceImpl implements BigtableService {
- private static final Logger logger = LoggerFactory.getLogger(BigtableService.class);
-
- public BigtableServiceImpl(BigtableOptions options) {
- this.options = options;
- }
-
- private final BigtableOptions options;
-
- @Override
- public BigtableWriterImpl openForWriting(String tableId) throws IOException {
- BigtableSession session = new BigtableSession(options);
- String tableName = options.getClusterName().toTableNameStr(tableId);
- return new BigtableWriterImpl(session, tableName);
- }
-
- @Override
- public boolean tableExists(String tableId) throws IOException {
- if (!BigtableSession.isAlpnProviderEnabled()) {
- logger.info(
- "Skipping existence check for table {} (BigtableOptions {}) because ALPN is not"
- + " configured.",
- tableId,
- options);
- return true;
- }
-
- try (BigtableSession session = new BigtableSession(options)) {
- GetTableRequest getTable =
- GetTableRequest.newBuilder()
- .setName(options.getClusterName().toTableNameStr(tableId))
- .build();
- session.getTableAdminClient().getTable(getTable);
- return true;
- } catch (StatusRuntimeException e) {
- if (e.getStatus().getCode() == Code.NOT_FOUND) {
- return false;
- }
- String message =
- String.format(
- "Error checking whether table %s (BigtableOptions %s) exists", tableId, options);
- logger.error(message, e);
- throw new IOException(message, e);
- }
- }
-
- private class BigtableReaderImpl implements Reader {
- private BigtableSession session;
- private final BigtableSource source;
- private ResultScanner<Row> results;
- private Row currentRow;
-
- public BigtableReaderImpl(BigtableSession session, BigtableSource source) {
- this.session = session;
- this.source = source;
- }
-
- @Override
- public boolean start() throws IOException {
- RowRange range =
- RowRange.newBuilder()
- .setStartKey(source.getRange().getStartKey().getValue())
- .setEndKey(source.getRange().getEndKey().getValue())
- .build();
- ReadRowsRequest.Builder requestB =
- ReadRowsRequest.newBuilder()
- .setRowRange(range)
- .setTableName(options.getClusterName().toTableNameStr(source.getTableId()));
- if (source.getRowFilter() != null) {
- requestB.setFilter(source.getRowFilter());
- }
- results = session.getDataClient().readRows(requestB.build());
- return advance();
- }
-
- @Override
- public boolean advance() throws IOException {
- currentRow = results.next();
- return (currentRow != null);
- }
-
- @Override
- public void close() throws IOException {
- // Goal: by the end of this function, both results and session are null and closed,
- // independent of what errors they throw or prior state.
-
- if (session == null) {
- // Only possible when previously closed, so we know that results is also null.
- return;
- }
-
- // Session does not implement Closeable -- it's AutoCloseable. So we can't register it with
- // the Closer, but we can use the Closer to simplify the error handling.
- try (Closer closer = Closer.create()) {
- if (results != null) {
- closer.register(results);
- results = null;
- }
-
- session.close();
- } finally {
- session = null;
- }
- }
-
- @Override
- public Row getCurrentRow() throws NoSuchElementException {
- if (currentRow == null) {
- throw new NoSuchElementException();
- }
- return currentRow;
- }
- }
-
- private static class BigtableWriterImpl implements Writer {
- private BigtableSession session;
- private AsyncExecutor executor;
- private final MutateRowRequest.Builder partialBuilder;
-
- public BigtableWriterImpl(BigtableSession session, String tableName) {
- this.session = session;
- this.executor =
- new AsyncExecutor(
- session.getDataClient(),
- new HeapSizeManager(
- AsyncExecutor.ASYNC_MUTATOR_MAX_MEMORY_DEFAULT,
- AsyncExecutor.MAX_INFLIGHT_RPCS_DEFAULT));
-
- partialBuilder = MutateRowRequest.newBuilder().setTableName(tableName);
- }
-
- @Override
- public void close() throws IOException {
- try {
- if (executor != null) {
- executor.flush();
- executor = null;
- }
- } finally {
- if (session != null) {
- session.close();
- session = null;
- }
- }
- }
-
- @Override
- public ListenableFuture<Empty> writeRecord(KV<ByteString, Iterable<Mutation>> record)
- throws IOException {
- MutateRowRequest r =
- partialBuilder
- .clone()
- .setRowKey(record.getKey())
- .addAllMutations(record.getValue())
- .build();
- try {
- return executor.mutateRowAsync(r);
- } catch (InterruptedException e) {
- Thread.currentThread().interrupt();
- throw new IOException("Write interrupted", e);
- }
- }
- }
-
- @Override
- public String toString() {
- return MoreObjects
- .toStringHelper(BigtableServiceImpl.class)
- .add("options", options)
- .toString();
- }
-
- @Override
- public Reader createReader(BigtableSource source) throws IOException {
- BigtableSession session = new BigtableSession(options);
- return new BigtableReaderImpl(session, source);
- }
-
- @Override
- public List<SampleRowKeysResponse> getSampleRowKeys(BigtableSource source) throws IOException {
- try (BigtableSession session = new BigtableSession(options)) {
- SampleRowKeysRequest request =
- SampleRowKeysRequest.newBuilder()
- .setTableName(options.getClusterName().toTableNameStr(source.getTableId()))
- .build();
- return session.getDataClient().sampleRowKeys(request);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/package-info.java
deleted file mode 100644
index 112a954..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/**
- * Defines transforms for reading and writing from Google Cloud Bigtable.
- *
- * @see com.google.cloud.dataflow.sdk.io.bigtable.BigtableIO
- */
-package com.google.cloud.dataflow.sdk.io.bigtable;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/package-info.java
deleted file mode 100644
index de0bd86..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/package-info.java
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/**
- * Defines transforms for reading and writing common storage formats, including
- * {@link com.google.cloud.dataflow.sdk.io.AvroIO},
- * {@link com.google.cloud.dataflow.sdk.io.BigQueryIO}, and
- * {@link com.google.cloud.dataflow.sdk.io.TextIO}.
- *
- * <p>The classes in this package provide {@code Read} transforms that create PCollections
- * from existing storage:
- * <pre>{@code
- * PCollection<TableRow> inputData = pipeline.apply(
- * BigQueryIO.Read.named("Read")
- * .from("clouddataflow-readonly:samples.weather_stations");
- * }</pre>
- * and {@code Write} transforms that persist PCollections to external storage:
- * <pre> {@code
- * PCollection<Integer> numbers = ...;
- * numbers.apply(TextIO.Write.named("WriteNumbers")
- * .to("gs://my_bucket/path/to/numbers"));
- * } </pre>
- */
-package com.google.cloud.dataflow.sdk.io;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/ByteKey.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/ByteKey.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/ByteKey.java
deleted file mode 100644
index 30772da..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/ByteKey.java
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io.range;
-
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.google.protobuf.ByteString;
-import com.google.protobuf.ByteString.ByteIterator;
-
-import java.io.Serializable;
-
-/**
- * A class representing a key consisting of an array of bytes. Arbitrary-length
- * {@code byte[]} keys are typical in key-value stores such as Google Cloud Bigtable.
- *
- * <p>Instances of {@link ByteKey} are immutable.
- *
- * <p>{@link ByteKey} implements {@link Comparable Comparable<ByteKey>} by comparing the
- * arrays in lexicographic order. The smallest {@link ByteKey} is a zero-length array; the successor
- * to a key is the same key with an additional 0 byte appended; and keys have unbounded size.
- *
- * <p>Note that the empty {@link ByteKey} compares smaller than all other keys, but some systems
- * have the semantic that when an empty {@link ByteKey} is used as an upper bound, it represents
- * the largest possible key. In these cases, implementors should use {@link #isEmpty} to test
- * whether an upper bound key is empty.
- */
-public final class ByteKey implements Comparable<ByteKey>, Serializable {
- /** An empty key. */
- public static final ByteKey EMPTY = ByteKey.of();
-
- /**
- * Creates a new {@link ByteKey} backed by the specified {@link ByteString}.
- */
- public static ByteKey of(ByteString value) {
- return new ByteKey(value);
- }
-
- /**
- * Creates a new {@link ByteKey} backed by a copy of the specified {@code byte[]}.
- *
- * <p>Makes a copy of the underlying array.
- */
- public static ByteKey copyFrom(byte[] bytes) {
- return of(ByteString.copyFrom(bytes));
- }
-
- /**
- * Creates a new {@link ByteKey} backed by a copy of the specified {@code int[]}. This method is
- * primarily used as a convenience to create a {@link ByteKey} in code without casting down to
- * signed Java {@link Byte bytes}:
- *
- * <pre>{@code
- * ByteKey key = ByteKey.of(0xde, 0xad, 0xbe, 0xef);
- * }</pre>
- *
- * <p>Makes a copy of the input.
- */
- public static ByteKey of(int... bytes) {
- byte[] ret = new byte[bytes.length];
- for (int i = 0; i < bytes.length; ++i) {
- ret[i] = (byte) (bytes[i] & 0xff);
- }
- return ByteKey.copyFrom(ret);
- }
-
- /**
- * Returns an immutable {@link ByteString} representing this {@link ByteKey}.
- *
- * <p>Does not copy.
- */
- public ByteString getValue() {
- return value;
- }
-
- /**
- * Returns a newly-allocated {@code byte[]} representing this {@link ByteKey}.
- *
- * <p>Copies the underlying {@code byte[]}.
- */
- public byte[] getBytes() {
- return value.toByteArray();
- }
-
- /**
- * Returns {@code true} if the {@code byte[]} backing this {@link ByteKey} is of length 0.
- */
- public boolean isEmpty() {
- return value.isEmpty();
- }
-
- /**
- * {@link ByteKey} implements {@link Comparable Comparable<ByteKey>} by comparing the
- * arrays in lexicographic order. The smallest {@link ByteKey} is a zero-length array; the
- * successor to a key is the same key with an additional 0 byte appended; and keys have unbounded
- * size.
- */
- @Override
- public int compareTo(ByteKey other) {
- checkNotNull(other, "other");
- ByteIterator thisIt = value.iterator();
- ByteIterator otherIt = other.value.iterator();
- while (thisIt.hasNext() && otherIt.hasNext()) {
- // (byte & 0xff) converts [-128,127] bytes to [0,255] ints.
- int cmp = (thisIt.nextByte() & 0xff) - (otherIt.nextByte() & 0xff);
- if (cmp != 0) {
- return cmp;
- }
- }
- // If we get here, the prefix of both arrays is equal up to the shorter array. The array with
- // more bytes is larger.
- return value.size() - other.value.size();
- }
-
- ////////////////////////////////////////////////////////////////////////////////////
- private final ByteString value;
-
- private ByteKey(ByteString value) {
- this.value = value;
- }
-
- /** Array used as a helper in {@link #toString}. */
- private static final char[] HEX =
- new char[] {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
-
- // Prints the key as a string "[deadbeef]".
- @Override
- public String toString() {
- char[] encoded = new char[2 * value.size() + 2];
- encoded[0] = '[';
- int cnt = 1;
- ByteIterator iterator = value.iterator();
- while (iterator.hasNext()) {
- byte b = iterator.nextByte();
- encoded[cnt] = HEX[(b & 0xF0) >>> 4];
- ++cnt;
- encoded[cnt] = HEX[b & 0xF];
- ++cnt;
- }
- encoded[cnt] = ']';
- return new String(encoded);
- }
-
- @Override
- public boolean equals(Object o) {
- if (o == this) {
- return true;
- }
- if (!(o instanceof ByteKey)) {
- return false;
- }
- ByteKey other = (ByteKey) o;
- return (other.value.size() == value.size()) && this.compareTo(other) == 0;
- }
-
- @Override
- public int hashCode() {
- return value.hashCode();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/ByteKeyRange.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/ByteKeyRange.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/ByteKeyRange.java
deleted file mode 100644
index 6f58d39..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/ByteKeyRange.java
+++ /dev/null
@@ -1,376 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io.range;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkNotNull;
-import static com.google.common.base.Preconditions.checkState;
-import static com.google.common.base.Verify.verify;
-
-import com.google.common.base.MoreObjects;
-import com.google.common.collect.ImmutableList;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.Serializable;
-import java.math.BigDecimal;
-import java.math.BigInteger;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Objects;
-
-/**
- * A class representing a range of {@link ByteKey ByteKeys}.
- *
- * <p>Instances of {@link ByteKeyRange} are immutable.
- *
- * <p>A {@link ByteKeyRange} enforces the restriction that its start and end keys must form a valid,
- * non-empty range {@code [startKey, endKey)} that is inclusive of the start key and exclusive of
- * the end key.
- *
- * <p>When the end key is empty, it is treated as the largest possible key.
- *
- * <h3>Interpreting {@link ByteKey} in a {@link ByteKeyRange}</h3>
- *
- * <p>The primary role of {@link ByteKeyRange} is to provide functionality for
- * {@link #estimateFractionForKey(ByteKey)}, {@link #interpolateKey(double)}, and
- * {@link #split(int)}, which are used for Google Cloud Dataflow's
- * <a href="https://cloud.google.com/dataflow/service/dataflow-service-desc#AutoScaling">Autoscaling
- * and Dynamic Work Rebalancing</a> features.
- *
- * <p>{@link ByteKeyRange} implements these features by treating a {@link ByteKey}'s underlying
- * {@code byte[]} as the binary expansion of floating point numbers in the range {@code [0.0, 1.0]}.
- * For example, the keys {@code ByteKey.of(0x80)}, {@code ByteKey.of(0xc0)}, and
- * {@code ByteKey.of(0xe0)} are interpreted as {@code 0.5}, {@code 0.75}, and {@code 0.875}
- * respectively. The empty {@code ByteKey.EMPTY} is interpreted as {@code 0.0} when used as the
- * start of a range and {@code 1.0} when used as the end key.
- *
- * <p>Key interpolation, fraction estimation, and range splitting are all interpreted in these
- * floating-point semantics. See the respective implementations for further details. <b>Note:</b>
- * the underlying implementations of these functions use {@link BigInteger} and {@link BigDecimal},
- * so they can be slow and should not be called in hot loops. Dataflow's dynamic work
- * rebalancing will only invoke these functions during periodic control operations, so they are not
- * called on the critical path.
- *
- * @see ByteKey
- */
-public final class ByteKeyRange implements Serializable {
- private static final Logger logger = LoggerFactory.getLogger(ByteKeyRange.class);
-
- /** The range of all keys, with empty start and end keys. */
- public static final ByteKeyRange ALL_KEYS = ByteKeyRange.of(ByteKey.EMPTY, ByteKey.EMPTY);
-
- /**
- * Creates a new {@link ByteKeyRange} with the given start and end keys.
- *
- * <p>Note that if {@code endKey} is empty, it is treated as the largest possible key.
- *
- * @see ByteKeyRange
- *
- * @throws IllegalArgumentException if {@code endKey} is less than or equal to {@code startKey},
- * unless {@code endKey} is empty indicating the maximum possible {@link ByteKey}.
- */
- public static ByteKeyRange of(ByteKey startKey, ByteKey endKey) {
- return new ByteKeyRange(startKey, endKey);
- }
-
- /**
- * Returns the {@link ByteKey} representing the lower bound of this {@link ByteKeyRange}.
- */
- public ByteKey getStartKey() {
- return startKey;
- }
-
- /**
- * Returns the {@link ByteKey} representing the upper bound of this {@link ByteKeyRange}.
- *
- * <p>Note that if {@code endKey} is empty, it is treated as the largest possible key.
- */
- public ByteKey getEndKey() {
- return endKey;
- }
-
- /**
- * Returns {@code true} if the specified {@link ByteKey} is contained within this range.
- */
- public Boolean containsKey(ByteKey key) {
- return key.compareTo(startKey) >= 0 && endsAfterKey(key);
- }
-
- /**
- * Returns {@code true} if the specified {@link ByteKeyRange} overlaps this range.
- */
- public Boolean overlaps(ByteKeyRange other) {
- // If each range starts before the other range ends, then they must overlap.
- // { [] } -- one range inside the other OR { [ } ] -- partial overlap.
- return endsAfterKey(other.startKey) && other.endsAfterKey(startKey);
- }
-
- /**
- * Returns a list of up to {@code numSplits + 1} {@link ByteKey ByteKeys} in ascending order,
- * where the keys have been interpolated to form roughly equal sub-ranges of this
- * {@link ByteKeyRange}, assuming a uniform distribution of keys within this range.
- *
- * <p>The first {@link ByteKey} in the result is guaranteed to be equal to {@link #getStartKey},
- * and the last {@link ByteKey} in the result is guaranteed to be equal to {@link #getEndKey}.
- * Thus the resulting list exactly spans the same key range as this {@link ByteKeyRange}.
- *
- * <p>Note that the number of keys returned is not always equal to {@code numSplits + 1}.
- * Specifically, if this range is unsplittable (e.g., because the start and end keys are equal
- * up to padding by zero bytes), the list returned will only contain the start and end key.
- *
- * @throws IllegalArgumentException if the specified number of splits is < 1
- * @see ByteKeyRange the ByteKeyRange class Javadoc for more information about split semantics.
- */
- public List<ByteKey> split(int numSplits) {
- checkArgument(numSplits > 0, "numSplits %s must be a positive integer", numSplits);
-
- try {
- ImmutableList.Builder<ByteKey> ret = ImmutableList.builder();
- ret.add(startKey);
- for (int i = 1; i < numSplits; ++i) {
- ret.add(interpolateKey(i / (double) numSplits));
- }
- ret.add(endKey);
- return ret.build();
- } catch (IllegalStateException e) {
- // The range is not splittable -- just return
- return ImmutableList.of(startKey, endKey);
- }
- }
-
- /**
- * Returns the fraction of this range {@code [startKey, endKey)} that is in the interval
- * {@code [startKey, key)}.
- *
- * @throws IllegalArgumentException if {@code key} does not fall within this range
- * @see ByteKeyRange the ByteKeyRange class Javadoc for more information about fraction semantics.
- */
- public double estimateFractionForKey(ByteKey key) {
- checkNotNull(key, "key");
- checkArgument(!key.isEmpty(), "Cannot compute fraction for an empty key");
- checkArgument(
- key.compareTo(startKey) >= 0, "Expected key %s >= range start key %s", key, startKey);
-
- if (key.equals(endKey)) {
- return 1.0;
- }
- checkArgument(containsKey(key), "Cannot compute fraction for %s outside this %s", key, this);
-
- byte[] startBytes = startKey.getBytes();
- byte[] endBytes = endKey.getBytes();
- byte[] keyBytes = key.getBytes();
- // If the endKey is unspecified, add a leading 1 byte to it and a leading 0 byte to all other
- // keys, to get a concrete least upper bound for the desired range.
- if (endKey.isEmpty()) {
- startBytes = addHeadByte(startBytes, (byte) 0);
- endBytes = addHeadByte(endBytes, (byte) 1);
- keyBytes = addHeadByte(keyBytes, (byte) 0);
- }
-
- // Pad to the longest of all 3 keys.
- int paddedKeyLength = Math.max(Math.max(startBytes.length, endBytes.length), keyBytes.length);
- BigInteger rangeStartInt = paddedPositiveInt(startBytes, paddedKeyLength);
- BigInteger rangeEndInt = paddedPositiveInt(endBytes, paddedKeyLength);
- BigInteger keyInt = paddedPositiveInt(keyBytes, paddedKeyLength);
-
- // Keys are equal subject to padding by 0.
- BigInteger range = rangeEndInt.subtract(rangeStartInt);
- if (range.equals(BigInteger.ZERO)) {
- logger.warn(
- "Using 0.0 as the default fraction for this near-empty range {} where start and end keys"
- + " differ only by trailing zeros.",
- this);
- return 0.0;
- }
-
- // Compute the progress (key-start)/(end-start) scaling by 2^64, dividing (which rounds),
- // and then scaling down after the division. This gives ample precision when converted to
- // double.
- BigInteger progressScaled = keyInt.subtract(rangeStartInt).shiftLeft(64);
- return progressScaled.divide(range).doubleValue() / Math.pow(2, 64);
- }
-
- /**
- * Returns a {@link ByteKey} {@code key} such that {@code [startKey, key)} represents
- * approximately the specified fraction of the range {@code [startKey, endKey)}. The interpolation
- * is computed assuming a uniform distribution of keys.
- *
- * <p>For example, given the largest possible range (defined by empty start and end keys), the
- * fraction {@code 0.5} will return the {@code ByteKey.of(0x80)}, which will also be returned for
- * ranges {@code [0x40, 0xc0)} and {@code [0x6f, 0x91)}.
- *
- * <p>The key returned will never be empty.
- *
- * @throws IllegalArgumentException if {@code fraction} is outside the range [0, 1)
- * @throws IllegalStateException if this range cannot be interpolated
- * @see ByteKeyRange the ByteKeyRange class Javadoc for more information about fraction semantics.
- */
- public ByteKey interpolateKey(double fraction) {
- checkArgument(
- fraction >= 0.0 && fraction < 1.0, "Fraction %s must be in the range [0, 1)", fraction);
- byte[] startBytes = startKey.getBytes();
- byte[] endBytes = endKey.getBytes();
- // If the endKey is unspecified, add a leading 1 byte to it and a leading 0 byte to all other
- // keys, to get a concrete least upper bound for the desired range.
- if (endKey.isEmpty()) {
- startBytes = addHeadByte(startBytes, (byte) 0);
- endBytes = addHeadByte(endBytes, (byte) 1);
- }
-
- // Pad to the longest key.
- int paddedKeyLength = Math.max(startBytes.length, endBytes.length);
- BigInteger rangeStartInt = paddedPositiveInt(startBytes, paddedKeyLength);
- BigInteger rangeEndInt = paddedPositiveInt(endBytes, paddedKeyLength);
-
- // If the keys are equal subject to padding by 0, we can't interpolate.
- BigInteger range = rangeEndInt.subtract(rangeStartInt);
- checkState(
- !range.equals(BigInteger.ZERO),
- "Refusing to interpolate for near-empty %s where start and end keys differ only by trailing"
- + " zero bytes.",
- this);
-
- // Add precision so that range is at least 53 (double mantissa length) bits long. This way, we
- // can interpolate small ranges finely, e.g., split the range key 3 to key 4 into 1024 parts.
- // We add precision to range by adding zero bytes to the end of the keys, aka shifting the
- // underlying BigInteger left by a multiple of 8 bits.
- int bytesNeeded = ((53 - range.bitLength()) + 7) / 8;
- if (bytesNeeded > 0) {
- range = range.shiftLeft(bytesNeeded * 8);
- rangeStartInt = rangeStartInt.shiftLeft(bytesNeeded * 8);
- paddedKeyLength += bytesNeeded;
- }
-
- BigInteger interpolatedOffset =
- new BigDecimal(range).multiply(BigDecimal.valueOf(fraction)).toBigInteger();
-
- int outputKeyLength = endKey.isEmpty() ? (paddedKeyLength - 1) : paddedKeyLength;
- return ByteKey.copyFrom(
- fixupHeadZeros(rangeStartInt.add(interpolatedOffset).toByteArray(), outputKeyLength));
- }
-
- /**
- * Returns new {@link ByteKeyRange} like this one, but with the specified start key.
- */
- public ByteKeyRange withStartKey(ByteKey startKey) {
- return new ByteKeyRange(startKey, endKey);
- }
-
- /**
- * Returns new {@link ByteKeyRange} like this one, but with the specified end key.
- */
- public ByteKeyRange withEndKey(ByteKey endKey) {
- return new ByteKeyRange(startKey, endKey);
- }
-
- ////////////////////////////////////////////////////////////////////////////////////
- private final ByteKey startKey;
- private final ByteKey endKey;
-
- private ByteKeyRange(ByteKey startKey, ByteKey endKey) {
- this.startKey = checkNotNull(startKey, "startKey");
- this.endKey = checkNotNull(endKey, "endKey");
- checkArgument(endsAfterKey(startKey), "Start %s must be less than end %s", startKey, endKey);
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(ByteKeyRange.class)
- .add("startKey", startKey)
- .add("endKey", endKey)
- .toString();
- }
-
- @Override
- public boolean equals(Object o) {
- if (o == this) {
- return true;
- }
- if (!(o instanceof ByteKeyRange)) {
- return false;
- }
- ByteKeyRange other = (ByteKeyRange) o;
- return Objects.equals(startKey, other.startKey) && Objects.equals(endKey, other.endKey);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(startKey, endKey);
- }
-
- /**
- * Returns a copy of the specified array with the specified byte added at the front.
- */
- private static byte[] addHeadByte(byte[] array, byte b) {
- byte[] ret = new byte[array.length + 1];
- ret[0] = b;
- System.arraycopy(array, 0, ret, 1, array.length);
- return ret;
- }
-
- /**
- * Ensures the array is exactly {@code size} bytes long. Returns the input array if the condition
- * is met, otherwise either adds or removes zero bytes from the beginning of {@code array}.
- *
- * <p>The only removal supported is a single leading zero byte followed by a byte whose most
- * significant bit is set, which is the sign padding {@link BigInteger#toByteArray} emits for
- * positive values. Any other oversized input fails verification.
- *
- * @param array the big-endian bytes to resize
- * @param size the exact length of the array to return
- * @return {@code array} itself if it is already {@code size} bytes long, otherwise a new array
- */
- private static byte[] fixupHeadZeros(byte[] array, int size) {
- int padding = size - array.length;
- if (padding == 0) {
- return array;
- }
-
- if (padding < 0) {
- // There is one zero byte at the beginning, added by BigInteger to make there be a sign
- // bit when converting to bytes.
- // Guava's verify() substitutes only %s placeholders, so %s is used for every argument.
- verify(
- padding == -1,
- "key %s: expected length %s with exactly one byte of padding, found %s",
- ByteKey.copyFrom(array),
- size,
- -padding);
- verify(
- (array[0] == 0) && ((array[1] & 0x80) == 0x80),
- "key %s: is 1 byte longer than expected, indicating BigInteger padding. Expect first byte"
- + " to be zero with set MSB in second byte.",
- ByteKey.copyFrom(array));
- return Arrays.copyOfRange(array, 1, array.length);
- }
-
- // The array is too short: left-pad with zero bytes, keeping the original bytes at the end.
- byte[] ret = new byte[size];
- System.arraycopy(array, 0, ret, padding, array.length);
- return ret;
- }
-
- /**
- * Returns {@code true} when the specified {@code key} is smaller than this range's end key. The
- * only semantic change from {@code (key.compareTo(getEndKey()) < 0)} is that the empty end key
- * is treated as larger than all possible {@link ByteKey keys}.
- */
- boolean endsAfterKey(ByteKey key) {
- return endKey.isEmpty() || key.compareTo(endKey) < 0;
- }
-
- /**
- * Builds a non-negative {@link BigInteger} out of the specified big-endian array, padded on the
- * right with zero bytes to the desired byte length.
- *
- * @param bytes the big-endian magnitude; interpreted as unsigned
- * @param length the byte length to pad to; must be at least {@code bytes.length}
- * @return the value of {@code bytes} shifted left by 8 bits for every byte of padding needed
- * @throws IllegalArgumentException if {@code bytes} is longer than {@code length}
- */
- private static BigInteger paddedPositiveInt(byte[] bytes, int length) {
- int bytePaddingNeeded = length - bytes.length;
- // Guava's checkArgument() substitutes only %s placeholders (not SLF4J-style {}).
- checkArgument(
- bytePaddingNeeded >= 0, "Required bytes.length %s <= length %s", bytes.length, length);
- // Signum 1 forces a positive value regardless of the top bit of bytes[0].
- BigInteger ret = new BigInteger(1, bytes);
- return (bytePaddingNeeded == 0) ? ret : ret.shiftLeft(8 * bytePaddingNeeded);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/ByteKeyRangeTracker.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/ByteKeyRangeTracker.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/ByteKeyRangeTracker.java
deleted file mode 100644
index f6796cc..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/ByteKeyRangeTracker.java
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io.range;
-
-import static com.google.common.base.MoreObjects.toStringHelper;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import javax.annotation.Nullable;
-
-/**
- * A {@link RangeTracker} for {@link ByteKey ByteKeys} in {@link ByteKeyRange ByteKeyRanges}.
- *
- * @see ByteKey
- * @see ByteKeyRange
- */
-public final class ByteKeyRangeTracker implements RangeTracker<ByteKey> {
- private static final Logger logger = LoggerFactory.getLogger(ByteKeyRangeTracker.class);
-
- /** Instantiates a new {@link ByteKeyRangeTracker} with the specified range. */
- public static ByteKeyRangeTracker of(ByteKeyRange range) {
- return new ByteKeyRangeTracker(range);
- }
-
- @Override
- public synchronized ByteKey getStartPosition() {
- return range.getStartKey();
- }
-
- @Override
- public synchronized ByteKey getStopPosition() {
- return range.getEndKey();
- }
-
- @Override
- public synchronized boolean tryReturnRecordAt(boolean isAtSplitPoint, ByteKey recordStart) {
- if (isAtSplitPoint && !range.containsKey(recordStart)) {
- return false;
- }
- position = recordStart;
- return true;
- }
-
- @Override
- public synchronized boolean trySplitAtPosition(ByteKey splitPosition) {
- // Unstarted.
- if (position == null) {
- logger.warn(
- "{}: Rejecting split request at {} because no records have been returned.",
- this,
- splitPosition);
- return false;
- }
-
- // Started, but not after current position.
- if (splitPosition.compareTo(position) <= 0) {
- logger.warn(
- "{}: Rejecting split request at {} because it is not after current position {}.",
- this,
- splitPosition,
- position);
- return false;
- }
-
- // Sanity check.
- if (!range.containsKey(splitPosition)) {
- logger.warn(
- "{}: Rejecting split request at {} because it is not within the range.",
- this,
- splitPosition);
- return false;
- }
-
- range = range.withEndKey(splitPosition);
- return true;
- }
-
- @Override
- public synchronized double getFractionConsumed() {
- if (position == null) {
- return 0;
- }
- return range.estimateFractionForKey(position);
- }
-
- ///////////////////////////////////////////////////////////////////////////////
- private ByteKeyRange range;
- @Nullable private ByteKey position;
-
- private ByteKeyRangeTracker(ByteKeyRange range) {
- this.range = range;
- this.position = null;
- }
-
- @Override
- public String toString() {
- return toStringHelper(ByteKeyRangeTracker.class)
- .add("range", range)
- .add("position", position)
- .toString();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/OffsetRangeTracker.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/OffsetRangeTracker.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/OffsetRangeTracker.java
deleted file mode 100644
index b237217..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/OffsetRangeTracker.java
+++ /dev/null
@@ -1,182 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.io.range;
-
-import com.google.common.annotations.VisibleForTesting;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * A {@link RangeTracker} for non-negative positions of type {@code long}.
- */
-public class OffsetRangeTracker implements RangeTracker<Long> {
- private static final Logger LOG = LoggerFactory.getLogger(OffsetRangeTracker.class);
-
- private final long startOffset;
- private long stopOffset;
- private long lastRecordStart = -1L;
- private long offsetOfLastSplitPoint = -1L;
-
- /**
- * Offset corresponding to infinity. This can only be used as the upper-bound of a range, and
- * indicates reading all of the records until the end without specifying exactly what the end is.
- *
- * <p>Infinite ranges cannot be split because it is impossible to estimate progress within them.
- */
- public static final long OFFSET_INFINITY = Long.MAX_VALUE;
-
- /**
- * Creates an {@code OffsetRangeTracker} for the specified range.
- */
- public OffsetRangeTracker(long startOffset, long stopOffset) {
- this.startOffset = startOffset;
- this.stopOffset = stopOffset;
- }
-
- @Override
- public synchronized Long getStartPosition() {
- return startOffset;
- }
-
- @Override
- public synchronized Long getStopPosition() {
- return stopOffset;
- }
-
- @Override
- public boolean tryReturnRecordAt(boolean isAtSplitPoint, Long recordStart) {
- return tryReturnRecordAt(isAtSplitPoint, recordStart.longValue());
- }
-
- public synchronized boolean tryReturnRecordAt(boolean isAtSplitPoint, long recordStart) {
- if (lastRecordStart == -1 && !isAtSplitPoint) {
- throw new IllegalStateException(
- String.format("The first record [starting at %d] must be at a split point", recordStart));
- }
- if (recordStart < lastRecordStart) {
- throw new IllegalStateException(
- String.format(
- "Trying to return record [starting at %d] "
- + "which is before the last-returned record [starting at %d]",
- recordStart,
- lastRecordStart));
- }
- if (isAtSplitPoint) {
- if (offsetOfLastSplitPoint != -1L && recordStart == offsetOfLastSplitPoint) {
- throw new IllegalStateException(
- String.format(
- "Record at a split point has same offset as the previous split point: "
- + "previous split point at %d, current record starts at %d",
- offsetOfLastSplitPoint, recordStart));
- }
- if (recordStart >= stopOffset) {
- return false;
- }
- offsetOfLastSplitPoint = recordStart;
- }
-
- lastRecordStart = recordStart;
- return true;
- }
-
- @Override
- public boolean trySplitAtPosition(Long splitOffset) {
- return trySplitAtPosition(splitOffset.longValue());
- }
-
- public synchronized boolean trySplitAtPosition(long splitOffset) {
- if (stopOffset == OFFSET_INFINITY) {
- LOG.debug("Refusing to split {} at {}: stop position unspecified", this, splitOffset);
- return false;
- }
- if (lastRecordStart == -1) {
- LOG.debug("Refusing to split {} at {}: unstarted", this, splitOffset);
- return false;
- }
-
- // Note: technically it is correct to split at any position after the last returned
- // split point, not just the last returned record.
- // TODO: Investigate whether in practice this is useful or, rather, confusing.
- if (splitOffset <= lastRecordStart) {
- LOG.debug(
- "Refusing to split {} at {}: already past proposed split position", this, splitOffset);
- return false;
- }
- if (splitOffset < startOffset || splitOffset >= stopOffset) {
- LOG.debug(
- "Refusing to split {} at {}: proposed split position out of range", this, splitOffset);
- return false;
- }
- LOG.debug("Agreeing to split {} at {}", this, splitOffset);
- this.stopOffset = splitOffset;
- return true;
- }
-
- /**
- * Returns a position {@code P} such that the range {@code [start, P)} represents approximately
- * the given fraction of the range {@code [start, end)}. Assumes that the density of records
- * in the range is approximately uniform.
- */
- public synchronized long getPositionForFractionConsumed(double fraction) {
- if (stopOffset == OFFSET_INFINITY) {
- throw new IllegalArgumentException(
- "getPositionForFractionConsumed is not applicable to an unbounded range: " + this);
- }
- return (long) Math.ceil(startOffset + fraction * (stopOffset - startOffset));
- }
-
- @Override
- public synchronized double getFractionConsumed() {
- if (stopOffset == OFFSET_INFINITY) {
- return 0.0;
- }
- if (lastRecordStart == -1) {
- return 0.0;
- }
- // E.g., when reading [3, 6) and lastRecordStart is 4, that means we consumed 3,4 of 3,4,5
- // which is (4 - 3 + 1) / (6 - 3) = 67%.
- // Also, clamp to at most 1.0 because the last consumed position can extend past the
- // stop position.
- return Math.min(1.0, 1.0 * (lastRecordStart - startOffset + 1) / (stopOffset - startOffset));
- }
-
- @Override
- public synchronized String toString() {
- String stopString = (stopOffset == OFFSET_INFINITY) ? "infinity" : String.valueOf(stopOffset);
- if (lastRecordStart >= 0) {
- return String.format(
- "<at [starting at %d] of offset range [%d, %s)>",
- lastRecordStart,
- startOffset,
- stopString);
- } else {
- return String.format("<unstarted in offset range [%d, %s)>", startOffset, stopString);
- }
- }
-
- /**
- * Returns a copy of this tracker for testing purposes (to simplify testing methods with
- * side effects).
- *
- * <p>The copy carries over the range bounds and all progress state, including the offset of the
- * last split point, so that it accepts and rejects the same records as this tracker would.
- * The method is synchronized so the state is read under the same lock the mutating methods use.
- */
- @VisibleForTesting
- synchronized OffsetRangeTracker copy() {
- OffsetRangeTracker res = new OffsetRangeTracker(startOffset, stopOffset);
- res.lastRecordStart = this.lastRecordStart;
- res.offsetOfLastSplitPoint = this.offsetOfLastSplitPoint;
- return res;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/RangeTracker.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/RangeTracker.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/RangeTracker.java
deleted file mode 100644
index 84359f1..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/RangeTracker.java
+++ /dev/null
@@ -1,220 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.io.range;
-
-/**
- * A {@code RangeTracker} is a thread-safe helper object for implementing dynamic work rebalancing
- * in position-based {@link com.google.cloud.dataflow.sdk.io.BoundedSource.BoundedReader}
- * subclasses.
- *
- * <h3>Usage of the RangeTracker class hierarchy</h3>
- * The abstract {@code RangeTracker} interface should not be used per se - all users should use its
- * subclasses directly. We declare it here because all subclasses have roughly the same interface
- * and the same properties, to centralize the documentation. Currently we provide one
- * implementation - {@link OffsetRangeTracker}.
- *
- * <h3>Position-based sources</h3>
- * A position-based source is one where the source can be described by a range of positions of
- * an ordered type and the records returned by the reader can be described by positions of the
- * same type.
- *
- * <p>In case a record occupies a range of positions in the source, the most important thing about
- * the record is the position where it starts.
- *
- * <p>Defining the semantics of positions for a source is entirely up to the source class, however
- * the chosen definitions have to obey certain properties in order to make it possible to correctly
- * split the source into parts, including dynamic splitting. Two main aspects need to be defined:
- * <ul>
- * <li>How to assign starting positions to records.
- * <li>Which records should be read by a source with a range {@code [A, B)}.
- * </ul>
- * Moreover, reading a range must be <i>efficient</i>, i.e., the performance of reading a range
- * should not significantly depend on the location of the range. For example, reading the range
- * {@code [A, B)} should not require reading all data before {@code A}.
- *
- * <p>The sections below explain exactly what properties these definitions must satisfy, and
- * how to use a {@code RangeTracker} with a properly defined source.
- *
- * <h3>Properties of position-based sources</h3>
- * The main requirement for position-based sources is <i>associativity</i>: reading records from
- * {@code [A, B)} and records from {@code [B, C)} should give the same records as reading from
- * {@code [A, C)}, where {@code A <= B <= C}. This property ensures that no matter how a range
- * of positions is split into arbitrarily many sub-ranges, the total set of records described by
- * them stays the same.
- *
- * <p>The other important property is how the source's range relates to positions of records in
- * the source. In many sources each record can be identified by a unique starting position.
- * In this case:
- * <ul>
- * <li>All records returned by a source {@code [A, B)} must have starting positions
- * in this range.
- * <li>All but the last record should end within this range. The last record may or may not
- * extend past the end of the range.
- * <li>Records should not overlap.
- * </ul>
- * Such sources should define "read {@code [A, B)}" as "read from the first record starting at or
- * after A, up to but not including the first record starting at or after B".
- *
- * <p>Some examples of such sources include reading lines or CSV from a text file, reading keys and
- * values from a BigTable, etc.
- *
- * <p>The concept of <i>split points</i> allows us to extend these definitions to sources
- * where some records cannot be identified by a unique starting position.
- *
- * <p>In all cases, all records returned by a source {@code [A, B)} must <i>start</i> at or after
- * {@code A}.
- *
- * <h3>Split points</h3>
- *
- * <p>Some sources may have records that are not directly addressable. For example, imagine a file
- * format consisting of a sequence of compressed blocks. Each block can be assigned an offset, but
- * records within the block cannot be directly addressed without decompressing the block. Let us
- * refer to this hypothetical format as <i>CBF (Compressed Blocks Format)</i>.
- *
- * <p>Many such formats can still satisfy the associativity property. For example, in CBF, reading
- * {@code [A, B)} can mean "read all the records in all blocks whose starting offset is in
- * {@code [A, B)}".
- *
- * <p>To support such complex formats, we introduce the notion of <i>split points</i>. We say that
- * a record is a split point if there exists a position {@code A} such that the record is the first
- * one to be returned when reading the range {@code [A, infinity)}. In CBF, the only split points
- * would be the first records in each block.
- *
- * <p>Split points allow us to define the meaning of a record's position and a source's range
- * in all cases:
- * <ul>
- * <li>For a record that is at a split point, its position is defined to be the largest
- * {@code A} such that reading a source with the range {@code [A, infinity)} returns this record;
- * <li>Positions of other records are only required to be non-decreasing;
- * <li>Reading the source {@code [A, B)} must return records starting from the first split point
- * at or after {@code A}, up to but not including the first split point at or after {@code B}.
- * In particular, this means that the first record returned by a source MUST always be
- * a split point.
- * <li>Positions of split points must be unique.
- * </ul>
- * As a result, for any decomposition of the full range of the source into position ranges, the
- * total set of records will be the full set of records in the source, and each record
- * will be read exactly once.
- *
- * <h3>Consumed positions</h3>
- * As the source is being read, and records read from it are being passed to the downstream
- * transforms in the pipeline, we say that positions in the source are being <i>consumed</i>.
- * When a reader has read a record (or promised to a caller that a record will be returned),
- * positions up to and including the record's start position are considered <i>consumed</i>.
- *
- * <p>Dynamic splitting can happen only at <i>unconsumed</i> positions. If the reader just
- * returned a record at offset 42 in a file, dynamic splitting can happen only at offset 43 or
- * beyond, as otherwise that record could be read twice (by the current reader and by a reader
- * of the task starting at 43).
- *
- * <h3>Example</h3>
- * The following example uses an {@link OffsetRangeTracker} to support dynamically splitting
- * a source with integer positions (offsets).
- * <pre> {@code
- * class MyReader implements BoundedReader<Foo> {
- * private MySource currentSource;
- * private final OffsetRangeTracker tracker;
- * ...
- * MyReader(MySource source) {
- * this.currentSource = source;
- * this.tracker = new OffsetRangeTracker(source.getStartOffset(), source.getEndOffset());
- * }
- * ...
- * boolean start() {
- * ... (general logic for locating the first record) ...
- * if (!tracker.tryReturnRecordAt(true, recordStartOffset)) return false;
- * ... (any logic that depends on the record being returned, e.g. counting returned records)
- * return true;
- * }
- * boolean advance() {
- * ... (general logic for locating the next record) ...
- * if (!tracker.tryReturnRecordAt(isAtSplitPoint, recordStartOffset)) return false;
- * ... (any logic that depends on the record being returned, e.g. counting returned records)
- * return true;
- * }
- *
- * double getFractionConsumed() {
- * return tracker.getFractionConsumed();
- * }
- * }
- * } </pre>
- *
- * <h3>Usage with different models of iteration</h3>
- * When using this class to protect a
- * {@link com.google.cloud.dataflow.sdk.io.BoundedSource.BoundedReader}, follow the pattern
- * described above.
- *
- * <p>When using this class to protect iteration in the {@code hasNext()/next()}
- * model, consider the record consumed when {@code hasNext()} is about to return true, rather than
- * when {@code next()} is called, because {@code hasNext()} returning true is promising the caller
- * that {@code next()} will have an element to return - so {@link #trySplitAtPosition} must not
- * split the range in a way that would make the record promised by {@code hasNext()} belong to
- * a different range.
- *
- * <p>Also note that implementations of {@code hasNext()} need to ensure
- * that they call {@link #tryReturnRecordAt} only once even if {@code hasNext()} is called
- * repeatedly, due to the requirement on uniqueness of split point positions.
- *
- * @param <PositionT> Type of positions used by the source to define ranges and identify records.
- */
-public interface RangeTracker<PositionT> {
- /**
- * Returns the starting position of the current range, inclusive.
- */
- PositionT getStartPosition();
-
- /**
- * Returns the ending position of the current range, exclusive.
- *
- * <p>A successful {@link #trySplitAtPosition} call shrinks the current range to the primary
- * part, so the value returned here can change over the lifetime of the tracker.
- */
- PositionT getStopPosition();
-
- /**
- * Atomically determines whether a record at the given position can be returned and updates
- * internal state. In particular:
- * <ul>
- * <li>If {@code isAtSplitPoint} is {@code true}, and {@code recordStart} is outside the current
- * range, returns {@code false};
- * <li>Otherwise, updates the last-consumed position to {@code recordStart} and returns
- * {@code true}.
- * </ul>
- * <p>This method MUST be called on all split point records. It may be called on every record.
- *
- * @param isAtSplitPoint whether the record is a split point
- * @param recordStart the starting position of the record
- * @return {@code true} if the record may be returned (the last-consumed position has been
- * updated), {@code false} if it is a split point outside the current range
- */
- boolean tryReturnRecordAt(boolean isAtSplitPoint, PositionT recordStart);
-
- /**
- * Atomically splits the current range [{@link #getStartPosition}, {@link #getStopPosition})
- * into a "primary" part [{@link #getStartPosition}, {@code splitPosition})
- * and a "residual" part [{@code splitPosition}, {@link #getStopPosition}), assuming the current
- * last-consumed position is within [{@link #getStartPosition}, splitPosition)
- * (i.e., {@code splitPosition} has not been consumed yet).
- *
- * <p>Updates the current range to be the primary and returns {@code true}. This means that
- * all further calls on the current object will interpret their arguments relative to the
- * primary range.
- *
- * <p>If the split position has already been consumed, or if no {@link #tryReturnRecordAt} call
- * was made yet, returns {@code false}. The second condition is to prevent dynamic splitting
- * during reader start-up.
- *
- * @param splitPosition the position at which to split; on success it becomes the exclusive
- * end of the current (primary) range
- * @return {@code true} if the range was split, {@code false} if the split was refused
- */
- boolean trySplitAtPosition(PositionT splitPosition);
-
- /**
- * Returns the approximate fraction of positions in the source that have been consumed by
- * successful {@link #tryReturnRecordAt} calls, or 0.0 if no such calls have happened.
- */
- double getFractionConsumed();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/package-info.java
deleted file mode 100644
index beb77bf..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/range/package-info.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/**
- * Provides thread-safe helpers for implementing dynamic work rebalancing in position-based
- * bounded sources.
- *
- * <p>See {@link com.google.cloud.dataflow.sdk.io.range.RangeTracker} to get started.
- */
-package com.google.cloud.dataflow.sdk.io.range;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/ApplicationNameOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/ApplicationNameOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/ApplicationNameOptions.java
deleted file mode 100644
index 60d62d3..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/ApplicationNameOptions.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-/**
- * Options that allow setting the application name.
- *
- * <p>The name is read with {@link #getAppName()} and changed with {@link #setAppName(String)}.
- */
-public interface ApplicationNameOptions extends PipelineOptions {
- /**
- * Name of application, for display purposes.
- *
- * <p>Defaults to the name of the class that constructs the {@link PipelineOptions}
- * via the {@link PipelineOptionsFactory}.
- */
- @Description("Name of application for display purposes. Defaults to the name of the class that "
- + "constructs the PipelineOptions via the PipelineOptionsFactory.")
- String getAppName();
- void setAppName(String value);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/BigQueryOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/BigQueryOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/BigQueryOptions.java
deleted file mode 100644
index ed4eb24..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/BigQueryOptions.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-/**
- * Properties needed when using BigQuery with the Dataflow SDK.
- *
- * <p>The only option declared here is the temporary dataset used for BigQuery table
- * operations ({@link #getTempDatasetId()}), which is annotated with a default of
- * {@code "bigquery.googleapis.com/cloud_dataflow"}.
- */
-@Description("Options that are used to configure BigQuery. See "
- + "https://cloud.google.com/bigquery/what-is-bigquery for details on BigQuery.")
-public interface BigQueryOptions extends ApplicationNameOptions, GcpOptions,
- PipelineOptions, StreamingOptions {
- @Description("Temporary dataset for BigQuery table operations. "
- + "Supported values are \"bigquery.googleapis.com/{dataset}\"")
- @Default.String("bigquery.googleapis.com/cloud_dataflow")
- String getTempDatasetId();
- void setTempDatasetId(String value);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/BlockingDataflowPipelineOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/BlockingDataflowPipelineOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/BlockingDataflowPipelineOptions.java
deleted file mode 100644
index 43a46b0..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/BlockingDataflowPipelineOptions.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import com.google.cloud.dataflow.sdk.runners.BlockingDataflowPipelineRunner;
-
-import com.fasterxml.jackson.annotation.JsonIgnore;
-
-import java.io.PrintStream;
-
-/**
- * Options that are used to configure the {@link BlockingDataflowPipelineRunner}.
- */
-@Description("Configure options on the BlockingDataflowPipelineRunner.")
-public interface BlockingDataflowPipelineOptions extends DataflowPipelineOptions {
- /**
- * Output stream for job status messages.
- *
- * <p>The default value is produced by {@link StandardOutputFactory}, i.e. it is
- * {@link System#out}.
- */
- @Description("Where messages generated during execution of the Dataflow job will be output.")
- @JsonIgnore
- @Hidden
- @Default.InstanceFactory(StandardOutputFactory.class)
- PrintStream getJobMessageOutput();
- void setJobMessageOutput(PrintStream value);
-
- /**
- * Default value factory that always returns {@link System#out}; the supplied options are
- * not inspected.
- */
- public static class StandardOutputFactory implements DefaultValueFactory<PrintStream> {
- @Override
- public PrintStream create(PipelineOptions options) {
- return System.out;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/CloudDebuggerOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/CloudDebuggerOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/CloudDebuggerOptions.java
deleted file mode 100644
index 2e1ad94..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/CloudDebuggerOptions.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import com.google.api.services.clouddebugger.v2.model.Debuggee;
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-
-import javax.annotation.Nullable;
-
-/**
- * Options for controlling Cloud Debugger.
- *
- * <p>Besides the enable flag, this interface carries the {@link Debuggee} to associate with
- * ({@link #getDebuggee()}); that value is {@code @Nullable}, hidden, and per its description
- * should not be set directly.
- */
-@Description("[Experimental] Used to configure the Cloud Debugger")
-@Experimental
-@Hidden
-public interface CloudDebuggerOptions {
-
- /**
- * Whether to enable the Cloud Debugger snapshot agent for the current job.
- */
- @Description("Whether to enable the Cloud Debugger snapshot agent for the current job.")
- boolean getEnableCloudDebugger();
- void setEnableCloudDebugger(boolean enabled);
-
- @Description("The Cloud Debugger debugee to associate with. This should not be set directly.")
- @Hidden
- @Nullable Debuggee getDebuggee();
- void setDebuggee(Debuggee debuggee);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowPipelineDebugOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowPipelineDebugOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowPipelineDebugOptions.java
deleted file mode 100644
index cadc011..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowPipelineDebugOptions.java
+++ /dev/null
@@ -1,259 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import com.google.api.services.dataflow.Dataflow;
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.util.DataflowPathValidator;
-import com.google.cloud.dataflow.sdk.util.GcsStager;
-import com.google.cloud.dataflow.sdk.util.InstanceBuilder;
-import com.google.cloud.dataflow.sdk.util.PathValidator;
-import com.google.cloud.dataflow.sdk.util.Stager;
-import com.google.cloud.dataflow.sdk.util.Transport;
-
-import com.fasterxml.jackson.annotation.JsonIgnore;
-
-import java.util.List;
-import java.util.Map;
-
-/**
- * Internal. Options used to control execution of the Dataflow SDK for
- * debugging and testing purposes.
- */
-@Description("[Internal] Options used to control execution of the Dataflow SDK for "
-    + "debugging and testing purposes.")
-@Hidden
-public interface DataflowPipelineDebugOptions extends PipelineOptions {
-
-  /**
-   * The list of backend experiments to enable.
-   *
-   * <p>Dataflow provides a number of experimental features that can be enabled
-   * with this flag.
-   *
-   * <p>Please sync with the Dataflow team before enabling any experiments.
-   */
-  @Description("[Experimental] Dataflow provides a number of experimental features that can "
-      + "be enabled with this flag. Please sync with the Dataflow team before enabling any "
-      + "experiments.")
-  @Experimental
-  List<String> getExperiments();
-  void setExperiments(List<String> value);
-
-  /**
-   * The root URL for the Dataflow API. {@code dataflowEndpoint} can override this value
-   * if it contains an absolute URL, otherwise {@code apiRootUrl} will be combined with
-   * {@code dataflowEndpoint} to generate the full URL to communicate with the Dataflow API.
-   */
-  @Description("The root URL for the Dataflow API. dataflowEndpoint can override this "
-      + "value if it contains an absolute URL, otherwise apiRootUrl will be combined with "
-      + "dataflowEndpoint to generate the full URL to communicate with the Dataflow API.")
-  @Default.String(Dataflow.DEFAULT_ROOT_URL)
-  String getApiRootUrl();
-  void setApiRootUrl(String value);
-
-  /**
-   * Dataflow endpoint to use.
-   *
-   * <p>Defaults to the current version of the Google Cloud Dataflow
-   * API, at the time the current SDK version was released.
-   *
-   * <p>If the string contains "://", then this is treated as a URL,
-   * otherwise {@link #getApiRootUrl()} is used as the root
-   * URL.
-   */
-  @Description("The URL for the Dataflow API. If the string contains \"://\", this"
-      + " will be treated as the entire URL, otherwise will be treated relative to apiRootUrl.")
-  @Default.String(Dataflow.DEFAULT_SERVICE_PATH)
-  String getDataflowEndpoint();
-  void setDataflowEndpoint(String value);
-
-  /**
-   * The path to write the translated Dataflow job specification out to
-   * at job submission time. The Dataflow job specification will be represented in JSON
-   * format.
-   */
-  @Description("The path to write the translated Dataflow job specification out to "
-      + "at job submission time. The Dataflow job specification will be represented in JSON "
-      + "format.")
-  String getDataflowJobFile();
-  void setDataflowJobFile(String value);
-
-  /**
-   * The class of the validator that should be created and used to validate paths.
-   * If pathValidator has not been set explicitly, an instance of this class will be
-   * constructed and used as the path validator.
-   */
-  @Description("The class of the validator that should be created and used to validate paths. "
-      + "If pathValidator has not been set explicitly, an instance of this class will be "
-      + "constructed and used as the path validator.")
-  @Default.Class(DataflowPathValidator.class)
-  Class<? extends PathValidator> getPathValidatorClass();
-  void setPathValidatorClass(Class<? extends PathValidator> validatorClass);
-
-  /**
-   * The path validator instance that should be used to validate paths.
-   * If no path validator has been set explicitly, the default is to use the instance factory that
-   * constructs a path validator based upon the currently set pathValidatorClass.
-   */
-  @JsonIgnore
-  @Description("The path validator instance that should be used to validate paths. "
-      + "If no path validator has been set explicitly, the default is to use the instance factory "
-      + "that constructs a path validator based upon the currently set pathValidatorClass.")
-  @Default.InstanceFactory(PathValidatorFactory.class)
-  PathValidator getPathValidator();
-  void setPathValidator(PathValidator validator);
-
-  /**
-   * The class responsible for staging resources to be accessible by workers
-   * during job execution. If stager has not been set explicitly, an instance of this class
-   * will be created and used as the resource stager.
-   */
-  @Description("The class of the stager that should be created and used to stage resources. "
-      + "If stager has not been set explicitly, an instance of this class will be created "
-      + "and used as the resource stager.")
-  @Default.Class(GcsStager.class)
-  Class<? extends Stager> getStagerClass();
-  void setStagerClass(Class<? extends Stager> stagerClass);
-
-  /**
-   * The resource stager instance that should be used to stage resources.
-   * If no stager has been set explicitly, the default is to use the instance factory
-   * that constructs a resource stager based upon the currently set stagerClass.
-   */
-  @JsonIgnore
-  @Description("The resource stager instance that should be used to stage resources. "
-      + "If no stager has been set explicitly, the default is to use the instance factory "
-      + "that constructs a resource stager based upon the currently set stagerClass.")
-  @Default.InstanceFactory(StagerFactory.class)
-  Stager getStager();
-  void setStager(Stager stager);
-
-  /**
-   * An instance of the Dataflow client. Defaults to creating a Dataflow client
-   * using the current set of options.
-   */
-  @JsonIgnore
-  @Description("An instance of the Dataflow client. Defaults to creating a Dataflow client "
-      + "using the current set of options.")
-  @Default.InstanceFactory(DataflowClientFactory.class)
-  Dataflow getDataflowClient();
-  void setDataflowClient(Dataflow value);
-
-  /** Returns the default Dataflow client built from the passed in PipelineOptions. */
-  public static class DataflowClientFactory implements DefaultValueFactory<Dataflow> {
-    @Override
-    public Dataflow create(PipelineOptions options) {
-      return Transport.newDataflowClient(options.as(DataflowPipelineOptions.class)).build();
-    }
-  }
-
-  /**
-   * Root URL for use with the Pubsub API.
-   */
-  @Description("Root URL for use with the Pubsub API")
-  @Default.String("https://pubsub.googleapis.com")
-  String getPubsubRootUrl();
-  void setPubsubRootUrl(String value);
-
-  /**
-   * Whether to update the currently running pipeline with the same name as this one.
-   *
-   * @deprecated This property is replaced by {@link DataflowPipelineOptions#getUpdate()}
-   */
-  @Deprecated
-  @Description("If set, replace the existing pipeline with the name specified by --jobName with "
-      + "this pipeline, preserving state.")
-  boolean getUpdate();
-  @Deprecated
-  void setUpdate(boolean value);
-
-  /**
-   * Mapping of old PTransform names to new ones, specified as JSON
-   * <code>{"oldName":"newName",...}</code>. To mark a transform as deleted, make newName the
-   * empty string.
-   */
-  @JsonIgnore
-  @Description(
-      "Mapping of old PTransform names to new ones, specified as JSON "
-      + "{\"oldName\":\"newName\",...}. To mark a transform as deleted, make newName the empty "
-      + "string.")
-  Map<String, String> getTransformNameMapping();
-  void setTransformNameMapping(Map<String, String> value);
-
-  /**
-   * Custom windmill_main binary to use with the streaming runner.
-   */
-  @Description("Custom windmill_main binary to use with the streaming runner")
-  String getOverrideWindmillBinary();
-  void setOverrideWindmillBinary(String value);
-
-  /**
-   * Number of threads to use on the Dataflow worker harness. If left unspecified,
-   * the Dataflow service will compute an appropriate number of threads to use.
-   */
-  @Description("Number of threads to use on the Dataflow worker harness. If left unspecified, "
-      + "the Dataflow service will compute an appropriate number of threads to use.")
-  int getNumberOfWorkerHarnessThreads();
-  void setNumberOfWorkerHarnessThreads(int value);
-
-  /**
-   * If {@code true}, save a heap dump before killing a thread or process which is GC
-   * thrashing or out of memory. The location of the heap file will either be echoed back
-   * to the user, or the user will be given the opportunity to download the heap file.
-   *
-   * <p>CAUTION: Heap dumps can be of comparable size to the default boot disk. Consider
-   * increasing the boot disk size before setting this flag to true.
-   */
-  // The description is an ordinary string literal: Javadoc inline tags such as {@literal} are
-  // not expanded inside it and would be shown verbatim, so it uses plain text.
-  @Description("If true, save a heap dump before killing a thread or process "
-      + "which is GC thrashing or out of memory.")
-  boolean getDumpHeapOnOOM();
-  void setDumpHeapOnOOM(boolean dumpHeapBeforeExit);
-
-  /**
-   * Creates a {@link PathValidator} object using the class specified in
-   * {@link #getPathValidatorClass()}.
-   *
-   * <p>The instance is obtained through that class's {@code fromOptions} factory method, which
-   * is passed the {@link PipelineOptions} given to {@link #create}.
-   */
-  public static class PathValidatorFactory implements DefaultValueFactory<PathValidator> {
-    @Override
-    public PathValidator create(PipelineOptions options) {
-      DataflowPipelineDebugOptions debugOptions = options.as(DataflowPipelineDebugOptions.class);
-      return InstanceBuilder.ofType(PathValidator.class)
-          .fromClass(debugOptions.getPathValidatorClass())
-          .fromFactoryMethod("fromOptions")
-          .withArg(PipelineOptions.class, options)
-          .build();
-    }
-  }
-
-  /**
-   * Creates a {@link Stager} object using the class specified in
-   * {@link #getStagerClass()}.
-   *
-   * <p>The instance is obtained through that class's {@code fromOptions} factory method, which
-   * is passed the {@link PipelineOptions} given to {@link #create}.
-   */
-  public static class StagerFactory implements DefaultValueFactory<Stager> {
-    @Override
-    public Stager create(PipelineOptions options) {
-      DataflowPipelineDebugOptions debugOptions = options.as(DataflowPipelineDebugOptions.class);
-      return InstanceBuilder.ofType(Stager.class)
-          .fromClass(debugOptions.getStagerClass())
-          .fromFactoryMethod("fromOptions")
-          .withArg(PipelineOptions.class, options)
-          .build();
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowPipelineOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowPipelineOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowPipelineOptions.java
deleted file mode 100644
index 1aa4342..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowPipelineOptions.java
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import com.google.cloud.dataflow.sdk.runners.DataflowPipeline;
-import com.google.common.base.MoreObjects;
-
-import org.joda.time.DateTimeUtils;
-import org.joda.time.DateTimeZone;
-import org.joda.time.format.DateTimeFormat;
-import org.joda.time.format.DateTimeFormatter;
-
-/**
- * Options that can be used to configure the {@link DataflowPipeline}.
- */
-@Description("Options that configure the Dataflow pipeline.")
-public interface DataflowPipelineOptions extends
-    PipelineOptions, GcpOptions, ApplicationNameOptions, DataflowPipelineDebugOptions,
-    DataflowPipelineWorkerPoolOptions, BigQueryOptions,
-    GcsOptions, StreamingOptions, CloudDebuggerOptions, DataflowWorkerLoggingOptions,
-    DataflowProfilingOptions {
-
-  /** Name of the validation group shared by the temp and staging location options. */
-  static final String DATAFLOW_STORAGE_LOCATION = "Dataflow Storage Location";
-
-  @Description("Project id. Required when running a Dataflow in the cloud. "
-      + "See https://cloud.google.com/storage/docs/projects for further details.")
-  @Override
-  @Validation.Required
-  @Default.InstanceFactory(DefaultProjectFactory.class)
-  String getProject();
-  @Override
-  void setProject(String value);
-
-  /**
-   * GCS path for temporary files, e.g. gs://bucket/object
-   *
-   * <p>Must be a valid Cloud Storage URL, beginning with the prefix "gs://"
-   *
-   * <p>At least one of {@link #getTempLocation()} or {@link #getStagingLocation()} must be set. If
-   * {@link #getTempLocation()} is not set, then the Dataflow pipeline defaults to using
-   * {@link #getStagingLocation()}.
-   */
-  @Description("GCS path for temporary files, eg \"gs://bucket/object\". "
-      + "Must be a valid Cloud Storage URL, beginning with the prefix \"gs://\". "
-      + "At least one of tempLocation or stagingLocation must be set. If tempLocation is unset, "
-      + "defaults to using stagingLocation.")
-  @Validation.Required(groups = {DATAFLOW_STORAGE_LOCATION})
-  String getTempLocation();
-  void setTempLocation(String value);
-
-  /**
-   * GCS path for staging local files, e.g. gs://bucket/object
-   *
-   * <p>Must be a valid Cloud Storage URL, beginning with the prefix "gs://"
-   *
-   * <p>At least one of {@link #getTempLocation()} or {@link #getStagingLocation()} must be set. If
-   * {@link #getStagingLocation()} is not set, then the Dataflow pipeline derives it from
-   * {@link #getTempLocation()}.
-   */
-  @Description("GCS path for staging local files, e.g. \"gs://bucket/object\". "
-      + "Must be a valid Cloud Storage URL, beginning with the prefix \"gs://\". "
-      + "At least one of stagingLocation or tempLocation must be set. If stagingLocation is unset, "
-      + "defaults to using tempLocation.")
-  @Validation.Required(groups = {DATAFLOW_STORAGE_LOCATION})
-  String getStagingLocation();
-  void setStagingLocation(String value);
-
-  /**
-   * The Dataflow job name is used as an idempotence key within the Dataflow service.
-   * If there is an existing job that is currently active, another active job with the same
-   * name will not be able to be created. Defaults to using the ApplicationName-UserName-Date.
-   */
-  @Description("The Dataflow job name is used as an idempotence key within the Dataflow service. "
-      + "If there is an existing job that is currently active, another active job with the same "
-      + "name will not be able to be created. Defaults to using the ApplicationName-UserName-Date.")
-  @Default.InstanceFactory(JobNameFactory.class)
-  String getJobName();
-  void setJobName(String value);
-
-  /**
-   * Whether to update the currently running pipeline with the same name as this one.
-   */
-  @Override
-  @SuppressWarnings("deprecation") // base class member deprecated in favor of this one.
-  @Description(
-      "If set, replace the existing pipeline with the name specified by --jobName with "
-          + "this pipeline, preserving state.")
-  boolean getUpdate();
-  @Override
-  @SuppressWarnings("deprecation") // base class member deprecated in favor of this one.
-  void setUpdate(boolean value);
-
-  /**
-   * Returns a normalized job name constructed from {@link ApplicationNameOptions#getAppName()}, the
-   * local system user name (if available), and the current UTC time formatted as
-   * {@code MMddHHmmss}, joined with dashes.
-   *
-   * <p>The application name defaults to {@code "dataflow"} when it is null or empty. Both the
-   * application name and the user name are lower-cased (independently of the JVM default locale)
-   * and every character outside {@code [a-z0-9]} is replaced by {@code '0'}; a leading non-letter
-   * in the application name is replaced by {@code 'a'}. The result therefore matches the pattern
-   * {@code [a-z]([-a-z0-9]*[a-z0-9])?}.
-   *
-   * <p>NOTE: this factory does not truncate its result, so long application or user names can
-   * produce a job name that exceeds any length limit imposed on job names.
-   *
-   * <p>This job name factory is only able to generate one unique name per second per application
-   * and user combination.
-   */
-  public static class JobNameFactory implements DefaultValueFactory<String> {
-    private static final DateTimeFormatter FORMATTER =
-        DateTimeFormat.forPattern("MMddHHmmss").withZone(DateTimeZone.UTC);
-
-    @Override
-    public String create(PipelineOptions options) {
-      String appName = options.as(ApplicationNameOptions.class).getAppName();
-      // Lower-case with Locale.ROOT: with the default locale the mapping is locale-sensitive
-      // (e.g. Turkish maps 'I' to a dotless i), which would change the generated name.
-      String normalizedAppName = appName == null || appName.isEmpty()
-          ? "dataflow"
-          : appName.toLowerCase(java.util.Locale.ROOT)
-              .replaceAll("[^a-z0-9]", "0")
-              .replaceAll("^[^a-z]", "a");
-      // The user name may be unavailable; fall back to an empty middle component.
-      String userName = MoreObjects.firstNonNull(System.getProperty("user.name"), "");
-      String normalizedUserName = userName.toLowerCase(java.util.Locale.ROOT)
-          .replaceAll("[^a-z0-9]", "0");
-      String datePart = FORMATTER.print(DateTimeUtils.currentTimeMillis());
-      return normalizedAppName + "-" + normalizedUserName + "-" + datePart;
-    }
-  }
-}
[35/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineRunner.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineRunner.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineRunner.java
deleted file mode 100644
index cd0ebc6..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineRunner.java
+++ /dev/null
@@ -1,3003 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.runners;
-
-import static com.google.cloud.dataflow.sdk.util.StringUtils.approximatePTransformName;
-import static com.google.cloud.dataflow.sdk.util.StringUtils.approximateSimpleName;
-import static com.google.cloud.dataflow.sdk.util.WindowedValue.valueInEmptyWindows;
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkState;
-
-import com.google.api.client.googleapis.json.GoogleJsonResponseException;
-import com.google.api.services.clouddebugger.v2.Clouddebugger;
-import com.google.api.services.clouddebugger.v2.model.Debuggee;
-import com.google.api.services.clouddebugger.v2.model.RegisterDebuggeeRequest;
-import com.google.api.services.clouddebugger.v2.model.RegisterDebuggeeResponse;
-import com.google.api.services.dataflow.Dataflow;
-import com.google.api.services.dataflow.model.DataflowPackage;
-import com.google.api.services.dataflow.model.Job;
-import com.google.api.services.dataflow.model.ListJobsResponse;
-import com.google.api.services.dataflow.model.WorkerPool;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.Pipeline.PipelineVisitor;
-import com.google.cloud.dataflow.sdk.PipelineResult.State;
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.coders.AvroCoder;
-import com.google.cloud.dataflow.sdk.coders.BigEndianLongCoder;
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.Coder.NonDeterministicException;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.coders.IterableCoder;
-import com.google.cloud.dataflow.sdk.coders.KvCoder;
-import com.google.cloud.dataflow.sdk.coders.ListCoder;
-import com.google.cloud.dataflow.sdk.coders.MapCoder;
-import com.google.cloud.dataflow.sdk.coders.SerializableCoder;
-import com.google.cloud.dataflow.sdk.coders.StandardCoder;
-import com.google.cloud.dataflow.sdk.coders.VarIntCoder;
-import com.google.cloud.dataflow.sdk.coders.VarLongCoder;
-import com.google.cloud.dataflow.sdk.io.AvroIO;
-import com.google.cloud.dataflow.sdk.io.BigQueryIO;
-import com.google.cloud.dataflow.sdk.io.FileBasedSink;
-import com.google.cloud.dataflow.sdk.io.PubsubIO;
-import com.google.cloud.dataflow.sdk.io.Read;
-import com.google.cloud.dataflow.sdk.io.ShardNameTemplate;
-import com.google.cloud.dataflow.sdk.io.TextIO;
-import com.google.cloud.dataflow.sdk.io.UnboundedSource;
-import com.google.cloud.dataflow.sdk.io.Write;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineDebugOptions;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineWorkerPoolOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsValidator;
-import com.google.cloud.dataflow.sdk.options.StreamingOptions;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator.JobSpecification;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator.TransformTranslator;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator.TranslationContext;
-import com.google.cloud.dataflow.sdk.runners.dataflow.AssignWindows;
-import com.google.cloud.dataflow.sdk.runners.dataflow.DataflowAggregatorTransforms;
-import com.google.cloud.dataflow.sdk.runners.dataflow.PubsubIOTranslator;
-import com.google.cloud.dataflow.sdk.runners.dataflow.ReadTranslator;
-import com.google.cloud.dataflow.sdk.runners.worker.IsmFormat;
-import com.google.cloud.dataflow.sdk.runners.worker.IsmFormat.IsmRecord;
-import com.google.cloud.dataflow.sdk.runners.worker.IsmFormat.IsmRecordCoder;
-import com.google.cloud.dataflow.sdk.runners.worker.IsmFormat.MetadataKeyCoder;
-import com.google.cloud.dataflow.sdk.transforms.Aggregator;
-import com.google.cloud.dataflow.sdk.transforms.Combine;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.Flatten;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
-import com.google.cloud.dataflow.sdk.transforms.View;
-import com.google.cloud.dataflow.sdk.transforms.View.CreatePCollectionView;
-import com.google.cloud.dataflow.sdk.transforms.WithKeys;
-import com.google.cloud.dataflow.sdk.transforms.windowing.AfterPane;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.DefaultTrigger;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.util.CoderUtils;
-import com.google.cloud.dataflow.sdk.util.DataflowReleaseInfo;
-import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
-import com.google.cloud.dataflow.sdk.util.InstanceBuilder;
-import com.google.cloud.dataflow.sdk.util.MonitoringUtil;
-import com.google.cloud.dataflow.sdk.util.PCollectionViews;
-import com.google.cloud.dataflow.sdk.util.PathValidator;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.util.Reshuffle;
-import com.google.cloud.dataflow.sdk.util.SystemDoFnInternal;
-import com.google.cloud.dataflow.sdk.util.Transport;
-import com.google.cloud.dataflow.sdk.util.ValueWithRecordId;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.util.WindowedValue.FullWindowedValueCoder;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded;
-import com.google.cloud.dataflow.sdk.values.PCollectionList;
-import com.google.cloud.dataflow.sdk.values.PCollectionTuple;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.PDone;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.POutput;
-import com.google.cloud.dataflow.sdk.values.PValue;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.cloud.dataflow.sdk.values.TupleTagList;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Function;
-import com.google.common.base.Joiner;
-import com.google.common.base.Optional;
-import com.google.common.base.Preconditions;
-import com.google.common.base.Strings;
-import com.google.common.base.Utf8;
-import com.google.common.collect.ForwardingMap;
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Maps;
-import com.google.common.collect.Multimap;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import org.joda.time.DateTimeUtils;
-import org.joda.time.DateTimeZone;
-import org.joda.time.Duration;
-import org.joda.time.format.DateTimeFormat;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.PrintWriter;
-import java.io.Serializable;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.net.URLClassLoader;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
-import java.util.SortedSet;
-import java.util.TreeSet;
-
-/**
- * A {@link PipelineRunner} that executes the operations in the
- * pipeline by first translating them to the Dataflow representation
- * using the {@link DataflowPipelineTranslator} and then submitting
- * them to a Dataflow service for execution.
- *
- * <p><h3>Permissions</h3>
- * When reading from a Dataflow source or writing to a Dataflow sink using
- * {@code DataflowPipelineRunner}, the Google cloudservices account and the Google compute engine
- * service account of the GCP project running the Dataflow Job will need access to the corresponding
- * source/sink.
- *
- * <p>Please see <a href="https://cloud.google.com/dataflow/security-and-permissions">Google Cloud
- * Dataflow Security and Permissions</a> for more details.
- */
-public class DataflowPipelineRunner extends PipelineRunner<DataflowPipelineJob> {
- private static final Logger LOG = LoggerFactory.getLogger(DataflowPipelineRunner.class);
-
- /** Provided configuration options. */
- private final DataflowPipelineOptions options;
-
- /** Client for the Dataflow service. This is used to actually submit jobs. */
- private final Dataflow dataflowClient;
-
- /** Translator for this DataflowPipelineRunner, based on options. */
- private final DataflowPipelineTranslator translator;
-
- /**
- * Custom transforms implementations, keyed by the class of the transform being replaced.
- * The value is the class of the runner-specific transform applied in its place.
- */
- private final Map<Class<?>, Class<?>> overrides;
-
- /**
- * A set of user defined functions to invoke at different points in execution.
- * May be null, in which case no hook is invoked.
- */
- private DataflowPipelineRunnerHooks hooks;
-
- // Environment version information.
- private static final String ENVIRONMENT_MAJOR_VERSION = "4";
-
- // Default Docker container images that execute Dataflow worker harness, residing in Google
- // Container Registry, separately for Batch and Streaming.
- public static final String BATCH_WORKER_HARNESS_CONTAINER_IMAGE
- = "dataflow.gcr.io/v1beta3/java-batch:1.5.0";
- public static final String STREAMING_WORKER_HARNESS_CONTAINER_IMAGE
- = "dataflow.gcr.io/v1beta3/java-streaming:1.5.0";
-
- // The limit of CreateJob request size (10 MiB).
- private static final int CREATE_JOB_REQUEST_LIMIT_BYTES = 10 * 1024 * 1024;
-
- /**
- * NOTE(review): presumably the PCollections that must be written in the indexed format for
- * side-input views; it starts out empty and is populated elsewhere - confirm against usages.
- */
- private final Set<PCollection<?>> pcollectionsRequiringIndexedFormat;
-
- /**
- * Project IDs must contain lowercase letters, digits, or dashes.
- * IDs must start with a letter and may not end with a dash.
- * This regex isn't exact - this allows for patterns that would be rejected by
- * the service, but this is sufficient for basic validation of project IDs.
- *
- * <p>As written, the pattern additionally accepts ':' and '.' after the first character and
- * only matches IDs of at least three characters.
- */
- public static final String PROJECT_ID_REGEXP = "[a-z][-a-z0-9:.]+[a-z0-9]";
-
- /**
-  * Creates a runner from the provided options after validating and completing them.
-  *
-  * <p>On top of the generic {@link PipelineOptionsValidator} checks this verifies the app name,
-  * the staging and temp locations, the job name, the project ID and the number of worker
-  * harness threads. Whichever of the staging/temp locations is unset is filled in from the other
-  * one, and the files to stage default to the classpath when none were specified.
-  *
-  * @param options Properties that configure the runner.
-  * @return The newly created runner.
-  * @throws IllegalArgumentException if a required value is missing or a value is malformed.
-  */
- public static DataflowPipelineRunner fromOptions(PipelineOptions options) {
-   // (Re-)register standard IO factories. Clobbers any prior credentials.
-   IOChannelUtils.registerStandardIOFactories(options);
-
-   DataflowPipelineOptions validated =
-       PipelineOptionsValidator.validate(DataflowPipelineOptions.class, options);
-
-   List<String> absent = new ArrayList<>();
-   if (validated.getAppName() == null) {
-     absent.add("appName");
-   }
-   if (!absent.isEmpty()) {
-     throw new IllegalArgumentException(
-         "Missing required values: " + Joiner.on(',').join(absent));
-   }
-
-   // Both locations are checked as supplied, before either one is defaulted below.
-   PathValidator pathValidator = validated.getPathValidator();
-   String staging = validated.getStagingLocation();
-   if (staging != null) {
-     pathValidator.validateOutputFilePrefixSupported(staging);
-   }
-   String temp = validated.getTempLocation();
-   if (temp != null) {
-     pathValidator.validateOutputFilePrefixSupported(temp);
-   }
-
-   if (Strings.isNullOrEmpty(temp)) {
-     // No temp location: reuse the staging location.
-     validated.setTempLocation(staging);
-   } else if (Strings.isNullOrEmpty(staging)) {
-     // No staging location: use a "staging" entry resolved against the temp location.
-     try {
-       validated.setStagingLocation(IOChannelUtils.resolve(temp, "staging"));
-     } catch (IOException e) {
-       throw new IllegalArgumentException("Unable to resolve PipelineOptions.stagingLocation "
-           + "from PipelineOptions.tempLocation. Please set the staging location explicitly.", e);
-     }
-   }
-
-   if (validated.getFilesToStage() == null) {
-     validated.setFilesToStage(
-         detectClassPathResourcesToStage(DataflowPipelineRunner.class.getClassLoader()));
-     LOG.info("PipelineOptions.filesToStage was not specified. "
-         + "Defaulting to files from the classpath: will stage {} files. "
-         + "Enable logging at DEBUG level to see which files will be staged.",
-         validated.getFilesToStage().size());
-     LOG.debug("Classpath elements: {}", validated.getFilesToStage());
-   }
-
-   // Verify jobName according to service requirements.
-   String lowerCasedJobName = validated.getJobName().toLowerCase();
-   checkArgument(
-       lowerCasedJobName.matches("[a-z]([-a-z0-9]*[a-z0-9])?"),
-       "JobName invalid; the name must consist of only the characters "
-           + "[-a-z0-9], starting with a letter and ending with a letter "
-           + "or number");
-
-   // Verify project: reject all-digit values first, then anything not shaped like an ID.
-   String projectId = validated.getProject();
-   if (projectId.matches("[0-9]*")) {
-     throw new IllegalArgumentException("Project ID '" + projectId
-         + "' invalid. Please make sure you specified the Project ID, not project number.");
-   }
-   if (!projectId.matches(PROJECT_ID_REGEXP)) {
-     throw new IllegalArgumentException("Project ID '" + projectId
-         + "' invalid. Please make sure you specified the Project ID, not project description.");
-   }
-
-   // Verify the number of worker threads is a valid value
-   DataflowPipelineDebugOptions debugView = validated.as(DataflowPipelineDebugOptions.class);
-   if (debugView.getNumberOfWorkerHarnessThreads() < 0) {
-     throw new IllegalArgumentException("Number of worker harness threads '"
-         + debugView.getNumberOfWorkerHarnessThreads()
-         + "' invalid. Please make sure the value is non-negative.");
-   }
-
-   return new DataflowPipelineRunner(validated);
- }
-
- /**
-  * Initializes the runner from already-validated options and selects the table of transform
-  * replacements according to whether the pipeline is streaming or batch.
-  */
- @VisibleForTesting protected DataflowPipelineRunner(DataflowPipelineOptions options) {
-   this.options = options;
-   this.dataflowClient = options.getDataflowClient();
-   this.translator = DataflowPipelineTranslator.fromOptions(options);
-   this.pcollectionsRequiringIndexedFormat = new HashSet<>();
-   this.ptransformViewsWithNonDeterministicKeyCoders = new HashSet<>();
-
-   ImmutableMap.Builder<Class<?>, Class<?>> replacements =
-       ImmutableMap.<Class<?>, Class<?>>builder();
-   if (options.isStreaming()) {
-     // Streaming: custom views, Create, Write and Pubsub write; bounded IO is unsupported.
-     replacements.put(
-         Combine.GloballyAsSingletonView.class, StreamingCombineGloballyAsSingletonView.class);
-     replacements.put(Create.Values.class, StreamingCreate.class);
-     replacements.put(View.AsMap.class, StreamingViewAsMap.class);
-     replacements.put(View.AsMultimap.class, StreamingViewAsMultimap.class);
-     replacements.put(View.AsSingleton.class, StreamingViewAsSingleton.class);
-     replacements.put(View.AsList.class, StreamingViewAsList.class);
-     replacements.put(View.AsIterable.class, StreamingViewAsIterable.class);
-     replacements.put(Write.Bound.class, StreamingWrite.class);
-     replacements.put(PubsubIO.Write.Bound.class, StreamingPubsubIOWrite.class);
-     replacements.put(Read.Unbounded.class, StreamingUnboundedRead.class);
-     replacements.put(Read.Bounded.class, UnsupportedIO.class);
-     replacements.put(AvroIO.Read.Bound.class, UnsupportedIO.class);
-     replacements.put(AvroIO.Write.Bound.class, UnsupportedIO.class);
-     replacements.put(BigQueryIO.Read.Bound.class, UnsupportedIO.class);
-     replacements.put(TextIO.Read.Bound.class, UnsupportedIO.class);
-     replacements.put(TextIO.Write.Bound.class, UnsupportedIO.class);
-     replacements.put(Window.Bound.class, AssignWindows.class);
-   } else {
-     // Batch: unbounded reads are unsupported; writes get batch-specific implementations.
-     replacements.put(Read.Unbounded.class, UnsupportedIO.class);
-     replacements.put(Window.Bound.class, AssignWindows.class);
-     replacements.put(Write.Bound.class, BatchWrite.class);
-     replacements.put(AvroIO.Write.Bound.class, BatchAvroIOWrite.class);
-     replacements.put(TextIO.Write.Bound.class, BatchTextIOWrite.class);
-
-     // The batch view replacements are skipped when the experiment below is present.
-     boolean ismSideInputDisabled = options.getExperiments() != null
-         && options.getExperiments().contains("disable_ism_side_input");
-     if (!ismSideInputDisabled) {
-       replacements.put(View.AsMap.class, BatchViewAsMap.class);
-       replacements.put(View.AsMultimap.class, BatchViewAsMultimap.class);
-       replacements.put(View.AsSingleton.class, BatchViewAsSingleton.class);
-       replacements.put(View.AsList.class, BatchViewAsList.class);
-       replacements.put(View.AsIterable.class, BatchViewAsIterable.class);
-     }
-   }
-   overrides = replacements.build();
- }
-
- /**
- * Applies the given transform to the input. For transforms with customized definitions
- * for the Dataflow pipeline runner, the application is intercepted and modified here.
- *
- * <p>The transform is matched on its exact runtime class, in this order:
- * <ul>
- * <li>{@link Combine.GroupedValues} and {@link GroupByKey}: a primitive output
- * {@link PCollection} is created directly from the input's pipeline, windowing strategy
- * (updated by the transform in the {@link GroupByKey} case) and boundedness.
- * <li>{@link Window.Bound}: delegated to {@code applyWindow}.
- * <li>{@link Flatten.FlattenPCollectionList} over an empty list: replaced by an empty
- * {@link Create}.
- * <li>Any class registered in {@code overrides}: the replacement class is instantiated with
- * this runner and the original transform as arguments, then applied to the input.
- * <li>Anything else: handled by {@code super.apply}.
- * </ul>
- *
- * @param transform the transform being applied
- * @param input the input the transform is applied to
- * @return the output of the (possibly replaced) transform
- */
- @Override
- public <OutputT extends POutput, InputT extends PInput> OutputT apply(
- PTransform<InputT, OutputT> transform, InputT input) {
-
- if (Combine.GroupedValues.class.equals(transform.getClass())
- || GroupByKey.class.equals(transform.getClass())) {
-
- // For both Dataflow runners (streaming and batch), GroupByKey and GroupedValues are
- // primitives. Returning a primitive output instead of the expanded definition
- // signals to the translator that translation is necessary.
- @SuppressWarnings("unchecked")
- PCollection<?> pc = (PCollection<?>) input;
- @SuppressWarnings("unchecked")
- OutputT outputT = (OutputT) PCollection.createPrimitiveOutputInternal(
- pc.getPipeline(),
- transform instanceof GroupByKey
- ? ((GroupByKey<?, ?>) transform).updateWindowingStrategy(pc.getWindowingStrategy())
- : pc.getWindowingStrategy(),
- pc.isBounded());
- return outputT;
- } else if (Window.Bound.class.equals(transform.getClass())) {
- /*
- * TODO: make this the generic way overrides are applied (using super.apply() rather than
- * Pipeline.applyTransform(); this allows the apply method to be replaced without inserting
- * additional nodes into the graph.
- */
- // casting to wildcard
- @SuppressWarnings("unchecked")
- OutputT windowed = (OutputT) applyWindow((Window.Bound<?>) transform, (PCollection<?>) input);
- return windowed;
- } else if (Flatten.FlattenPCollectionList.class.equals(transform.getClass())
- && ((PCollectionList<?>) input).size() == 0) {
- return (OutputT) Pipeline.applyTransform(input, Create.of());
- } else if (overrides.containsKey(transform.getClass())) {
- // It is the responsibility of whoever constructs overrides to ensure this is type safe.
- @SuppressWarnings("unchecked")
- Class<PTransform<InputT, OutputT>> transformClass =
- (Class<PTransform<InputT, OutputT>>) transform.getClass();
-
- @SuppressWarnings("unchecked")
- Class<PTransform<InputT, OutputT>> customTransformClass =
- (Class<PTransform<InputT, OutputT>>) overrides.get(transform.getClass());
-
- PTransform<InputT, OutputT> customTransform =
- InstanceBuilder.ofType(customTransformClass)
- .withArg(DataflowPipelineRunner.class, this)
- .withArg(transformClass, transform)
- .build();
-
- return Pipeline.applyTransform(input, customTransform);
- } else {
- return super.apply(transform, input);
- }
- }
-
- /**
-  * Applies a {@link Window.Bound} transform by wrapping it in {@link AssignWindows} and handing
-  * the wrapper to {@code super.apply}.
-  */
- private <T> PCollection<T> applyWindow(
-     Window.Bound<?> intitialTransform, PCollection<?> initialInput) {
-   // The caller passes wildcard-typed values; both are re-typed to the same element type T.
-   @SuppressWarnings("unchecked")
-   Window.Bound<T> typedTransform = (Window.Bound<T>) intitialTransform;
-   @SuppressWarnings("unchecked")
-   PCollection<T> typedInput = (PCollection<T>) initialInput;
-
-   return super.apply(new AssignWindows<>(typedTransform), typedInput);
- }
-
- /** Builds the user-facing hint that links to the Cloud Debugger page for the given debuggee. */
- private String debuggerMessage(String projectId, String uniquifier) {
-   String template = "To debug your job, visit Google Cloud Debugger at: "
-       + "https://console.developers.google.com/debug?project=%s&dbgee=%s";
-   return String.format(template, projectId, uniquifier);
- }
-
- /**
-  * Registers this job with the Cloud Debugger when the debugger is enabled in the options,
-  * stores the resulting debuggee back into the options and prints a link for the user.
-  *
-  * @throws RuntimeException if the options already carry a debuggee or registration fails.
-  */
- private void maybeRegisterDebuggee(DataflowPipelineOptions options, String uniquifier) {
-   boolean debuggerEnabled = options.getEnableCloudDebugger();
-   if (!debuggerEnabled) {
-     return;
-   }
-
-   // The debuggee is produced by the registration below and must not be supplied up front.
-   if (options.getDebuggee() != null) {
-     throw new RuntimeException("Should not specify the debuggee");
-   }
-
-   Clouddebugger client = Transport.newClouddebuggerClient(options).build();
-   Debuggee registered = registerDebuggee(client, uniquifier);
-   options.setDebuggee(registered);
-
-   String hint = debuggerMessage(options.getProject(), registered.getUniquifier());
-   System.out.println(hint);
- }
-
- /**
-  * Sends a debuggee registration request for this runner's project and returns the debuggee
-  * contained in the response.
-  *
-  * @throws RuntimeException if the request fails with an {@link IOException} or the returned
-  *     debuggee reports an error status.
-  */
- private Debuggee registerDebuggee(Clouddebugger debuggerClient, String uniquifier) {
-   RegisterDebuggeeRequest request = new RegisterDebuggeeRequest();
-   // The uniquifier doubles as the human-readable description.
-   Debuggee requested = new Debuggee()
-       .setProject(options.getProject())
-       .setUniquifier(uniquifier)
-       .setDescription(uniquifier)
-       .setAgentVersion("google.com/cloud-dataflow-java/v1");
-   request.setDebuggee(requested);
-
-   try {
-     RegisterDebuggeeResponse response =
-         debuggerClient.controller().debuggees().register(request).execute();
-     Debuggee registered = response.getDebuggee();
-     boolean reportsError =
-         registered.getStatus() != null && registered.getStatus().getIsError();
-     if (reportsError) {
-       throw new RuntimeException("Unable to register with the debugger: " +
-           registered.getStatus().getDescription().getFormat());
-     }
-     return registered;
-   } catch (IOException e) {
-     throw new RuntimeException("Unable to register with the debugger: ", e);
-   }
- }
-
- @Override
- public DataflowPipelineJob run(Pipeline pipeline) {
- logWarningIfPCollectionViewHasNonDeterministicKeyCoder(pipeline);
-
- LOG.info("Executing pipeline on the Dataflow Service, which will have billing implications "
- + "related to Google Compute Engine usage and other Google Cloud Services.");
-
- List<DataflowPackage> packages = options.getStager().stageFiles();
-
-
- // Set a unique client_request_id in the CreateJob request.
- // This is used to ensure idempotence of job creation across retried
- // attempts to create a job. Specifically, if the service returns a job with
- // a different client_request_id, it means the returned one is a different
- // job previously created with the same job name, and that the job creation
- // has been effectively rejected. The SDK should return
- // Error::Already_Exists to user in that case.
- int randomNum = new Random().nextInt(9000) + 1000;
- String requestId = DateTimeFormat.forPattern("YYYYMMddHHmmssmmm").withZone(DateTimeZone.UTC)
- .print(DateTimeUtils.currentTimeMillis()) + "_" + randomNum;
-
- // Try to create a debuggee ID. This must happen before the job is translated since it may
- // update the options.
- DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
- maybeRegisterDebuggee(dataflowOptions, requestId);
-
- JobSpecification jobSpecification =
- translator.translate(pipeline, this, packages);
- Job newJob = jobSpecification.getJob();
- newJob.setClientRequestId(requestId);
-
- String version = DataflowReleaseInfo.getReleaseInfo().getVersion();
- System.out.println("Dataflow SDK version: " + version);
-
- newJob.getEnvironment().setUserAgent(DataflowReleaseInfo.getReleaseInfo());
- // The Dataflow Service may write to the temporary directory directly, so
- // must be verified.
- if (!Strings.isNullOrEmpty(options.getTempLocation())) {
- newJob.getEnvironment().setTempStoragePrefix(
- dataflowOptions.getPathValidator().verifyPath(options.getTempLocation()));
- }
- newJob.getEnvironment().setDataset(options.getTempDatasetId());
- newJob.getEnvironment().setExperiments(options.getExperiments());
-
- // Set the Docker container image that executes Dataflow worker harness, residing in Google
- // Container Registry. Translator is guaranteed to create a worker pool prior to this point.
- String workerHarnessContainerImage =
- options.as(DataflowPipelineWorkerPoolOptions.class)
- .getWorkerHarnessContainerImage();
- for (WorkerPool workerPool : newJob.getEnvironment().getWorkerPools()) {
- workerPool.setWorkerHarnessContainerImage(workerHarnessContainerImage);
- }
-
- // Requirements about the service.
- Map<String, Object> environmentVersion = new HashMap<>();
- environmentVersion.put(PropertyNames.ENVIRONMENT_VERSION_MAJOR_KEY, ENVIRONMENT_MAJOR_VERSION);
- newJob.getEnvironment().setVersion(environmentVersion);
- // Default jobType is JAVA_BATCH_AUTOSCALING: A Java job with workers that the job can
- // autoscale if specified.
- String jobType = "JAVA_BATCH_AUTOSCALING";
-
- if (options.isStreaming()) {
- jobType = "STREAMING";
- }
- environmentVersion.put(PropertyNames.ENVIRONMENT_VERSION_JOB_TYPE_KEY, jobType);
-
- if (hooks != null) {
- hooks.modifyEnvironmentBeforeSubmission(newJob.getEnvironment());
- }
-
- if (!Strings.isNullOrEmpty(options.getDataflowJobFile())) {
- try (PrintWriter printWriter = new PrintWriter(
- new File(options.getDataflowJobFile()))) {
- String workSpecJson = DataflowPipelineTranslator.jobToString(newJob);
- printWriter.print(workSpecJson);
- LOG.info("Printed workflow specification to {}", options.getDataflowJobFile());
- } catch (IllegalStateException ex) {
- LOG.warn("Cannot translate workflow spec to json for debug.");
- } catch (FileNotFoundException ex) {
- LOG.warn("Cannot create workflow spec output file.");
- }
- }
-
- String jobIdToUpdate = null;
- if (options.getUpdate()) {
- jobIdToUpdate = getJobIdFromName(options.getJobName());
- newJob.setTransformNameMapping(options.getTransformNameMapping());
- newJob.setReplaceJobId(jobIdToUpdate);
- }
- Job jobResult;
- try {
- jobResult = dataflowClient
- .projects()
- .jobs()
- .create(options.getProject(), newJob)
- .execute();
- } catch (GoogleJsonResponseException e) {
- String errorMessages = "Unexpected errors";
- if (e.getDetails() != null) {
- if (Utf8.encodedLength(newJob.toString()) >= CREATE_JOB_REQUEST_LIMIT_BYTES) {
- errorMessages = "The size of the serialized JSON representation of the pipeline "
- + "exceeds the allowable limit. "
- + "For more information, please check the FAQ link below:\n"
- + "https://cloud.google.com/dataflow/faq";
- } else {
- errorMessages = e.getDetails().getMessage();
- }
- }
- throw new RuntimeException("Failed to create a workflow job: " + errorMessages, e);
- } catch (IOException e) {
- throw new RuntimeException("Failed to create a workflow job", e);
- }
-
- // Obtain all of the extractors from the PTransforms used in the pipeline so the
- // DataflowPipelineJob has access to them.
- AggregatorPipelineExtractor aggregatorExtractor = new AggregatorPipelineExtractor(pipeline);
- Map<Aggregator<?, ?>, Collection<PTransform<?, ?>>> aggregatorSteps =
- aggregatorExtractor.getAggregatorSteps();
-
- DataflowAggregatorTransforms aggregatorTransforms =
- new DataflowAggregatorTransforms(aggregatorSteps, jobSpecification.getStepNames());
-
- // Use a raw client for post-launch monitoring, as status calls may fail
- // regularly and need not be retried automatically.
- DataflowPipelineJob dataflowPipelineJob =
- new DataflowPipelineJob(options.getProject(), jobResult.getId(),
- Transport.newRawDataflowClient(options).build(), aggregatorTransforms);
-
- // If the service returned client request id, the SDK needs to compare it
- // with the original id generated in the request, if they are not the same
- // (i.e., the returned job is not created by this request), throw
- // DataflowJobAlreadyExistsException or DataflowJobAlreadyUpdatedExcetpion
- // depending on whether this is a reload or not.
- if (jobResult.getClientRequestId() != null && !jobResult.getClientRequestId().isEmpty()
- && !jobResult.getClientRequestId().equals(requestId)) {
- // If updating a job.
- if (options.getUpdate()) {
- throw new DataflowJobAlreadyUpdatedException(dataflowPipelineJob,
- String.format("The job named %s with id: %s has already been updated into job id: %s "
- + "and cannot be updated again.",
- newJob.getName(), jobIdToUpdate, jobResult.getId()));
- } else {
- throw new DataflowJobAlreadyExistsException(dataflowPipelineJob,
- String.format("There is already an active job named %s with id: %s. If you want "
- + "to submit a second job, try again by setting a different name using --jobName.",
- newJob.getName(), jobResult.getId()));
- }
- }
-
- LOG.info("To access the Dataflow monitoring console, please navigate to {}",
- MonitoringUtil.getJobMonitoringPageURL(options.getProject(), jobResult.getId()));
- System.out.println("Submitted job: " + jobResult.getId());
-
- LOG.info("To cancel the job using the 'gcloud' tool, run:\n> {}",
- MonitoringUtil.getGcloudCancelCommand(options, jobResult.getId()));
-
- return dataflowPipelineJob;
- }
-
- /**
-  * Gives callers access to the {@link DataflowPipelineTranslator} held by this runner.
-  *
-  * @return the value of the {@code translator} field
-  */
- public DataflowPipelineTranslator getTranslator() {
-   return translator;
- }
-
- /**
-  * Installs the callbacks to invoke during execution; see
-  * {@code DataflowPipelineRunnerHooks} for the available callbacks.
-  *
-  * @param hooks the callbacks to store on this runner
-  */
- @Experimental
- public void setHooks(DataflowPipelineRunnerHooks hooks) {
-   this.hooks = hooks;
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
-  * Logs a single warning that lists the full names of every view transform recorded as
-  * having a non-deterministic key coder. Nothing is logged when no transform was recorded.
-  */
- private void logWarningIfPCollectionViewHasNonDeterministicKeyCoder(Pipeline pipeline) {
-   if (ptransformViewsWithNonDeterministicKeyCoders.isEmpty()) {
-     return;
-   }
-
-   // The full names are resolved by traversing the pipeline now: only at this time is the
-   // hierarchy of the transforms known, otherwise the names could have been recorded
-   // during apply time.
-   final SortedSet<String> affectedViewNames = new TreeSet<>();
-   pipeline.traverseTopologically(new PipelineVisitor() {
-     @Override
-     public void enterCompositeTransform(TransformTreeNode node) {
-       noteIfRecorded(node);
-     }
-
-     @Override
-     public void leaveCompositeTransform(TransformTreeNode node) {
-     }
-
-     @Override
-     public void visitTransform(TransformTreeNode node) {
-       noteIfRecorded(node);
-     }
-
-     @Override
-     public void visitValue(PValue value, TransformTreeNode producer) {
-     }
-
-     /** Collects the node's full name when its transform is one of the recorded views. */
-     private void noteIfRecorded(TransformTreeNode node) {
-       if (ptransformViewsWithNonDeterministicKeyCoders.contains(node.getTransform())) {
-         affectedViewNames.add(node.getFullName());
-       }
-     }
-   });
-
-   LOG.warn("Unable to use indexed implementation for View.AsMap and View.AsMultimap for {} "
-       + "because the key coder is not deterministic. Falling back to singleton implementation "
-       + "which may cause memory and/or performance problems. Future major versions of "
-       + "Dataflow will require deterministic key coders.",
-       affectedViewNames);
- }
-
- /**
-  * Reports whether the given {@link PCollection} has been marked as needing to be
-  * materialized using an indexed format.
-  *
-  * @return true if {@code pcol} is contained in {@code pcollectionsRequiringIndexedFormat}
-  */
- boolean doesPCollectionRequireIndexedFormat(PCollection<?> pcol) {
-   return pcollectionsRequiringIndexedFormat.contains(pcol);
- }
-
- /**
-  * Marks the given {@link PCollection} as one that must be materialized using an indexed
-  * format by adding it to {@code pcollectionsRequiringIndexedFormat}.
-  */
- private void addPCollectionRequiringIndexedFormat(PCollection<?> pcol) {
-   pcollectionsRequiringIndexedFormat.add(pcol);
- }
-
- /** The {@link View} transforms that were found to have non-deterministic key coders. */
- Set<PTransform<?, ?>> ptransformViewsWithNonDeterministicKeyCoders;
-
- /**
-  * Remembers that the given {@link PTransform} uses a non-deterministic key coder by adding
-  * it to {@code ptransformViewsWithNonDeterministicKeyCoders}.
-  */
- private void recordViewUsesNonDeterministicKeyCoder(PTransform<?, ?> ptransform) {
-   ptransformViewsWithNonDeterministicKeyCoders.add(ptransform);
- }
-
- /**
-  * A {@link GroupByKey} transform for the {@link DataflowPipelineRunner} which sorts
-  * values using the secondary key {@code K2}.
-  *
-  * <p>The {@link PCollection} created by this {@link PTransform} will have values in
-  * the empty window. Care must be taken *afterwards* to either re-window
-  * (using {@link Window#into}) or only use {@link PTransform}s that do not depend on the
-  * values being within a window.
-  */
- static class GroupByKeyAndSortValuesOnly<K1, K2, V>
-     extends PTransform<PCollection<KV<K1, KV<K2, V>>>, PCollection<KV<K1, Iterable<KV<K2, V>>>>> {
-   private GroupByKeyAndSortValuesOnly() {
-   }
-
-   /**
-    * Creates the primitive output collection (globally-defaulted windowing strategy, bounded)
-    * and gives it a coder derived from the input's {@link KvCoder}: the input key coder paired
-    * with an {@link IterableCoder} over the input value coder.
-    */
-   @Override
-   public PCollection<KV<K1, Iterable<KV<K2, V>>>> apply(PCollection<KV<K1, KV<K2, V>>> input) {
-     PCollection<KV<K1, Iterable<KV<K2, V>>>> grouped =
-         PCollection.<KV<K1, Iterable<KV<K2, V>>>>createPrimitiveOutputInternal(
-             input.getPipeline(),
-             WindowingStrategy.globalDefault(),
-             IsBounded.BOUNDED);
-
-     @SuppressWarnings({"unchecked", "rawtypes"})
-     KvCoder<K1, KV<K2, V>> kvCoder = (KvCoder) input.getCoder();
-     Coder<K1> primaryKeyCoder = kvCoder.getKeyCoder();
-     IterableCoder<KV<K2, V>> groupedValuesCoder = IterableCoder.of(kvCoder.getValueCoder());
-     grouped.setCoder(KvCoder.of(primaryKeyCoder, groupedValuesCoder));
-     return grouped;
-   }
- }
-
- /**
-  * A {@link PTransform} that groups the values by a hash of the window's byte representation
-  * and sorts the values using the window's byte representation.
-  */
- private static class GroupByWindowHashAsKeyAndWindowAsSortKey<T, W extends BoundedWindow> extends
-     PTransform<PCollection<T>, PCollection<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>>> {
-
-   /**
-    * A {@link DoFn} that, for each element, outputs a {@code KV} keyed by the hash of the
-    * element's window (as computed by the supplied {@link IsmRecordCoder}) whose value pairs
-    * the window with the element reified as a {@link WindowedValue}.
-    */
-   @SystemDoFnInternal
-   private static class UseWindowHashAsKeyAndWindowAsSortKeyDoFn<T, W extends BoundedWindow>
-       extends DoFn<T, KV<Integer, KV<W, WindowedValue<T>>>> implements DoFn.RequiresWindowAccess {
-
-     private final IsmRecordCoder<?> ismCoderForHash;
-
-     private UseWindowHashAsKeyAndWindowAsSortKeyDoFn(IsmRecordCoder<?> ismCoderForHash) {
-       this.ismCoderForHash = ismCoderForHash;
-     }
-
-     @Override
-     public void processElement(ProcessContext c) throws Exception {
-       BoundedWindow untypedWindow = c.window();
-       @SuppressWarnings("unchecked")
-       W window = (W) untypedWindow;
-
-       // The hash is taken over the window alone; it becomes the grouping key.
-       Integer windowHash = ismCoderForHash.hash(ImmutableList.of(window));
-       WindowedValue<T> reified =
-           WindowedValue.of(c.element(), c.timestamp(), untypedWindow, c.pane());
-       c.output(KV.of(windowHash, KV.of(window, reified)));
-     }
-   }
-
-   private final IsmRecordCoder<?> ismCoderForHash;
-
-   private GroupByWindowHashAsKeyAndWindowAsSortKey(IsmRecordCoder<?> ismCoderForHash) {
-     this.ismCoderForHash = ismCoderForHash;
-   }
-
-   /**
-    * Keys every element by its window hash, sets the intermediate coder explicitly, and then
-    * applies {@link GroupByKeyAndSortValuesOnly} so values are sorted by window within a hash.
-    */
-   @Override
-   public PCollection<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>> apply(PCollection<T> input) {
-     @SuppressWarnings("unchecked")
-     Coder<W> windowCoder = (Coder<W>)
-         input.getWindowingStrategy().getWindowFn().windowCoder();
-
-     PCollection<KV<Integer, KV<W, WindowedValue<T>>>> keyedByWindowHash =
-         input.apply(ParDo.of(
-             new UseWindowHashAsKeyAndWindowAsSortKeyDoFn<T, W>(ismCoderForHash)));
-     keyedByWindowHash.setCoder(
-         KvCoder.of(
-             VarIntCoder.of(),
-             KvCoder.of(windowCoder,
-                 FullWindowedValueCoder.of(input.getCoder(), windowCoder))));
-
-     return keyedByWindowHash.apply(
-         new GroupByKeyAndSortValuesOnly<Integer, W, WindowedValue<T>>());
-   }
- }
-
- /**
-  * Specialized implementation for
-  * {@link com.google.cloud.dataflow.sdk.transforms.View.AsSingleton View.AsSingleton} for the
-  * Dataflow runner in batch mode.
-  *
-  * <p>Creates a set of files in the {@link IsmFormat} sharded by the hash of the window's
-  * byte representation and with records having:
-  * <ul>
-  *   <li>Key 1: Window</li>
-  *   <li>Value: Windowed value</li>
-  * </ul>
-  */
- static class BatchViewAsSingleton<T>
-     extends PTransform<PCollection<T>, PCollectionView<T>> {
-
-   /**
-    * A {@link DoFn} that emits one {@link IsmRecord} per grouped value. Each record has:
-    * <ul>
-    *   <li>Key 1: Window
-    *   <li>Value: Windowed value
-    * </ul>
-    */
-   static class IsmRecordForSingularValuePerWindowDoFn<T, W extends BoundedWindow>
-       extends DoFn<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>,
-                    IsmRecord<WindowedValue<T>>> {
-
-     @Override
-     public void processElement(ProcessContext c) throws Exception {
-       for (KV<W, WindowedValue<T>> windowAndValue : c.element().getValue()) {
-         c.output(
-             IsmRecord.of(
-                 ImmutableList.of(windowAndValue.getKey()), windowAndValue.getValue()));
-       }
-     }
-   }
-
-   private final DataflowPipelineRunner runner;
-   private final View.AsSingleton<T> transform;
-
-   /**
-    * Builds an instance of this class from the overridden transform.
-    */
-   @SuppressWarnings("unused") // used via reflection in DataflowPipelineRunner#apply()
-   public BatchViewAsSingleton(DataflowPipelineRunner runner, View.AsSingleton<T> transform) {
-     this.runner = runner;
-     this.transform = transform;
-   }
-
-   @Override
-   public PCollectionView<T> apply(PCollection<T> input) {
-     return BatchViewAsSingleton.<T, T, T, BoundedWindow>applyForSingleton(
-         runner,
-         input,
-         new IsmRecordForSingularValuePerWindowDoFn<T, BoundedWindow>(),
-         transform.hasDefaultValue(),
-         transform.defaultValue(),
-         input.getCoder());
-   }
-
-   /**
-    * Shared singleton-view construction: builds the singleton view, groups the input by window
-    * hash, converts each group to {@link IsmRecord}s with {@code doFn}, registers the result
-    * with the runner as requiring the indexed format, and creates the view from it.
-    *
-    * @param runner the runner on which the resulting collection is registered
-    * @param input the collection to turn into a view
-    * @param doFn converts each window-hash group into {@link IsmRecord}s
-    * @param hasDefault passed through to the singleton view
-    * @param defaultValue passed through to the singleton view
-    * @param defaultValueCoder coder for the record values; also passed to the singleton view
-    */
-   static <T, FinalT, ViewT, W extends BoundedWindow> PCollectionView<ViewT>
-       applyForSingleton(
-           DataflowPipelineRunner runner,
-           PCollection<T> input,
-           DoFn<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>,
-                IsmRecord<WindowedValue<FinalT>>> doFn,
-           boolean hasDefault,
-           FinalT defaultValue,
-           Coder<FinalT> defaultValueCoder) {
-
-     @SuppressWarnings("unchecked")
-     Coder<W> windowCoder = (Coder<W>)
-         input.getWindowingStrategy().getWindowFn().windowCoder();
-
-     @SuppressWarnings({"rawtypes", "unchecked"})
-     PCollectionView<ViewT> singletonView =
-         (PCollectionView<ViewT>) PCollectionViews.<FinalT, W>singletonView(
-             input.getPipeline(),
-             (WindowingStrategy) input.getWindowingStrategy(),
-             hasDefault,
-             defaultValue,
-             defaultValueCoder);
-
-     IsmRecordCoder<WindowedValue<FinalT>> ismCoder =
-         coderForSingleton(windowCoder, defaultValueCoder);
-
-     PCollection<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>> groupedByWindowHash =
-         input.apply(new GroupByWindowHashAsKeyAndWindowAsSortKey<T, W>(ismCoder));
-     PCollection<IsmRecord<WindowedValue<FinalT>>> ismRecords =
-         groupedByWindowHash.apply(ParDo.of(doFn));
-     ismRecords.setCoder(ismCoder);
-
-     runner.addPCollectionRequiringIndexedFormat(ismRecords);
-     return ismRecords.apply(
-         CreatePCollectionView.<IsmRecord<WindowedValue<FinalT>>, ViewT>of(singletonView));
-   }
-
-   @Override
-   protected String getKindString() {
-     return "BatchViewAsSingleton";
-   }
-
-   /**
-    * Builds the {@link IsmRecordCoder} for singleton views: a single key component (the
-    * window) and a fully windowed value.
-    */
-   static <T> IsmRecordCoder<WindowedValue<T>> coderForSingleton(
-       Coder<? extends BoundedWindow> windowCoder, Coder<T> valueCoder) {
-     return IsmRecordCoder.of(
-         1, // We hash using only the window
-         0, // There are no metadata records
-         ImmutableList.<Coder<?>>of(windowCoder),
-         FullWindowedValueCoder.of(valueCoder, windowCoder));
-   }
- }
-
- /**
-  * Specialized implementation for
-  * {@link com.google.cloud.dataflow.sdk.transforms.View.AsIterable View.AsIterable} for the
-  * Dataflow runner in batch mode.
-  *
-  * <p>Creates a set of {@code Ism} files sharded by the hash of the window's byte representation
-  * and with records having:
-  * <ul>
-  *   <li>Key 1: Window</li>
-  *   <li>Key 2: Index offset within window</li>
-  *   <li>Value: Windowed value</li>
-  * </ul>
-  */
- static class BatchViewAsIterable<T>
-     extends PTransform<PCollection<T>, PCollectionView<Iterable<T>>> {
-
-   private final DataflowPipelineRunner runner;
-
-   /**
-    * Builds an instance of this class from the overridden transform. Only the runner is
-    * retained; the {@code transform} argument is not stored.
-    */
-   @SuppressWarnings("unused") // used via reflection in DataflowPipelineRunner#apply()
-   public BatchViewAsIterable(DataflowPipelineRunner runner, View.AsIterable<T> transform) {
-     this.runner = runner;
-   }
-
-   /** Creates an iterable view and delegates to {@link BatchViewAsList#applyForIterableLike}. */
-   @Override
-   public PCollectionView<Iterable<T>> apply(PCollection<T> input) {
-     PCollectionView<Iterable<T>> iterableView = PCollectionViews.iterableView(
-         input.getPipeline(), input.getWindowingStrategy(), input.getCoder());
-     return BatchViewAsList.applyForIterableLike(runner, input, iterableView);
-   }
- }
-
- /**
-  * Specialized implementation for
-  * {@link com.google.cloud.dataflow.sdk.transforms.View.AsList View.AsList} for the
-  * Dataflow runner in batch mode.
-  *
-  * <p>Creates a set of {@code Ism} files sharded by the hash of the window's byte representation
-  * and with records having:
-  * <ul>
-  *   <li>Key 1: Window</li>
-  *   <li>Key 2: Index offset within window</li>
-  *   <li>Value: Windowed value</li>
-  * </ul>
-  */
- static class BatchViewAsList<T>
-     extends PTransform<PCollection<T>, PCollectionView<List<T>>> {
-
-   /**
-    * A {@link DoFn} which creates {@link IsmRecord}s assuming that each element is within the
-    * global window. The index restarts at zero at the start of every bundle. Each
-    * {@link IsmRecord} has
-    * <ul>
-    *   <li>Key 1: Global window</li>
-    *   <li>Key 2: Index offset within window</li>
-    *   <li>Value: Windowed value</li>
-    * </ul>
-    */
-   @SystemDoFnInternal
-   static class ToIsmRecordForGlobalWindowDoFn<T>
-       extends DoFn<T, IsmRecord<WindowedValue<T>>> {
-
-     long indexInBundle;
-
-     @Override
-     public void startBundle(Context c) throws Exception {
-       indexInBundle = 0;
-     }
-
-     @Override
-     public void processElement(ProcessContext c) throws Exception {
-       c.output(IsmRecord.of(
-           ImmutableList.of(GlobalWindow.INSTANCE, indexInBundle),
-           WindowedValue.of(
-               c.element(),
-               c.timestamp(),
-               GlobalWindow.INSTANCE,
-               c.pane())));
-       indexInBundle++;
-     }
-   }
-
-   /**
-    * A {@link DoFn} which creates {@link IsmRecord}s, comparing the windows of successive
-    * elements to locate the window boundaries. The {@link IsmRecord} has:
-    * <ul>
-    *   <li>Key 1: Window</li>
-    *   <li>Key 2: Index offset within window</li>
-    *   <li>Value: Windowed value</li>
-    * </ul>
-    */
-   @SystemDoFnInternal
-   static class ToIsmRecordForNonGlobalWindowDoFn<T, W extends BoundedWindow>
-       extends DoFn<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>,
-                    IsmRecord<WindowedValue<T>>> {
-
-     private final Coder<W> windowCoder;
-
-     ToIsmRecordForNonGlobalWindowDoFn(Coder<W> windowCoder) {
-       this.windowCoder = windowCoder;
-     }
-
-     @Override
-     public void processElement(ProcessContext c) throws Exception {
-       long indexInWindow = 0;
-       Optional<Object> previousWindowStructuralValue = Optional.absent();
-       for (KV<W, WindowedValue<T>> windowAndValue : c.element().getValue()) {
-         Object currentWindowStructuralValue =
-             windowCoder.structuralValue(windowAndValue.getKey());
-         // Windows are compared through the coder's structural value; a change means a new
-         // window has started, so the per-window index starts over.
-         boolean enteredNewWindow = previousWindowStructuralValue.isPresent()
-             && !previousWindowStructuralValue.get().equals(currentWindowStructuralValue);
-         if (enteredNewWindow) {
-           indexInWindow = 0;
-         }
-         c.output(IsmRecord.of(
-             ImmutableList.of(windowAndValue.getKey(), indexInWindow),
-             windowAndValue.getValue()));
-         previousWindowStructuralValue = Optional.of(currentWindowStructuralValue);
-         indexInWindow++;
-       }
-     }
-   }
-
-   private final DataflowPipelineRunner runner;
-
-   /**
-    * Builds an instance of this class from the overridden transform. Only the runner is
-    * retained; the {@code transform} argument is not stored.
-    */
-   @SuppressWarnings("unused") // used via reflection in DataflowPipelineRunner#apply()
-   public BatchViewAsList(DataflowPipelineRunner runner, View.AsList<T> transform) {
-     this.runner = runner;
-   }
-
-   @Override
-   public PCollectionView<List<T>> apply(PCollection<T> input) {
-     PCollectionView<List<T>> listView = PCollectionViews.listView(
-         input.getPipeline(), input.getWindowingStrategy(), input.getCoder());
-     return applyForIterableLike(runner, input, listView);
-   }
-
-   /**
-    * Shared implementation for list-like views: converts {@code input} into indexed
-    * {@link IsmRecord}s, registers the result with the runner as requiring the indexed
-    * format, and creates {@code view} from it.
-    */
-   static <T, W extends BoundedWindow, ViewT> PCollectionView<ViewT> applyForIterableLike(
-       DataflowPipelineRunner runner,
-       PCollection<T> input,
-       PCollectionView<ViewT> view) {
-
-     @SuppressWarnings("unchecked")
-     Coder<W> windowCoder = (Coder<W>)
-         input.getWindowingStrategy().getWindowFn().windowCoder();
-
-     IsmRecordCoder<WindowedValue<T>> ismCoder = coderForListLike(windowCoder, input.getCoder());
-
-     final PCollection<IsmRecord<WindowedValue<T>>> ismRecords;
-     if (input.getWindowingStrategy().getWindowFn() instanceof GlobalWindows) {
-       // If we are working in the global window, we do not need to do a GBK using the window
-       // as the key since all the elements of the input PCollection are already such.
-       // We just reify the windowed value while converting them to IsmRecords and generating
-       // an index based upon where we are within the bundle. Each bundle
-       // maps to one file exactly.
-       ismRecords = input.apply(ParDo.of(new ToIsmRecordForGlobalWindowDoFn<T>()));
-     } else {
-       ismRecords = input
-           .apply(new GroupByWindowHashAsKeyAndWindowAsSortKey<T, W>(ismCoder))
-           .apply(ParDo.of(new ToIsmRecordForNonGlobalWindowDoFn<T, W>(windowCoder)));
-     }
-     ismRecords.setCoder(ismCoder);
-
-     runner.addPCollectionRequiringIndexedFormat(ismRecords);
-     return ismRecords.apply(
-         CreatePCollectionView.<IsmRecord<WindowedValue<T>>, ViewT>of(view));
-   }
-
-   @Override
-   protected String getKindString() {
-     return "BatchViewAsList";
-   }
-
-   /**
-    * Builds the {@link IsmRecordCoder} for list-like views: two key components (the window
-    * and a big-endian long index) and a fully windowed value.
-    */
-   static <T> IsmRecordCoder<WindowedValue<T>> coderForListLike(
-       Coder<? extends BoundedWindow> windowCoder, Coder<T> valueCoder) {
-     // TODO: swap to use a variable length long coder which has values which compare
-     // the same as their byte representation compare lexicographically within the key coder
-     return IsmRecordCoder.of(
-         1, // We hash using only the window
-         0, // There are no metadata records
-         ImmutableList.of(windowCoder, BigEndianLongCoder.of()),
-         FullWindowedValueCoder.of(valueCoder, windowCoder));
-   }
- }
-
- /**
-  * Specialized implementation for
-  * {@link com.google.cloud.dataflow.sdk.transforms.View.AsMap View.AsMap} for the
-  * Dataflow runner in batch mode.
-  *
-  * <p>Creates a set of {@code Ism} files sharded by the hash of the key's byte
-  * representation. Each record is structured as follows:
-  * <ul>
-  *   <li>Key 1: User key K</li>
-  *   <li>Key 2: Window</li>
-  *   <li>Key 3: 0L (constant)</li>
-  *   <li>Value: Windowed value</li>
-  * </ul>
-  *
-  * <p>Alongside the data records, there are the following metadata records:
-  * <ul>
-  *   <li>Key 1: Metadata Key</li>
-  *   <li>Key 2: Window</li>
-  *   <li>Key 3: Index [0, size of map]</li>
-  *   <li>Value: variable length long byte representation of size of map if index is 0,
-  *       otherwise the byte representation of a key</li>
-  * </ul>
-  * The {@code [META, Window, 0]} record stores the number of unique keys per window, while
-  * {@code [META, Window, i]} for {@code i} in {@code [1, size of map]} stores the user's key.
-  * This allows one to access the size of the map by looking at {@code [META, Window, 0]}
-  * and iterate over all the keys by accessing {@code [META, Window, i]} for {@code i} in
-  * {@code [1, size of map]}.
-  *
-  * <p>Note that in the case of a non-deterministic key coder, we fall back to using
-  * {@link com.google.cloud.dataflow.sdk.transforms.View.AsSingleton View.AsSingleton}, printing
-  * a warning to users to specify a deterministic key coder.
-  */
- static class BatchViewAsMap<K, V>
-     extends PTransform<PCollection<KV<K, V>>, PCollectionView<Map<K, V>>> {
-
-   /**
-    * A {@link DoFn} which groups elements by window boundaries. For each group,
-    * the group of elements is transformed into a {@link TransformedMap}.
-    * The transformed {@code Map<K, V>} is backed by a {@code Map<K, WindowedValue<V>>}
-    * and contains a function {@code WindowedValue<V> -> V}.
-    *
-    * <p>Outputs {@link IsmRecord}s having:
-    * <ul>
-    *   <li>Key 1: Window</li>
-    *   <li>Value: Transformed map containing a transform that removes the encapsulation
-    *       of the window around each value,
-    *       {@code Map<K, WindowedValue<V>> -> Map<K, V>}.</li>
-    * </ul>
-    *
-    * <p>Fails with an {@link IllegalStateException} if the same key appears more than once
-    * within a window.
-    */
-   static class ToMapDoFn<K, V, W extends BoundedWindow>
-       extends DoFn<KV<Integer, Iterable<KV<W, WindowedValue<KV<K, V>>>>>,
-                    IsmRecord<WindowedValue<TransformedMap<K,
-                                                           WindowedValue<V>,
-                                                           V>>>> {
-
-     private final Coder<W> windowCoder;
-
-     ToMapDoFn(Coder<W> windowCoder) {
-       this.windowCoder = windowCoder;
-     }
-
-     @Override
-     public void processElement(ProcessContext c)
-         throws Exception {
-       Optional<Object> previousWindowStructuralValue = Optional.absent();
-       Optional<W> previousWindow = Optional.absent();
-       Map<K, WindowedValue<V>> map = new HashMap<>();
-       for (KV<W, WindowedValue<KV<K, V>>> kv : c.element().getValue()) {
-         Object currentWindowStructuralValue = windowCoder.structuralValue(kv.getKey());
-         if (previousWindowStructuralValue.isPresent()
-             && !previousWindowStructuralValue.get().equals(currentWindowStructuralValue)) {
-           // Construct the transformed map containing all the elements since we
-           // are at a window boundary.
-           c.output(IsmRecord.of(
-               ImmutableList.of(previousWindow.get()),
-               valueInEmptyWindows(new TransformedMap<>(WindowedValueToValue.<V>of(), map))));
-           map = new HashMap<>();
-         }
-
-         K key = kv.getValue().getValue().getKey();
-         V value = kv.getValue().getValue().getValue();
-
-         // Verify that the user isn't trying to insert the same key multiple times.
-         // The template has four placeholders: existing value, new value, key, window.
-         checkState(!map.containsKey(key),
-             "Multiple values [%s, %s] found for single key [%s] within window [%s].",
-             map.get(key),
-             value,
-             key,
-             kv.getKey());
-         map.put(key, kv.getValue().withValue(value));
-         previousWindowStructuralValue = Optional.of(currentWindowStructuralValue);
-         previousWindow = Optional.of(kv.getKey());
-       }
-
-       // The last value for this hash is guaranteed to be at a window boundary
-       // so we output a transformed map containing all the elements since the last
-       // window boundary.
-       c.output(IsmRecord.of(
-           ImmutableList.of(previousWindow.get()),
-           valueInEmptyWindows(new TransformedMap<>(WindowedValueToValue.<V>of(), map))));
-     }
-   }
-
-   private final DataflowPipelineRunner runner;
-
-   /**
-    * Builds an instance of this class from the overridden transform. Only the runner is
-    * retained; the {@code transform} argument is not stored.
-    */
-   @SuppressWarnings("unused") // used via reflection in DataflowPipelineRunner#apply()
-   public BatchViewAsMap(DataflowPipelineRunner runner, View.AsMap<K, V> transform) {
-     this.runner = runner;
-   }
-
-   @Override
-   public PCollectionView<Map<K, V>> apply(PCollection<KV<K, V>> input) {
-     return this.<BoundedWindow>applyInternal(input);
-   }
-
-   /**
-    * Attempts the indexed map-like implementation; if that reports a non-deterministic key
-    * coder, records this transform on the runner and uses the singleton fallback instead.
-    */
-   private <W extends BoundedWindow> PCollectionView<Map<K, V>>
-       applyInternal(PCollection<KV<K, V>> input) {
-
-     @SuppressWarnings({"rawtypes", "unchecked"})
-     KvCoder<K, V> inputCoder = (KvCoder) input.getCoder();
-     try {
-       PCollectionView<Map<K, V>> view = PCollectionViews.mapView(
-           input.getPipeline(), input.getWindowingStrategy(), inputCoder);
-       return BatchViewAsMultimap.applyForMapLike(runner, input, view, true /* unique keys */);
-     } catch (NonDeterministicException e) {
-       runner.recordViewUsesNonDeterministicKeyCoder(this);
-
-       // Since the key coder is not deterministic, we convert the map into a singleton
-       // and return a singleton view equivalent.
-       return applyForSingletonFallback(input);
-     }
-   }
-
-   @Override
-   protected String getKindString() {
-     return "BatchViewAsMap";
-   }
-
-   /** Transforms the input {@link PCollection} into a singleton {@link Map} per window. */
-   private <W extends BoundedWindow> PCollectionView<Map<K, V>>
-       applyForSingletonFallback(PCollection<KV<K, V>> input) {
-     @SuppressWarnings("unchecked")
-     Coder<W> windowCoder = (Coder<W>)
-         input.getWindowingStrategy().getWindowFn().windowCoder();
-
-     @SuppressWarnings({"rawtypes", "unchecked"})
-     KvCoder<K, V> inputCoder = (KvCoder) input.getCoder();
-
-     @SuppressWarnings({"unchecked", "rawtypes"})
-     Coder<Function<WindowedValue<V>, V>> transformCoder =
-         (Coder) SerializableCoder.of(WindowedValueToValue.class);
-
-     Coder<TransformedMap<K, WindowedValue<V>, V>> finalValueCoder =
-         TransformedMapCoder.of(
-             transformCoder,
-             MapCoder.of(
-                 inputCoder.getKeyCoder(),
-                 FullWindowedValueCoder.of(inputCoder.getValueCoder(), windowCoder)));
-
-     // The view's default is an empty map wrapped with the same unwrapping function.
-     TransformedMap<K, WindowedValue<V>, V> defaultValue = new TransformedMap<>(
-         WindowedValueToValue.<V>of(),
-         ImmutableMap.<K, WindowedValue<V>>of());
-
-     return BatchViewAsSingleton.<KV<K, V>,
-                                  TransformedMap<K, WindowedValue<V>, V>,
-                                  Map<K, V>,
-                                  W> applyForSingleton(
-         runner,
-         input,
-         new ToMapDoFn<K, V, W>(windowCoder),
-         true,
-         defaultValue,
-         finalValueCoder);
-   }
- }
-
- /**
- * Specialized implementation for
- * {@link com.google.cloud.dataflow.sdk.transforms.View.AsMultimap View.AsMultimap} for the
- * Dataflow runner in batch mode.
- *
- * <p>Creates a set of {@code Ism} files sharded by the hash of the key's byte
- * representation. Each record is structured as follows:
- * <ul>
- * <li>Key 1: User key K</li>
- * <li>Key 2: Window</li>
- * <li>Key 3: Index offset for a given key and window.</li>
- * <li>Value: Windowed value</li>
- * </ul>
- *
- * <p>Alongside the data records, there are the following metadata records:
- * <ul>
- * <li>Key 1: Metadata Key</li>
- * <li>Key 2: Window</li>
- * <li>Key 3: Index [0, size of map]</li>
- * <li>Value: variable length long byte representation of size of map if index is 0,
- * otherwise the byte representation of a key</li>
- * </ul>
- * The {@code [META, Window, 0]} record stores the number of unique keys per window, while
- * {@code [META, Window, i]} for {@code i} in {@code [1, size of map]} stores a the users key.
- * This allows for one to access the size of the map by looking at {@code [META, Window, 0]}
- * and iterate over all the keys by accessing {@code [META, Window, i]} for {@code i} in
- * {@code [1, size of map]}.
- *
- * <p>Note that in the case of a non-deterministic key coder, we fallback to using
- * {@link com.google.cloud.dataflow.sdk.transforms.View.AsSingleton View.AsSingleton} printing
- * a warning to users to specify a deterministic key coder.
- */
- static class BatchViewAsMultimap<K, V>
- extends PTransform<PCollection<KV<K, V>>, PCollectionView<Map<K, Iterable<V>>>> {
- /**
- * A {@link PTransform} that groups elements by the hash of window's byte representation
- * if the input {@link PCollection} is not within the global window. Otherwise by the hash
- * of the window and key's byte representation. This {@link PTransform} also sorts
- * the values by the combination of the window and key's byte representations.
- */
- private static class GroupByKeyHashAndSortByKeyAndWindow<K, V, W extends BoundedWindow>
- extends PTransform<PCollection<KV<K, V>>,
- PCollection<KV<Integer, Iterable<KV<KV<K, W>, WindowedValue<V>>>>>> {
-
- @SystemDoFnInternal
- private static class GroupByKeyHashAndSortByKeyAndWindowDoFn<K, V, W>
- extends DoFn<KV<K, V>, KV<Integer, KV<KV<K, W>, WindowedValue<V>>>>
- implements DoFn.RequiresWindowAccess {
-
- private final IsmRecordCoder<?> coder;
- private GroupByKeyHashAndSortByKeyAndWindowDoFn(IsmRecordCoder<?> coder) {
- this.coder = coder;
- }
-
- @Override
- public void processElement(ProcessContext c) throws Exception {
- @SuppressWarnings("unchecked")
- W window = (W) c.window();
-
- c.output(
- KV.of(coder.hash(ImmutableList.of(c.element().getKey())),
- KV.of(KV.of(c.element().getKey(), window),
- WindowedValue.of(
- c.element().getValue(),
- c.timestamp(),
- (BoundedWindow) window,
- c.pane()))));
- }
- }
-
- private final IsmRecordCoder<?> coder;
- public GroupByKeyHashAndSortByKeyAndWindow(IsmRecordCoder<?> coder) {
- this.coder = coder;
- }
-
- @Override
- public PCollection<KV<Integer, Iterable<KV<KV<K, W>, WindowedValue<V>>>>>
- apply(PCollection<KV<K, V>> input) {
-
- @SuppressWarnings("unchecked")
- Coder<W> windowCoder = (Coder<W>)
- input.getWindowingStrategy().getWindowFn().windowCoder();
- @SuppressWarnings("unchecked")
- KvCoder<K, V> inputCoder = (KvCoder<K, V>) input.getCoder();
-
- PCollection<KV<Integer, KV<KV<K, W>, WindowedValue<V>>>> keyedByHash;
- keyedByHash = input.apply(
- ParDo.of(new GroupByKeyHashAndSortByKeyAndWindowDoFn<K, V, W>(coder)));
- keyedByHash.setCoder(
- KvCoder.of(
- VarIntCoder.of(),
- KvCoder.of(KvCoder.of(inputCoder.getKeyCoder(), windowCoder),
- FullWindowedValueCoder.of(inputCoder.getValueCoder(), windowCoder))));
-
- return keyedByHash.apply(
- new GroupByKeyAndSortValuesOnly<Integer, KV<K, W>, WindowedValue<V>>());
- }
- }
-
- /**
- * A {@link DoFn} which creates {@link IsmRecord}s comparing successive elements windows
- * and keys to locate window and key boundaries. The main output {@link IsmRecord}s have:
- * <ul>
- * <li>Key 1: Window</li>
- * <li>Key 2: User key K</li>
- * <li>Key 3: Index offset for a given key and window.</li>
- * <li>Value: Windowed value</li>
- * </ul>
- *
- * <p>Additionally, we output all the unique keys per window seen to {@code outputForEntrySet}
- * and the unique key count per window to {@code outputForSize}.
- *
- * <p>Finally, if this DoFn has been requested to perform unique key checking, it will
- * throw an {@link IllegalStateException} if more than one key per window is found.
- */
- static class ToIsmRecordForMapLikeDoFn<K, V, W extends BoundedWindow>
- extends DoFn<KV<Integer, Iterable<KV<KV<K, W>, WindowedValue<V>>>>,
- IsmRecord<WindowedValue<V>>> {
-
- private final TupleTag<KV<Integer, KV<W, Long>>> outputForSize;
- private final TupleTag<KV<Integer, KV<W, K>>> outputForEntrySet;
- private final Coder<W> windowCoder;
- private final Coder<K> keyCoder;
- private final IsmRecordCoder<WindowedValue<V>> ismCoder;
- private final boolean uniqueKeysExpected;
- ToIsmRecordForMapLikeDoFn(
- TupleTag<KV<Integer, KV<W, Long>>> outputForSize,
- TupleTag<KV<Integer, KV<W, K>>> outputForEntrySet,
- Coder<W> windowCoder,
- Coder<K> keyCoder,
- IsmRecordCoder<WindowedValue<V>> ismCoder,
- boolean uniqueKeysExpected) {
- this.outputForSize = outputForSize;
- this.outputForEntrySet = outputForEntrySet;
- this.windowCoder = windowCoder;
- this.keyCoder = keyCoder;
- this.ismCoder = ismCoder;
- this.uniqueKeysExpected = uniqueKeysExpected;
- }
-
- @Override
- public void processElement(ProcessContext c) throws Exception {
- long currentKeyIndex = 0;
- // We use one based indexing while counting
- long currentUniqueKeyCounter = 1;
- Iterator<KV<KV<K, W>, WindowedValue<V>>> iterator = c.element().getValue().iterator();
-
- KV<KV<K, W>, WindowedValue<V>> currentValue = iterator.next();
- Object currentKeyStructuralValue =
- keyCoder.structuralValue(currentValue.getKey().getKey());
- Object currentWindowStructuralValue =
- windowCoder.structuralValue(currentValue.getKey().getValue());
-
- while (iterator.hasNext()) {
- KV<KV<K, W>, WindowedValue<V>> nextValue = iterator.next();
- Object nextKeyStructuralValue =
- keyCoder.structuralValue(nextValue.getKey().getKey());
- Object nextWindowStructuralValue =
- windowCoder.structuralValue(nextValue.getKey().getValue());
-
- outputDataRecord(c, currentValue, currentKeyIndex);
-
- final long nextKeyIndex;
- final long nextUniqueKeyCounter;
-
- // Check to see if it's a new window
- if (!currentWindowStructuralValue.equals(nextWindowStructuralValue)) {
- // The next value is a new window, so we output for size the number of unique keys
- // seen and, for the entry set, the last key of the window. We also reset the next key
- // index and the unique key counter.
- outputMetadataRecordForSize(c, currentValue, currentUniqueKeyCounter);
- outputMetadataRecordForEntrySet(c, currentValue);
-
- nextKeyIndex = 0;
- nextUniqueKeyCounter = 1;
- } else if (!currentKeyStructuralValue.equals(nextKeyStructuralValue)){
- // It is a new key within the same window so output the key for the entry set,
- // reset the key index and increase the count of unique keys seen within this window.
- outputMetadataRecordForEntrySet(c, currentValue);
-
- nextKeyIndex = 0;
- nextUniqueKeyCounter = currentUniqueKeyCounter + 1;
- } else if (!uniqueKeysExpected) {
- // It is not a new key so we don't have to output the number of elements in this
- // window or increase the unique key counter. All we do is increase the key index.
-
- nextKeyIndex = currentKeyIndex + 1;
- nextUniqueKeyCounter = currentUniqueKeyCounter;
- } else {
- throw new IllegalStateException(String.format(
- "Unique keys are expected but found key %s with values %s and %s in window %s.",
- currentValue.getKey().getKey(),
- currentValue.getValue().getValue(),
- nextValue.getValue().getValue(),
- currentValue.getKey().getValue()));
- }
-
- currentValue = nextValue;
- currentWindowStructuralValue = nextWindowStructuralValue;
- currentKeyStructuralValue = nextKeyStructuralValue;
- currentKeyIndex = nextKeyIndex;
- currentUniqueKeyCounter = nextUniqueKeyCounter;
- }
-
- outputDataRecord(c, currentValue, currentKeyIndex);
- outputMetadataRecordForSize(c, currentValue, currentUniqueKeyCounter);
- // The last value for this hash is guaranteed to be at a window boundary, so after the
- // unique key count output above we also output the final key for the entry set.
- outputMetadataRecordForEntrySet(c, currentValue);
- }
-
- /** This outputs the data record. */
- private void outputDataRecord(
- ProcessContext c, KV<KV<K, W>, WindowedValue<V>> value, long keyIndex) {
- IsmRecord<WindowedValue<V>> ismRecord = IsmRecord.of(
- ImmutableList.of(
- value.getKey().getKey(),
- value.getKey().getValue(),
- keyIndex),
- value.getValue());
- c.output(ismRecord);
- }
-
- /**
- * This outputs records which will be used to compute the number of keys for a given window.
- */
- private void outputMetadataRecordForSize(
- ProcessContext c, KV<KV<K, W>, WindowedValue<V>> value, long uniqueKeyCount) {
- c.sideOutput(outputForSize,
- KV.of(ismCoder.hash(ImmutableList.of(IsmFormat.getMetadataKey(),
- value.getKey().getValue())),
- KV.of(value.getKey().getValue(), uniqueKeyCount)));
- }
-
- /** This outputs records which will be used to construct the entry set. */
- private void outputMetadataRecordForEntrySet(
- ProcessContext c, KV<KV<K, W>, WindowedValue<V>> value) {
- c.sideOutput(outputForEntrySet,
- KV.of(ismCoder.hash(ImmutableList.of(IsmFormat.getMetadataKey(),
- value.getKey().getValue())),
- KV.of(value.getKey().getValue(), value.getKey().getKey())));
- }
- }
-
- /**
- * A {@link DoFn} which outputs a metadata {@link IsmRecord} per window of:
- * <ul>
- * <li>Key 1: META key</li>
- * <li>Key 2: window</li>
- * <li>Key 3: 0L (constant)</li>
- * <li>Value: sum of values for window</li>
- * </ul>
- *
- * <p>This {@link DoFn} is meant to be used to compute the number of unique keys
- * per window for map and multimap side inputs.
- */
- static class ToIsmMetadataRecordForSizeDoFn<K, V, W extends BoundedWindow>
- extends DoFn<KV<Integer, Iterable<KV<W, Long>>>, IsmRecord<WindowedValue<V>>> {
- private final Coder<W> windowCoder;
- ToIsmMetadataRecordForSizeDoFn(Coder<W> windowCoder) {
- this.windowCoder = windowCoder;
- }
-
- @Override
- public void processElement(ProcessContext c) throws Exception {
- Iterator<KV<W, Long>> iterator = c.element().getValue().iterator();
- KV<W, Long> currentValue = iterator.next();
- Object currentWindowStructuralValue = windowCoder.structuralValue(currentValue.getKey());
- long size = 0;
- while (iterator.hasNext()) {
- KV<W, Long> nextValue = iterator.next();
- Object nextWindowStructuralValue = windowCoder.structuralValue(nextValue.getKey());
-
- size += currentValue.getValue();
- if (!currentWindowStructuralValue.equals(nextWindowStructuralValue)) {
- c.output(IsmRecord.<WindowedValue<V>>meta(
- ImmutableList.of(IsmFormat.getMetadataKey(), currentValue.getKey(), 0L),
- CoderUtils.encodeToByteArray(VarLongCoder.of(), size)));
- size = 0;
- }
-
- currentValue = nextValue;
- currentWindowStructuralValue = nextWindowStructuralValue;
- }
-
- size += currentValue.getValue();
- // Output the final value since it is guaranteed to be on a window boundary.
- c.output(IsmRecord.<WindowedValue<V>>meta(
- ImmutableList.of(IsmFormat.getMetadataKey(), currentValue.getKey(), 0L),
- CoderUtils.encodeToByteArray(VarLongCoder.of(), size)));
- }
- }
-
- /**
- * A {@link DoFn} which outputs a metadata {@link IsmRecord} per window and key pair of:
- * <ul>
- * <li>Key 1: META key</li>
- * <li>Key 2: window</li>
- * <li>Key 3: index offset (1-based index)</li>
- * <li>Value: key</li>
- * </ul>
- *
- * <p>This {@link DoFn} is meant to be used to output index to key records
- * per window for map and multimap side inputs.
- */
- static class ToIsmMetadataRecordForKeyDoFn<K, V, W extends BoundedWindow>
- extends DoFn<KV<Integer, Iterable<KV<W, K>>>, IsmRecord<WindowedValue<V>>> {
-
- private final Coder<K> keyCoder;
- private final Coder<W> windowCoder;
- ToIsmMetadataRecordForKeyDoFn(Coder<K> keyCoder, Coder<W> windowCoder) {
- this.keyCoder = keyCoder;
- this.windowCoder = windowCoder;
- }
-
- @Override
- public void processElement(ProcessContext c) throws Exception {
- Iterator<KV<W, K>> iterator = c.element().getValue().iterator();
- KV<W, K> currentValue = iterator.next();
- Object currentWindowStructuralValue = windowCoder.structuralValue(currentValue.getKey());
- long elementsInWindow = 1;
- while (iterator.hasNext()) {
- KV<W, K> nextValue = iterator.next();
- Object nextWindowStructuralValue = windowCoder.structuralValue(nextValue.getKey());
-
- c.output(IsmRecord.<WindowedValue<V>>meta(
- ImmutableList.of(IsmFormat.getMetadataKey(), currentValue.getKey(), elementsInWindow),
- CoderUtils.encodeToByteArray(keyCoder, currentValue.getValue())));
- elementsInWindow += 1;
-
- if (!currentWindowStructuralValue.equals(nextWindowStructuralValue)) {
- elementsInWindow = 1;
- }
-
- currentValue = nextValue;
- currentWindowStructuralValue = nextWindowStructuralValue;
- }
-
- // Output the final value since it is guaranteed to be on a window boundary.
- c.output(IsmRecord.<WindowedValue<V>>meta(
- ImmutableList.of(IsmFormat.getMetadataKey(), currentValue.getKey(), elementsInWindow),
- CoderUtils.encodeToByteArray(keyCoder, currentValue.getValue())));
- }
- }
-
- /**
- * A {@link DoFn} which partitions sets of elements by window boundaries. Within each
- * partition, the set of elements is transformed into a {@link TransformedMap}.
- * The transformed {@code Map<K, Iterable<V>>} is backed by a
- * {@code Map<K, Iterable<WindowedValue<V>>>} and contains a function
- * {@code Iterable<WindowedValue<V>> -> Iterable<V>}.
- *
- * <p>Outputs {@link IsmRecord}s having:
- * <ul>
- * <li>Key 1: Window</li>
- * <li>Value: Transformed map containing a transform that removes the encapsulation
- * of the window around each value,
- * {@code Map<K, Iterable<WindowedValue<V>>> -> Map<K, Iterable<V>>}.</li>
- * </ul>
- */
- static class ToMultimapDoFn<K, V, W extends BoundedWindow>
- extends DoFn<KV<Integer, Iterable<KV<W, WindowedValue<KV<K, V>>>>>,
- IsmRecord<WindowedValue<TransformedMap<K,
- Iterable<WindowedValue<V>>,
- Iterable<V>>>>> {
-
- private final Coder<W> windowCoder;
- ToMultimapDoFn(Coder<W> windowCoder) {
- this.windowCoder = windowCoder;
- }
-
- @Override
- public void processElement(ProcessContext c)
- throws Exception {
- Optional<Object> previousWindowStructuralValue = Optional.absent();
- Optional<W> previousWindow = Optional.absent();
- Multimap<K, WindowedValue<V>> multimap = HashMultimap.create();
- for (KV<W, WindowedValue<KV<K, V>>> kv : c.element().getValue()) {
- Object currentWindowStructuralValue = windowCoder.structuralValue(kv.getKey());
- if (previousWindowStructuralValue.isPresent()
- && !previousWindowStructuralValue.get().equals(currentWindowStructuralValue)) {
- // Construct the transformed map containing all the elements since we
- // are at a window boundary.
- @SuppressWarnings({"unchecked", "rawtypes"})
- Map<K, Iterable<WindowedValue<V>>> resultMap = (Map) multimap.asMap();
- c.output(IsmRecord.<WindowedValue<TransformedMap<K,
- Iterable<WindowedValue<V>>,
- Iterable<V>>>>of(
- ImmutableList.of(previousWindow.get()),
- valueInEmptyWindows(
- new TransformedMap<>(
- IterableWithWindowedValuesToIterable.<V>of(), resultMap))));
- multimap = HashMultimap.create();
- }
-
- multimap.put(kv.getValue().getValue().getKey(),
- kv.getValue().withValue(kv.getValue().getValue().getValue()));
- previousWindowStructuralValue = Optional.of(currentWindowStructuralValue);
- previousWindow = Optional.of(kv.getKey());
- }
-
- // The last value for this hash is guaranteed to be at a window boundary
- // so we output a transformed map containing all the elements since the last
- // window boundary.
- @SuppressWarnings({"unchecked", "rawtypes"})
- Map<K, Iterable<WindowedValue<V>>> resultMap = (Map) multimap.asMap();
- c.output(IsmRecord.<WindowedValue<TransformedMap<K,
- Iterable<WindowedValue<V>>,
- Iterable<V>>>>of(
- ImmutableList.of(previousWindow.get()),
- valueInEmptyWindows(
- new TransformedMap<>(IterableWithWindowedValuesToIterable.<V>of(), resultMap))));
- }
- }
-
- private final DataflowPipelineRunner runner;
- /**
- * Builds an instance of this class from the overridden transform.
- */
- @SuppressWarnings("unused") // used via reflection in DataflowPipelineRunner#apply()
- public BatchViewAsMultimap(DataflowPipelineRunner runner, View.AsMultimap<K, V> transform) {
- this.runner = runner;
- }
-
- @Override
- public PCollectionView<Map<K, Iterable<V>>> apply(PCollection<KV<K, V>> input) {
- return this.<BoundedWindow>applyInternal(input);
- }
-
- private <W extends BoundedWindow> PCollectionView<Map<K, Iterable<V>>>
- applyInternal(PCollection<KV<K, V>> input) {
- @SuppressWarnings({"rawtypes", "unchecked"})
- KvCoder<K, V> inputCoder = (KvCoder) input.getCoder();
- try {
- PCollectionView<Map<K, Iterable<V>>> view = PCollectionViews.multimapView(
- input.getPipeline(), input.getWindowingStrategy(), inputCoder);
-
- return applyForMapLike(runner, input, view, false /* unique keys not expected */);
- } catch (NonDeterministicException e) {
- runner.recordViewUsesNonDeterministicKeyCoder(this);
-
- // Since the key coder is not deterministic, we convert the map into a singleton
- // and return a singleton view equivalent.
- return applyForSingletonFallback(input);
- }
- }
-
- /** Transforms the input {@link PCollection} into a singleton {@link Map} per window. */
- private <W extends BoundedWindow> PCollectionView<Map<K, Iterable<V>>>
- applyForSingletonFallback(PCollection<KV<K, V>> input) {
- @SuppressWarnings("unchecked")
- Coder<W> windowCoder = (Coder<W>)
- input.getWindowingStrategy().getWindowFn().windowCoder();
-
- @SuppressWarnings({"rawtypes", "unchecked"})
- KvCoder<K, V> inputCoder = (KvCoder) input.getCoder();
-
- @SuppressWarnings({"unchecked", "rawtypes"})
- Coder<Function<Iterable<WindowedValue<V>>, Iterable<V>>> transformCoder =
- (Coder) SerializableCoder.of(IterableWithWindowedValuesToIterable.class);
-
- Coder<TransformedMap<K, Iterable<WindowedValue<V>>, Iterable<V>>> finalValueCoder =
- TransformedMapCoder.of(
- transformCoder,
- MapCoder.of(
- inputCoder.getKeyCoder(),
- IterableCoder.of(
- FullWindowedValueCoder.of(inputCoder.getValueCoder(), windowCoder))));
-
- TransformedMap<K, Iterable<WindowedValue<V>>, Iterable<V>> defaultValue =
- new TransformedMap<>(
- IterableWithWindowedValuesToIterable.<V>of(),
- ImmutableMap.<K, Iterable<WindowedValue<V>>>of());
-
- return BatchViewAsSingleton.<KV<K, V>,
- TransformedMap<K, Iterable<WindowedValue<V>>, Iterable<V>>,
- Map<K, Iterable<V>>,
- W> applyForSingleton(
- runner,
- input,
- new ToMultimapDoFn<K, V, W>(windowCoder),
- true,
- defaultValue,
- finalValueCoder);
- }
-
- private static <K, V, W extends BoundedWindow, ViewT> PCollectionView<ViewT> applyForMapLike(
- DataflowPipelineRunner runner,
- PCollection<KV<K, V>> input,
- PCollectionView<ViewT> view,
- boolean uniqueKeysExpected) throws NonDeterministicException {
-
- @SuppressWarnings("unchecked")
- Coder<W> windowCoder = (Coder<W>)
- input.getWindowingStrategy().getWindowFn().windowCoder();
-
- @SuppressWarnings({"rawtypes", "unchecked"})
- KvCoder<K, V> inputCoder = (KvCoder) input.getCoder();
-
- // If our key coder is deterministic, we can use the key portion of each KV
- // as part of a composite key containing the key, window and index.
- inputCoder.getKeyCoder().verifyDeterministic();
-
- IsmRecordCoder<WindowedValue<V>> ismCoder =
- coderForMapLike(windowCoder, inputCoder.getKeyCoder(), inputCoder.getValueCoder());
-
- // Create the various output tags representing the main output containing the data stream
- // and the side outputs containing the metadata about the size and entry set.
- TupleTag<IsmRecord<WindowedValue<V>>> mainOutputTag = new TupleTag<>();
- TupleTag<KV<Integer, KV<W, Long>>> outputForSizeTag = new TupleTag<>();
- TupleTag<KV<Integer, KV<W, K>>> outputForEntrySetTag = new TupleTag<>();
-
- // Process all the elements grouped by key hash, and sorted by key and then window
- // outputting to all the outputs defined above.
- PCollectionTuple outputTuple = input
- .apply("GBKaSVForData", new GroupByKeyHashAndSortByKeyAndWindow<K, V, W>(ismCoder))
- .apply(ParDo.of(new ToIsmRecordForMapLikeDoFn<K, V, W>(
- outputForSizeTag, outputForEntrySetTag,
- windowCoder, inputCoder.getKeyCoder(), ismCoder, uniqueKeysExpected))
- .withOutputTags(mainOutputTag,
- TupleTagList.of(
- ImmutableList.<TupleTag<?>>of(outputForSizeTag,
- outputForEntrySetTag))));
-
- // Set the coder on the main data output.
- PCollection<IsmRecord<WindowedValue<V>>> perHashWithReifiedWindows =
- outputTuple.get(mainOutputTag);
- perHashWithReifiedWindows.setCoder(ismCoder);
-
- // Set the coder on the metadata output for size and process the entries
- // producing a [META, Window, 0L] record per window storing the number of unique keys
- // for each window.
- PCollection<KV<Integer, KV<W, Long>>> outputForSize = outputTuple.get(outputForSizeTag);
- outputForSize.setCoder(
- KvCoder.of(VarIntCoder.of(),
- KvCoder.of(windowCoder, VarLongCoder.of())));
- PCollection<IsmRecord<WindowedValue<V>>> windowMapSizeMetadata = outputForSize
- .apply("GBKaSVForSize", new GroupByKeyAndSortValuesOnly<Integer, W, Long>())
- .apply(ParDo.of(new ToIsmMetadataRecordForSizeDoFn<K, V, W>(windowCoder)));
- windowMapSizeMetadata.setCoder(ismCoder);
-
- // Set the coder on the metadata output destined to build the entry set and process the
- // entries producing a [META, Window, Index] record per window key pair storing the key.
- PCollection<KV<Integer, KV<W, K>>> outputForEntrySet =
- outputTuple.get(outputForEntrySetTag);
- outputForEntrySet.setCoder(
- KvCoder.of(VarIntCoder.of(),
- KvCoder.of(windowCoder, inputCoder.getKeyCoder())));
- PCollection<IsmRecord<WindowedValue<V>>> windowMapKeysMetadata = outputForEntrySet
- .apply("GBKaSVForKeys", new GroupByKeyAndSortValuesOnly<Integer, W, K>())
- .apply(ParDo.of(
- new ToIsmMetadataRecordForKeyDoFn<K, V, W>(inputCoder.getKeyCoder(), windowCoder)));
- windowMapKeysMetadata.setCoder(ismCoder);
-
- // Set that all these outputs should be materialized using an indexed format.
- runner.addPCollectionRequiringIndexedFormat(perHashWithReifiedWindows);
- runner.addPCollectionRequiringIndexedFormat(windowMapSizeMetadata);
- runner.addPCollectionRequiringIndexedFormat(windowMapKeysMetadata);
-
- PCollectionList<IsmRecord<WindowedValue<V>>> outputs =
- PCollectionList.of(ImmutableList.of(
- perHashWithReifiedWindows, windowMapSizeMetadata, windowMapKeysMetadata));
-
- return Pipeline.applyTransform(outputs,
- Flatten.<IsmRecord<WindowedValue<V>>>pCollections())
- .apply(CreatePCollectionView.<IsmRecord<WindowedValue<V>>,
- ViewT>of(view));
- }
-
- @Override
- protected String getKindString() {
- return "BatchViewAsMultimap";
- }
-
- static <V> IsmRecordCoder<WindowedValue<V>> coderForMapLike(
- Coder<? extends BoundedWindow> windowCoder, Coder<?> keyCoder, Coder<V> valueCoder) {
- // TODO: swap to use a variable length long coder which has values which compare
- // the same as their byte representation compare lexicographically within the key coder
- return IsmRecordCoder.of(
- 1, // We use only the key for hashing when producing value records
- 2, // Since the key is not present, we add the window to the hash when
- // producing metadata records
- ImmutableList.of(
- MetadataKeyCoder.of(keyCoder),
- windowCoder,
- BigEndianLongCoder.of()),
- FullWindowedValueCoder.of(valueCoder, windowCoder));
- }
- }
-
- /**
- * A {@code Map<K, V2>} backed by a {@code Map<K, V1>} and a function that transforms
- * {@code V1 -> V2}.
- */
- static class TransformedMap<K, V1, V2>
- extends ForwardingMap<K, V2> {
- private final Function<V1, V2> transform;
- private final Map<K, V1> originalMap;
- private final Map<K, V2> transformedMap;
-
- private TransformedMap(Function<V1, V2> transform, Map<K, V1> originalMap) {
- this.transform = transform;
- this.originalMap = Collections.unmodifiableMap(originalMap);
- this.transformedMap = Maps.transformValues(originalMap, transform);
- }
-
- @Override
- protected Map<K, V2> delegate() {
- return transformedMap;
- }
- }
-
- /**
- * A {@link Coder} for {@link TransformedMap}s.
- */
- static class TransformedMapCoder<K, V1, V2>
- extends StandardCoder<TransformedMap<K, V1, V2>> {
- private final Coder<Function<V1, V2>> transformCoder;
- private final Coder<Map<K, V1>> originalMapCoder;
-
- private TransformedMapCoder(
- Coder<Function<V1, V2>> transformCoder, Coder<Map<K, V1>> originalMapCoder) {
- this.transformCoder = transformCoder;
- this.originalMapCoder = originalMapCoder;
- }
-
- public static <K, V1, V2> TransformedMapCoder<K, V1, V2> of(
- Coder<Function<V1, V2>> transformCoder, Coder<Map<K, V1>> originalMapCoder) {
- return new TransformedMapCoder<>(transformCoder, originalMapCoder);
- }
-
- @JsonCreator
- public static <K, V1, V2> TransformedMapCoder<K, V1, V2> of(
- @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
- List<Coder<?>> components) {
- checkArgument(components.size() == 2,
- "Expecting 2 components, got " + components.size());
- @SuppressWarnings("unchecked")
- Coder<Function<V1, V2>> transformCoder = (Coder<Function<V1, V2>>) components.get(0);
- @SuppressWarnings("unchecked")
- Coder<Map<K, V1>> originalMapCoder = (Coder<Map<K, V1>>) components.get(1);
- return of(transformCoder, originalMapCoder);
- }
-
- @Override
- public void encode(TransformedMap<K, V1, V2> value, OutputStream outStream,
- Coder.Context context) throws CoderException, IOException {
- transformCoder.encode(value.transform, outStream, context.nested());
- originalMapCoder.encode(value.originalMap, outStream, context.nested());
- }
-
- @Override
- public TransformedMap<K, V1, V2> decode(
- InputStream inStream, Coder.Context context) throws CoderException, IOException {
- return new TransformedMap<>(
- transformCoder.decode(inStream, context.nested()),
- originalMapCoder.decode(inStream, context.nested()));
- }
-
- @Override
- public List<? extends Co
<TRUNCATED>
[31/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InMemoryWatermarkManager.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InMemoryWatermarkManager.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InMemoryWatermarkManager.java
deleted file mode 100644
index a9a62a6..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InMemoryWatermarkManager.java
+++ /dev/null
@@ -1,1310 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.TimeDomain;
-import com.google.cloud.dataflow.sdk.util.TimerInternals;
-import com.google.cloud.dataflow.sdk.util.TimerInternals.TimerData;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PValue;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.MoreObjects;
-import com.google.common.collect.ComparisonChain;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableSet;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Ordering;
-import com.google.common.collect.SortedMultiset;
-import com.google.common.collect.TreeMultiset;
-
-import org.joda.time.Instant;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.EnumMap;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.NavigableSet;
-import java.util.Objects;
-import java.util.PriorityQueue;
-import java.util.Set;
-import java.util.TreeSet;
-import java.util.concurrent.atomic.AtomicReference;
-
-import javax.annotation.Nullable;
-
-/**
- * Manages watermarks of {@link PCollection PCollections} and input and output watermarks of
- * {@link AppliedPTransform AppliedPTransforms} to provide event-time and completion tracking for
- * in-memory execution. {@link InMemoryWatermarkManager} is designed to update and return a
- * consistent view of watermarks in the presence of concurrent updates.
- *
- * <p>An {@link InMemoryWatermarkManager} is provided with the collection of root
- * {@link AppliedPTransform AppliedPTransforms} and a map of {@link PCollection PCollections} to
- * all the {@link AppliedPTransform AppliedPTransforms} that consume them at construction time.
- *
- * <p>Whenever a root {@link AppliedPTransform transform} produces elements, the
- * {@link InMemoryWatermarkManager} is provided with the produced elements and the output watermark
- * of the producing {@link AppliedPTransform transform}. The
- * {@link InMemoryWatermarkManager watermark manager} is responsible for computing the watermarks
- * of all {@link AppliedPTransform transforms} that consume one or more
- * {@link PCollection PCollections}.
- *
- * <p>Whenever a non-root {@link AppliedPTransform} finishes processing one or more in-flight
- * elements (referred to as the input {@link CommittedBundle bundle}), the following occurs
- * atomically:
- * <ul>
- * <li>All of the in-flight elements are removed from the collection of pending elements for the
- * {@link AppliedPTransform}.</li>
- * <li>All of the elements produced by the {@link AppliedPTransform} are added to the collection
- * of pending elements for each {@link AppliedPTransform} that consumes them.</li>
- * <li>The input watermark for the {@link AppliedPTransform} becomes the maximum value of
- * <ul>
- * <li>the previous input watermark</li>
- * <li>the minimum of
- * <ul>
- * <li>the timestamps of all currently pending elements</li>
- * <li>all input {@link PCollection} watermarks</li>
- * </ul>
- * </li>
- * </ul>
- * </li>
- * <li>The output watermark for the {@link AppliedPTransform} becomes the maximum of
- * <ul>
- * <li>the previous output watermark</li>
- * <li>the minimum of
- * <ul>
- * <li>the current input watermark</li>
- * <li>the current watermark holds</li>
- * </ul>
- * </li>
- * </ul>
- * </li>
- * <li>The watermark of the output {@link PCollection} can be advanced to the output watermark of
- * the {@link AppliedPTransform}</li>
- * <li>The watermark of all downstream {@link AppliedPTransform AppliedPTransforms} can be
- * advanced.</li>
- * </ul>
- *
- * <p>The watermark of a {@link PCollection} is equal to the output watermark of the
- * {@link AppliedPTransform} that produces it.
- *
- * <p>The watermarks for a {@link PTransform} are updated as follows when output is committed:<pre>
- * Watermark_In' = MAX(Watermark_In, MIN(U(TS_Pending), U(Watermark_InputPCollection)))
- * Watermark_Out' = MAX(Watermark_Out, MIN(Watermark_In', U(StateHold)))
- * Watermark_PCollection = Watermark_Out_ProducingPTransform
- * </pre>
- */
-public class InMemoryWatermarkManager {
- /**
- * The watermark of some {@link Pipeline} element, usually a {@link PTransform} or a
- * {@link PCollection}.
- *
- * <p>A watermark is a monotonically increasing value, which represents the point up to which the
- * system believes it has received all of the data. Data that arrives with a timestamp that is
- * before the watermark is considered late. {@link BoundedWindow#TIMESTAMP_MAX_VALUE} is a special
- * timestamp which indicates we have received all of the data and there will be no more on-time or
- * late data. This value is represented by {@link InMemoryWatermarkManager#THE_END_OF_TIME}.
- */
- private static interface Watermark {
- /**
- * Returns the current value of this watermark.
- */
- Instant get();
-
- /**
- * Refreshes the value of this watermark from its input watermarks and watermark holds.
- *
- * @return {@link WatermarkUpdate#ADVANCED} if the value of the watermark has changed (and thus
- * dependent watermarks must also be updated), {@link WatermarkUpdate#NO_CHANGE} otherwise
- */
- WatermarkUpdate refresh();
- }
-
- /**
- * The result of computing a {@link Watermark}.
- */
- private static enum WatermarkUpdate {
- /** The watermark is later than the value at the previous time it was computed. */
- ADVANCED(true),
- /** The watermark is equal to the value at the previous time it was computed. */
- NO_CHANGE(false);
-
- private final boolean advanced;
-
- private WatermarkUpdate(boolean advanced) {
- this.advanced = advanced;
- }
-
- public boolean isAdvanced() {
- return advanced;
- }
-
- /**
- * Returns the {@link WatermarkUpdate} that is a result of combining the two watermark updates.
- *
- * <p>If either of the input {@link WatermarkUpdate WatermarkUpdates} was advanced, the result
- * {@link WatermarkUpdate} is advanced as well.
- */
- public WatermarkUpdate union(WatermarkUpdate that) {
- if (this.advanced) {
- return this;
- }
- return that;
- }
-
- /**
- * Returns the {@link WatermarkUpdate} based on the former and current
- * {@link Instant timestamps}.
- */
- public static WatermarkUpdate fromTimestamps(Instant oldTime, Instant currentTime) {
- if (currentTime.isAfter(oldTime)) {
- return ADVANCED;
- }
- return NO_CHANGE;
- }
- }
-
- /**
- * The input {@link Watermark} of an {@link AppliedPTransform}.
- *
- * <p>At any point, the value of an {@link AppliedPTransformInputWatermark} is equal to the
- * minimum watermark across all of its input {@link Watermark Watermarks}, and the minimum
- * timestamp of all of the pending elements, restricted to be monotonically increasing.
- *
- * <p>See {@link #refresh()} for more information.
- */
- private static class AppliedPTransformInputWatermark implements Watermark {
-   /** The watermarks that this input watermark is held back by. */
-   private final Collection<? extends Watermark> inputWatermarks;
-   /** Elements that were added as pending and not yet removed, sorted by timestamp. */
-   private final SortedMultiset<WindowedValue<?>> pendingElements;
-   /** Event-time timers that were set and not yet deleted or extracted, grouped by key. */
-   private final Map<Object, NavigableSet<TimerData>> objectTimers;
-
-   private AtomicReference<Instant> currentWatermark;
-
-   public AppliedPTransformInputWatermark(Collection<? extends Watermark> inputWatermarks) {
-     this.inputWatermarks = inputWatermarks;
-     this.pendingElements = TreeMultiset.create(PENDING_ELEMENT_COMPARATOR);
-     this.objectTimers = new HashMap<>();
-     this.currentWatermark = new AtomicReference<>(BoundedWindow.TIMESTAMP_MIN_VALUE);
-   }
-
-   /** Returns the value computed by the most recent call to {@link #refresh()}. */
-   @Override
-   public Instant get() {
-     return currentWatermark.get();
-   }
-
-   /**
-    * {@inheritDoc}.
-    *
-    * <p>After a refresh, this watermark equals the later of
-    * <ul>
-    *   <li>its previous value, and</li>
-    *   <li>the earliest of all input watermarks and the timestamp of the earliest pending
-    *       element, if there is one.</li>
-    * </ul>
-    * The watermark therefore never moves backwards.
-    */
-   @Override
-   public synchronized WatermarkUpdate refresh() {
-     Instant previous = currentWatermark.get();
-     Instant earliestInput = BoundedWindow.TIMESTAMP_MAX_VALUE;
-     for (Watermark upstream : inputWatermarks) {
-       earliestInput = INSTANT_ORDERING.min(earliestInput, upstream.get());
-     }
-     if (!pendingElements.isEmpty()) {
-       // The multiset is sorted by timestamp, so its first entry is the earliest pending element.
-       Instant earliestPending = pendingElements.firstEntry().getElement().getTimestamp();
-       earliestInput = INSTANT_ORDERING.min(earliestInput, earliestPending);
-     }
-     Instant updated = INSTANT_ORDERING.max(previous, earliestInput);
-     currentWatermark.set(updated);
-     return WatermarkUpdate.fromTimestamps(previous, updated);
-   }
-
-   /** Records every provided element as pending. */
-   private synchronized void addPendingElements(Iterable<? extends WindowedValue<?>> newPending) {
-     for (WindowedValue<?> element : newPending) {
-       pendingElements.add(element);
-     }
-   }
-
-   /** Removes one pending occurrence of every provided element. */
-   private synchronized void removePendingElements(
-       Iterable<? extends WindowedValue<?>> finishedElements) {
-     for (WindowedValue<?> element : finishedElements) {
-       pendingElements.remove(element);
-     }
-   }
-
-   /**
-    * Applies the event-time portion of the update to the timers of the update's key. Timers in
-    * other time domains are ignored.
-    */
-   private synchronized void updateTimers(TimerUpdate update) {
-     NavigableSet<TimerData> timersForKey = objectTimers.get(update.key);
-     if (timersForKey == null) {
-       timersForKey = new TreeSet<>();
-       objectTimers.put(update.key, timersForKey);
-     }
-     for (TimerData added : update.setTimers) {
-       if (added.getDomain() == TimeDomain.EVENT_TIME) {
-         timersForKey.add(added);
-       }
-     }
-     for (TimerData deleted : update.deletedTimers) {
-       if (deleted.getDomain() == TimeDomain.EVENT_TIME) {
-         timersForKey.remove(deleted);
-       }
-     }
-     // Completed timers need no handling here: timers are dropped from objectTimers as soon as
-     // they are handed out by #extractFiredEventTimeTimers().
-   }
-
-   /** Removes and returns, per key, the event-time timers that are before the current watermark. */
-   private synchronized Map<Object, List<TimerData>> extractFiredEventTimeTimers() {
-     return extractFiredTimers(currentWatermark.get(), objectTimers);
-   }
-
-   @Override
-   public synchronized String toString() {
-     return MoreObjects.toStringHelper(AppliedPTransformInputWatermark.class)
-         .add("pendingElements", pendingElements)
-         .add("currentWatermark", currentWatermark)
-         .toString();
-   }
- }
-
- /**
- * The output {@link Watermark} of an {@link AppliedPTransform}.
- *
- * <p>The value of an {@link AppliedPTransformOutputWatermark} is equal to the minimum of the
- * current watermark hold and the {@link AppliedPTransformInputWatermark} for the same
- * {@link AppliedPTransform}, restricted to be monotonically increasing. See
- * {@link #refresh()} for more information.
- */
- private static class AppliedPTransformOutputWatermark implements Watermark {
-   private final Watermark inputWatermark;
-   private final PerKeyHolds holds;
-   private AtomicReference<Instant> currentWatermark;
-
-   public AppliedPTransformOutputWatermark(AppliedPTransformInputWatermark inputWatermark) {
-     this.inputWatermark = inputWatermark;
-     this.holds = new PerKeyHolds();
-     this.currentWatermark = new AtomicReference<>(BoundedWindow.TIMESTAMP_MIN_VALUE);
-   }
-
-   /**
-    * Replaces the hold for the provided key with {@code newHold}, or removes the key's hold when
-    * {@code newHold} is {@code null}.
-    */
-   public synchronized void updateHold(Object key, Instant newHold) {
-     if (newHold != null) {
-       holds.updateHold(key, newHold);
-       return;
-     }
-     holds.removeHold(key);
-   }
-
-   /** Returns the value computed by the most recent call to {@link #refresh()}. */
-   @Override
-   public Instant get() {
-     return currentWatermark.get();
-   }
-
-   /**
-    * {@inheritDoc}.
-    *
-    * <p>After a refresh, this watermark equals the later of
-    * <ul>
-    *   <li>its previous value, and</li>
-    *   <li>the earlier of the current input watermark and the minimum watermark hold.</li>
-    * </ul>
-    */
-   @Override
-   public synchronized WatermarkUpdate refresh() {
-     Instant previous = currentWatermark.get();
-     Instant heldBack = INSTANT_ORDERING.min(inputWatermark.get(), holds.getMinHold());
-     // Never let the output watermark move backwards.
-     Instant updated = INSTANT_ORDERING.max(previous, heldBack);
-     currentWatermark.set(updated);
-     return WatermarkUpdate.fromTimestamps(previous, updated);
-   }
-
-   @Override
-   public synchronized String toString() {
-     return MoreObjects.toStringHelper(AppliedPTransformOutputWatermark.class)
-         .add("holds", holds)
-         .add("currentWatermark", currentWatermark)
-         .toString();
-   }
- }
-
- /**
- * The input {@link TimeDomain#SYNCHRONIZED_PROCESSING_TIME} hold for an
- * {@link AppliedPTransform}.
- *
- * <p>At any point, the hold value of a {@link SynchronizedProcessingTimeInputWatermark} is equal
- * to the minimum across all pending bundles at the {@link AppliedPTransform} and all upstream
- * {@link TimeDomain#SYNCHRONIZED_PROCESSING_TIME} watermarks. The value of the input
- * synchronized processing time at any step is equal to the maximum of:
- * <ul>
- * <li>The most recently returned synchronized processing input time
- * <li>The minimum of
- * <ul>
- * <li>The current processing time
- * <li>The current synchronized processing time input hold
- * </ul>
- * </ul>
- */
- private static class SynchronizedProcessingTimeInputWatermark implements Watermark {
- private final Collection<? extends Watermark> inputWms;
- private final Collection<CommittedBundle<?>> pendingBundles;
- private final Map<Object, NavigableSet<TimerData>> processingTimers;
- private final Map<Object, NavigableSet<TimerData>> synchronizedProcessingTimers;
-
- private final PriorityQueue<TimerData> pendingTimers;
-
- private AtomicReference<Instant> earliestHold;
-
- public SynchronizedProcessingTimeInputWatermark(Collection<? extends Watermark> inputWms) {
- this.inputWms = inputWms;
- this.pendingBundles = new HashSet<>();
- this.processingTimers = new HashMap<>();
- this.synchronizedProcessingTimers = new HashMap<>();
- this.pendingTimers = new PriorityQueue<>();
- Instant initialHold = BoundedWindow.TIMESTAMP_MAX_VALUE;
- for (Watermark wm : inputWms) {
- initialHold = INSTANT_ORDERING.min(initialHold, wm.get());
- }
- earliestHold = new AtomicReference<>(initialHold);
- }
-
- @Override
- public Instant get() {
- return earliestHold.get();
- }
-
- /**
- * {@inheritDoc}.
- *
- * <p>When refresh is called, the value of the {@link SynchronizedProcessingTimeInputWatermark}
- * becomes equal to the minimum value of
- * <ul>
- * <li>the timestamps of all currently pending bundles</li>
- * <li>all input {@link PCollection} synchronized processing time watermarks</li>
- * </ul>
- *
- * <p>Note that this value is not monotonic, but the returned value for the synchronized
- * processing time must be.
- */
- @Override
- public synchronized WatermarkUpdate refresh() {
- Instant oldHold = earliestHold.get();
- Instant minTime = THE_END_OF_TIME.get();
- for (Watermark input : inputWms) {
- minTime = INSTANT_ORDERING.min(minTime, input.get());
- }
- for (CommittedBundle<?> bundle : pendingBundles) {
- // TODO: Track elements in the bundle by the processing time they were output instead of
- // entire bundles. Requried to support arbitrarily splitting and merging bundles between
- // steps
- minTime = INSTANT_ORDERING.min(minTime, bundle.getSynchronizedProcessingOutputWatermark());
- }
- earliestHold.set(minTime);
- return WatermarkUpdate.fromTimestamps(oldHold, minTime);
- }
-
- public synchronized void addPending(CommittedBundle<?> bundle) {
- pendingBundles.add(bundle);
- }
-
- public synchronized void removePending(CommittedBundle<?> bundle) {
- pendingBundles.remove(bundle);
- }
-
- /**
- * Return the earliest timestamp of the earliest timer that has not been completed. This is
- * either the earliest timestamp across timers that have not been completed, or the earliest
- * timestamp across timers that have been delivered but have not been completed.
- */
- public synchronized Instant getEarliestTimerTimestamp() {
- Instant earliest = THE_END_OF_TIME.get();
- for (NavigableSet<TimerData> timers : processingTimers.values()) {
- if (!timers.isEmpty()) {
- earliest = INSTANT_ORDERING.min(timers.first().getTimestamp(), earliest);
- }
- }
- for (NavigableSet<TimerData> timers : synchronizedProcessingTimers.values()) {
- if (!timers.isEmpty()) {
- earliest = INSTANT_ORDERING.min(timers.first().getTimestamp(), earliest);
- }
- }
- if (!pendingTimers.isEmpty()) {
- earliest = INSTANT_ORDERING.min(pendingTimers.peek().getTimestamp(), earliest);
- }
- return earliest;
- }
-
- private synchronized void updateTimers(TimerUpdate update) {
- for (TimerData completedTimer : update.completedTimers) {
- pendingTimers.remove(completedTimer);
- }
- Map<TimeDomain, NavigableSet<TimerData>> timerMap = timerMap(update.key);
- for (TimerData addedTimer : update.setTimers) {
- NavigableSet<TimerData> timerQueue = timerMap.get(addedTimer.getDomain());
- if (timerQueue != null) {
- timerQueue.add(addedTimer);
- }
- }
- for (TimerData deletedTimer : update.deletedTimers) {
- NavigableSet<TimerData> timerQueue = timerMap.get(deletedTimer.getDomain());
- if (timerQueue != null) {
- timerQueue.remove(deletedTimer);
- }
- }
- }
-
- private synchronized Map<Object, List<TimerData>> extractFiredDomainTimers(
- TimeDomain domain, Instant firingTime) {
- Map<Object, List<TimerData>> firedTimers;
- switch (domain) {
- case PROCESSING_TIME:
- firedTimers = extractFiredTimers(firingTime, processingTimers);
- break;
- case SYNCHRONIZED_PROCESSING_TIME:
- firedTimers =
- extractFiredTimers(
- INSTANT_ORDERING.min(firingTime, earliestHold.get()),
- synchronizedProcessingTimers);
- break;
- default:
- throw new IllegalArgumentException(
- "Called getFiredTimers on a Synchronized Processing Time watermark"
- + " and gave a non-processing time domain "
- + domain);
- }
- for (Map.Entry<Object, ? extends Collection<TimerData>> firedTimer : firedTimers.entrySet()) {
- pendingTimers.addAll(firedTimer.getValue());
- }
- return firedTimers;
- }
-
- private Map<TimeDomain, NavigableSet<TimerData>> timerMap(Object key) {
- NavigableSet<TimerData> processingQueue = processingTimers.get(key);
- if (processingQueue == null) {
- processingQueue = new TreeSet<>();
- processingTimers.put(key, processingQueue);
- }
- NavigableSet<TimerData> synchronizedProcessingQueue =
- synchronizedProcessingTimers.get(key);
- if (synchronizedProcessingQueue == null) {
- synchronizedProcessingQueue = new TreeSet<>();
- synchronizedProcessingTimers.put(key, synchronizedProcessingQueue);
- }
- EnumMap<TimeDomain, NavigableSet<TimerData>> result = new EnumMap<>(TimeDomain.class);
- result.put(TimeDomain.PROCESSING_TIME, processingQueue);
- result.put(TimeDomain.SYNCHRONIZED_PROCESSING_TIME, synchronizedProcessingQueue);
- return result;
- }
-
- @Override
- public synchronized String toString() {
- return MoreObjects.toStringHelper(SynchronizedProcessingTimeInputWatermark.class)
- .add("earliestHold", earliestHold)
- .toString();
- }
- }
-
- /**
- * The output {@link TimeDomain#SYNCHRONIZED_PROCESSING_TIME} hold for an
- * {@link AppliedPTransform}.
- *
- * <p>At any point, the hold value of a {@link SynchronizedProcessingTimeOutputWatermark} is
- * equal to the minimum across all incomplete timers at the {@link AppliedPTransform} and all
- * upstream {@link TimeDomain#SYNCHRONIZED_PROCESSING_TIME} watermarks. The value of the output
- * synchronized processing time at any step is equal to the maximum of:
- * <ul>
- * <li>The most recently returned synchronized processing output time
- * <li>The minimum of
- * <ul>
- * <li>The current processing time
- * <li>The current synchronized processing time output hold
- * </ul>
- * </ul>
- */
- private static class SynchronizedProcessingTimeOutputWatermark implements Watermark {
-   private final SynchronizedProcessingTimeInputWatermark inputWm;
-   private AtomicReference<Instant> latestRefresh;
-
-   public SynchronizedProcessingTimeOutputWatermark(
-       SynchronizedProcessingTimeInputWatermark inputWm) {
-     this.inputWm = inputWm;
-     this.latestRefresh = new AtomicReference<>(BoundedWindow.TIMESTAMP_MIN_VALUE);
-   }
-
-   /** Returns the value computed by the most recent call to {@link #refresh()}. */
-   @Override
-   public Instant get() {
-     return latestRefresh.get();
-   }
-
-   /**
-    * {@inheritDoc}.
-    *
-    * <p>After a refresh, this watermark equals the earlier of
-    * <ul>
-    *   <li>the current value of the synchronized processing time input watermark, and
-    *   <li>the earliest timestamp of any timer on the input watermark that has not been
-    *       completed.
-    * </ul>
-    *
-    * <p>This value is not monotonic; callers that need a monotonic synchronized processing time
-    * must enforce that themselves.
-    */
-   @Override
-   public synchronized WatermarkUpdate refresh() {
-     Instant previous = latestRefresh.get();
-     // The input watermark accounts for buffered bundles; the earliest incomplete timer
-     // determines what downstream timers have to be held to.
-     Instant inputHold = inputWm.get();
-     Instant earliestTimer = inputWm.getEarliestTimerTimestamp();
-     Instant updated = INSTANT_ORDERING.min(inputHold, earliestTimer);
-     latestRefresh.set(updated);
-     return WatermarkUpdate.fromTimestamps(previous, updated);
-   }
-
-   @Override
-   public synchronized String toString() {
-     return MoreObjects.toStringHelper(SynchronizedProcessingTimeOutputWatermark.class)
-         .add("latestRefresh", latestRefresh)
-         .toString();
-   }
- }
-
- /**
- * The {@code Watermark} that is after the latest time it is possible to represent in the global
- * window. This is a distinguished value representing a complete {@link PTransform}.
- */
- private static final Watermark THE_END_OF_TIME = new Watermark() {
-   /** Always returns {@link BoundedWindow#TIMESTAMP_MAX_VALUE}. */
-   @Override
-   public Instant get() {
-     return BoundedWindow.TIMESTAMP_MAX_VALUE;
-   }
-
-   /** Always reports no change, because this watermark has a fixed value. */
-   @Override
-   public WatermarkUpdate refresh() {
-     return WatermarkUpdate.NO_CHANGE;
-   }
- };
-
- private static final Ordering<Instant> INSTANT_ORDERING = Ordering.natural();
-
- /**
- * An ordering that compares windowed values by timestamp, then arbitrarily. This ensures that
- * {@link WindowedValue WindowedValues} will be sorted by timestamp, while two different
- * {@link WindowedValue WindowedValues} with the same timestamp are not considered equal.
- */
- private static final Ordering<WindowedValue<? extends Object>> PENDING_ELEMENT_COMPARATOR =
- (new WindowedValueByTimestampComparator()).compound(Ordering.arbitrary());
-
- /**
-  * For each (key, timer set) entry in the provided map, removes every timer whose timestamp is
-  * strictly before {@code latestTime} and adds it to the result under the same key, then removes
-  * from the provided map every key whose timer set is empty.
-  *
-  * <p>Each list in the result keeps the order in which the timers were taken from their sorted
-  * set. Keys for which no timer fired do not appear in the result.
-  */
- private static Map<Object, List<TimerData>> extractFiredTimers(
-     Instant latestTime, Map<Object, NavigableSet<TimerData>> objectTimers) {
-   Map<Object, List<TimerData>> firedByKey = new HashMap<>();
-   Set<Object> drainedKeys = new HashSet<>();
-   for (Map.Entry<Object, NavigableSet<TimerData>> entry : objectTimers.entrySet()) {
-     Object key = entry.getKey();
-     NavigableSet<TimerData> remaining = entry.getValue();
-     List<TimerData> fired = null;
-     while (!remaining.isEmpty() && remaining.first().getTimestamp().isBefore(latestTime)) {
-       if (fired == null) {
-         // Only create a result entry once the key is known to have at least one fired timer.
-         fired = new ArrayList<>();
-         firedByKey.put(key, fired);
-       }
-       fired.add(remaining.pollFirst());
-     }
-     if (remaining.isEmpty()) {
-       drainedKeys.add(key);
-     }
-   }
-   // Removal is deferred so the map is not modified while its entries are being iterated.
-   objectTimers.keySet().removeAll(drainedKeys);
-   return firedByKey;
- }
-
- ////////////////////////////////////////////////////////////////////////////////////////////////
-
- /**
- * The {@link Clock} providing the current time in the {@link TimeDomain#PROCESSING_TIME} domain.
- */
- private final Clock clock;
-
- /**
- * A map from each {@link PCollection} to all {@link AppliedPTransform PTransform applications}
- * that consume that {@link PCollection}.
- */
- private final Map<PValue, Collection<AppliedPTransform<?, ?, ?>>> consumers;
-
- /**
- * The input and output watermark of each {@link AppliedPTransform}.
- */
- private final Map<AppliedPTransform<?, ?, ?>, TransformWatermarks> transformToWatermarks;
-
- /**
- * Creates a new {@link InMemoryWatermarkManager}. Event-time watermarks within the newly created
- * manager start at {@link BoundedWindow#TIMESTAMP_MIN_VALUE}, with no holds or pending elements.
- *
- * @param clock the source of the current time in the {@link TimeDomain#PROCESSING_TIME} domain
- * @param rootTransforms the root-level transforms of the {@link Pipeline}
- * @param consumers a mapping between each {@link PCollection} in the {@link Pipeline} to the
- * transforms that consume it as a part of their input
- */
- public static InMemoryWatermarkManager create(
- Clock clock,
- Collection<AppliedPTransform<?, ?, ?>> rootTransforms,
- Map<PValue, Collection<AppliedPTransform<?, ?, ?>>> consumers) {
- return new InMemoryWatermarkManager(clock, rootTransforms, consumers);
- }
-
- private InMemoryWatermarkManager(
-     Clock clock,
-     Collection<AppliedPTransform<?, ?, ?>> rootTransforms,
-     Map<PValue, Collection<AppliedPTransform<?, ?, ?>>> consumers) {
-   this.clock = clock;
-   this.consumers = consumers;
-   this.transformToWatermarks = new HashMap<>();
-
-   // Register watermarks up front for every root transform and every consuming transform;
-   // getTransformWatermark creates the entry the first time a transform is seen.
-   for (AppliedPTransform<?, ?, ?> root : rootTransforms) {
-     getTransformWatermark(root);
-   }
-   for (Collection<AppliedPTransform<?, ?, ?>> consumersOfValue : consumers.values()) {
-     for (AppliedPTransform<?, ?, ?> consumer : consumersOfValue) {
-       getTransformWatermark(consumer);
-     }
-   }
- }
-
- /**
-  * Returns the {@link TransformWatermarks} for the provided transform, creating and registering
-  * them (and, through the input lookups, those of its producers) if they do not exist yet.
-  */
- private TransformWatermarks getTransformWatermark(AppliedPTransform<?, ?, ?> transform) {
-   TransformWatermarks existing = transformToWatermarks.get(transform);
-   if (existing != null) {
-     return existing;
-   }
-
-   // Event-time watermarks: the output watermark is derived from the input watermark.
-   AppliedPTransformInputWatermark eventInput =
-       new AppliedPTransformInputWatermark(getInputWatermarks(transform));
-   AppliedPTransformOutputWatermark eventOutput = new AppliedPTransformOutputWatermark(eventInput);
-
-   // Synchronized processing time watermarks, wired the same way.
-   SynchronizedProcessingTimeInputWatermark processingInput =
-       new SynchronizedProcessingTimeInputWatermark(getInputProcessingWatermarks(transform));
-   SynchronizedProcessingTimeOutputWatermark processingOutput =
-       new SynchronizedProcessingTimeOutputWatermark(processingInput);
-
-   TransformWatermarks created =
-       new TransformWatermarks(eventInput, eventOutput, processingInput, processingOutput);
-   transformToWatermarks.put(transform, created);
-   return created;
- }
-
- /**
-  * Returns the synchronized processing time output watermark of the producer of each input of
-  * the provided transform, or only {@code THE_END_OF_TIME} if the transform has no inputs.
-  */
- private Collection<Watermark> getInputProcessingWatermarks(
-     AppliedPTransform<?, ?, ?> transform) {
-   Collection<? extends PValue> inputs = transform.getInput().expand();
-   ImmutableList.Builder<Watermark> upstream = ImmutableList.builder();
-   if (inputs.isEmpty()) {
-     upstream.add(THE_END_OF_TIME);
-   }
-   for (PValue input : inputs) {
-     TransformWatermarks producer = getTransformWatermark(input.getProducingTransformInternal());
-     upstream.add(producer.synchronizedProcessingOutputWatermark);
-   }
-   return upstream.build();
- }
-
- /**
-  * Returns the event-time output watermark of the producer of each input of the provided
-  * transform, or only {@code THE_END_OF_TIME} if the transform has no inputs.
-  */
- private List<Watermark> getInputWatermarks(AppliedPTransform<?, ?, ?> transform) {
-   Collection<? extends PValue> inputs = transform.getInput().expand();
-   ImmutableList.Builder<Watermark> upstream = ImmutableList.builder();
-   if (inputs.isEmpty()) {
-     upstream.add(THE_END_OF_TIME);
-   }
-   for (PValue input : inputs) {
-     TransformWatermarks producer = getTransformWatermark(input.getProducingTransformInternal());
-     upstream.add(producer.outputWatermark);
-   }
-   return upstream.build();
- }
-
- ////////////////////////////////////////////////////////////////////////////////////////////////
-
- /**
- * Gets the {@link TransformWatermarks} registered for an {@link AppliedPTransform}. The returned
- * object is the instance tracked by this manager rather than a copy, so the values it reports
- * change as the watermarks are refreshed.
- *
- * @return the watermarks for the provided transform, or {@code null} if none are registered
- */
- public TransformWatermarks getWatermarks(AppliedPTransform<?, ?, ?> transform) {
- return transformToWatermarks.get(transform);
- }
-
- /**
-  * Updates the watermarks of a transform after it has finished processing an input.
-  *
-  * <p>Each transform has two monotonically increasing watermarks: the input watermark, which can,
-  * at any time, be updated to equal:
-  * <pre>
-  * MAX(CurrentInputWatermark, MIN(PendingElements, InputPCollectionWatermarks))
-  * </pre>
-  * and the output watermark, which can, at any time, be updated to equal:
-  * <pre>
-  * MAX(CurrentOutputWatermark, MIN(InputWatermark, WatermarkHolds))
-  * </pre>
-  *
-  * @param completed the input that has completed, or {@code null} if there was no input bundle
-  * @param transform the transform that has completed processing the input
-  * @param timerUpdate the timers that were set, deleted, and completed by the transform
-  * @param outputs the bundles the transform has output
-  * @param earliestHold the earliest watermark hold in the transform's state. {@code null} if there
-  *     is no hold
-  */
- public void updateWatermarks(
-     @Nullable CommittedBundle<?> completed,
-     AppliedPTransform<?, ?, ?> transform,
-     TimerUpdate timerUpdate,
-     Iterable<? extends CommittedBundle<?>> outputs,
-     @Nullable Instant earliestHold) {
-   // Pending elements and timers are brought up to date first, so that the refresh below sees
-   // them together with the new hold.
-   updatePending(completed, transform, timerUpdate, outputs);
-
-   TransformWatermarks watermarks = transformToWatermarks.get(transform);
-   // Without an input bundle the hold is recorded against the null key.
-   Object holdKey = (completed == null) ? null : completed.getKey();
-   watermarks.setEventTimeHold(holdKey, earliestHold);
-
-   refreshWatermarks(transform);
- }
-
- /**
-  * Refreshes the watermarks of the provided transform and, if that advanced them, recursively
-  * refreshes every transform that consumes one of its outputs.
-  */
- private void refreshWatermarks(AppliedPTransform<?, ?, ?> transform) {
-   WatermarkUpdate update = transformToWatermarks.get(transform).refresh();
-   if (!update.isAdvanced()) {
-     // Nothing changed, so no downstream watermark can be affected.
-     return;
-   }
-   for (PValue output : transform.getOutput().expand()) {
-     Collection<AppliedPTransform<?, ?, ?>> downstream = consumers.get(output);
-     if (downstream == null) {
-       continue;
-     }
-     for (AppliedPTransform<?, ?, ?> consumer : downstream) {
-       refreshWatermarks(consumer);
-     }
-   }
- }
-
- /**
-  * Removes all of the completed Timers from the collection of pending timers, adds all new timers,
-  * and removes all deleted timers. Removes all elements consumed by the input bundle from the
-  * {@link PTransform PTransform's} collection of pending elements, and adds all elements produced
-  * by the {@link PTransform} to the pending queue of each consumer.
-  *
-  * @param input the bundle the transform consumed, or {@code null} if there was none
-  * @param transform the transform that finished processing
-  * @param timerUpdate the timers set, deleted, and completed by the transform
-  * @param outputs the bundles produced by the transform
-  */
- private void updatePending(
-     CommittedBundle<?> input,
-     AppliedPTransform<?, ?, ?> transform,
-     TimerUpdate timerUpdate,
-     Iterable<? extends CommittedBundle<?>> outputs) {
-   TransformWatermarks completedTransform = transformToWatermarks.get(transform);
-   completedTransform.updateTimers(timerUpdate);
-   if (input != null) {
-     completedTransform.removePending(input);
-   }
-
-   for (CommittedBundle<?> bundle : outputs) {
-     Collection<AppliedPTransform<?, ?, ?>> bundleConsumers =
-         consumers.get(bundle.getPCollection());
-     if (bundleConsumers == null) {
-       // A PCollection without a consumers entry has nothing downstream to hold back. This
-       // mirrors the null check in refreshWatermarks and avoids a NullPointerException.
-       continue;
-     }
-     for (AppliedPTransform<?, ?, ?> consumer : bundleConsumers) {
-       TransformWatermarks watermarks = transformToWatermarks.get(consumer);
-       watermarks.addPending(bundle);
-     }
-   }
- }
-
- /**
-  * Extracts the fired timers of every registered transform and returns them keyed by transform
-  * and then by key. Transforms with no fired timers are omitted. The returned timers are removed
-  * from the sets of timers this {@link InMemoryWatermarkManager} is waiting to fire.
-  */
- public Map<AppliedPTransform<?, ?, ?>, Map<Object, FiredTimers>> extractFiredTimers() {
-   Map<AppliedPTransform<?, ?, ?>, Map<Object, FiredTimers>> firedByTransform = new HashMap<>();
-   for (Map.Entry<AppliedPTransform<?, ?, ?>, TransformWatermarks> entry :
-       transformToWatermarks.entrySet()) {
-     Map<Object, FiredTimers> firedByKey = entry.getValue().extractFiredTimers();
-     if (firedByKey.isEmpty()) {
-       continue;
-     }
-     firedByTransform.put(entry.getKey(), firedByKey);
-   }
-   return firedByTransform;
- }
-
- /**
- * A (key, Instant) pair that holds the watermark. Holds are per-key, but the watermark is global,
- * and as such the watermark manager must track holds and the release of holds on a per-key basis.
- *
- * <p>The {@link #compareTo(KeyedHold)} method of {@link KeyedHold} is not consistent with equals,
- * as the key is arbitrarily ordered via identity, rather than object equality.
- */
- private static final class KeyedHold implements Comparable<KeyedHold> {
- private static final Ordering<Object> KEY_ORDERING = Ordering.arbitrary().nullsLast();
-
- private final Object key;
- private final Instant timestamp;
-
- /**
- * Create a new KeyedHold with the specified key and timestamp.
- */
- public static KeyedHold of(Object key, Instant timestamp) {
- return new KeyedHold(key, MoreObjects.firstNonNull(timestamp, THE_END_OF_TIME.get()));
- }
-
- private KeyedHold(Object key, Instant timestamp) {
- this.key = key;
- this.timestamp = timestamp;
- }
-
- @Override
- public int compareTo(KeyedHold that) {
- return ComparisonChain.start()
- .compare(this.timestamp, that.timestamp)
- .compare(this.key, that.key, KEY_ORDERING)
- .result();
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(timestamp, key);
- }
-
- @Override
- public boolean equals(Object other) {
- if (other == null || !(other instanceof KeyedHold)) {
- return false;
- }
- KeyedHold that = (KeyedHold) other;
- return Objects.equals(this.timestamp, that.timestamp) && Objects.equals(this.key, that.key);
- }
-
- /**
- * Get the value of this {@link KeyedHold}.
- */
- public Instant getTimestamp() {
- return timestamp;
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(KeyedHold.class)
- .add("key", key)
- .add("hold", timestamp)
- .toString();
- }
- }
-
- /**
-  * Tracks at most one watermark hold per key, and exposes the minimum hold across all keys.
-  *
-  * <p>This class does no synchronization of its own.
-  */
- private static class PerKeyHolds {
-   /** The current hold of each key that has one. */
-   private final Map<Object, KeyedHold> keyedHolds;
-   /** The same holds as {@link #keyedHolds}, ordered so the minimum hold is at the head. */
-   private final PriorityQueue<KeyedHold> allHolds;
-
-   private PerKeyHolds() {
-     this.keyedHolds = new HashMap<>();
-     this.allHolds = new PriorityQueue<>();
-   }
-
-   /**
-    * Gets the minimum hold across all keys in this {@link PerKeyHolds}, or THE_END_OF_TIME if
-    * there are no holds within this {@link PerKeyHolds}.
-    */
-   public Instant getMinHold() {
-     return allHolds.isEmpty() ? THE_END_OF_TIME.get() : allHolds.peek().getTimestamp();
-   }
-
-   /**
-    * Updates the hold of the provided key to the provided value, removing any other holds for
-    * the same key.
-    */
-   public void updateHold(@Nullable Object key, Instant newHold) {
-     removeHold(key);
-     KeyedHold newKeyedHold = KeyedHold.of(key, newHold);
-     keyedHolds.put(key, newKeyedHold);
-     allHolds.offer(newKeyedHold);
-   }
-
-   /**
-    * Removes the hold of the provided key, if it has one.
-    */
-   public void removeHold(Object key) {
-     // Remove the map entry as well as the queue entry. Looking the hold up with get() left a
-     // stale entry behind for every key whose hold was released, so the map never shrank.
-     KeyedHold oldHold = keyedHolds.remove(key);
-     if (oldHold != null) {
-       allHolds.remove(oldHold);
-     }
-   }
- }
-
- /**
- * A reference to the input and output watermarks of an {@link AppliedPTransform}.
- */
- public class TransformWatermarks {
-   private final AppliedPTransformInputWatermark inputWatermark;
-   private final AppliedPTransformOutputWatermark outputWatermark;
-
-   private final SynchronizedProcessingTimeInputWatermark synchronizedProcessingInputWatermark;
-   private final SynchronizedProcessingTimeOutputWatermark synchronizedProcessingOutputWatermark;
-
-   // The most recently returned synchronized processing times. The underlying holds are not
-   // monotonic, so these are kept to make the returned values monotonic.
-   private Instant latestSynchronizedInputWm;
-   private Instant latestSynchronizedOutputWm;
-
-   private TransformWatermarks(
-       AppliedPTransformInputWatermark inputWatermark,
-       AppliedPTransformOutputWatermark outputWatermark,
-       SynchronizedProcessingTimeInputWatermark inputSynchProcessingWatermark,
-       SynchronizedProcessingTimeOutputWatermark outputSynchProcessingWatermark) {
-     this.inputWatermark = inputWatermark;
-     this.outputWatermark = outputWatermark;
-
-     this.synchronizedProcessingInputWatermark = inputSynchProcessingWatermark;
-     this.synchronizedProcessingOutputWatermark = outputSynchProcessingWatermark;
-     this.latestSynchronizedInputWm = BoundedWindow.TIMESTAMP_MIN_VALUE;
-     this.latestSynchronizedOutputWm = BoundedWindow.TIMESTAMP_MIN_VALUE;
-   }
-
-   /**
-    * Returns the input watermark of the {@link AppliedPTransform}.
-    */
-   public Instant getInputWatermark() {
-     return inputWatermark.get();
-   }
-
-   /**
-    * Returns the output watermark of the {@link AppliedPTransform}.
-    */
-   public Instant getOutputWatermark() {
-     return outputWatermark.get();
-   }
-
-   /**
-    * Returns the synchronized processing input time of the {@link AppliedPTransform}.
-    *
-    * <p>The returned value is guaranteed to be monotonically increasing, and outside of the
-    * presence of holds, will increase as the system time progresses.
-    */
-   public synchronized Instant getSynchronizedProcessingInputTime() {
-     // The time is the clock capped by the current hold, and never earlier than the last value
-     // that was returned.
-     latestSynchronizedInputWm = INSTANT_ORDERING.max(
-         latestSynchronizedInputWm,
-         INSTANT_ORDERING.min(clock.now(), synchronizedProcessingInputWatermark.get()));
-     return latestSynchronizedInputWm;
-   }
-
-   /**
-    * Returns the synchronized processing output time of the {@link AppliedPTransform}.
-    *
-    * <p>The returned value is guaranteed to be monotonically increasing, and outside of the
-    * presence of holds, will increase as the system time progresses.
-    */
-   public synchronized Instant getSynchronizedProcessingOutputTime() {
-     latestSynchronizedOutputWm = INSTANT_ORDERING.max(
-         latestSynchronizedOutputWm,
-         INSTANT_ORDERING.min(clock.now(), synchronizedProcessingOutputWatermark.get()));
-     return latestSynchronizedOutputWm;
-   }
-
-   /**
-    * Refreshes all four watermarks, inputs before the outputs that are derived from them, and
-    * returns the union of the event-time and synchronized processing time output updates.
-    */
-   private WatermarkUpdate refresh() {
-     inputWatermark.refresh();
-     synchronizedProcessingInputWatermark.refresh();
-     WatermarkUpdate eventOutputUpdate = outputWatermark.refresh();
-     WatermarkUpdate syncOutputUpdate = synchronizedProcessingOutputWatermark.refresh();
-     return eventOutputUpdate.union(syncOutputUpdate);
-   }
-
-   /** Sets the event-time hold of the key; a {@code null} hold removes the key's hold. */
-   private void setEventTimeHold(Object key, Instant newHold) {
-     outputWatermark.updateHold(key, newHold);
-   }
-
-   /** Stops tracking the bundle and its elements as pending input. */
-   private void removePending(CommittedBundle<?> bundle) {
-     inputWatermark.removePendingElements(bundle.getElements());
-     synchronizedProcessingInputWatermark.removePending(bundle);
-   }
-
-   /** Starts tracking the bundle and its elements as pending input. */
-   private void addPending(CommittedBundle<?> bundle) {
-     inputWatermark.addPendingElements(bundle.getElements());
-     synchronizedProcessingInputWatermark.addPending(bundle);
-   }
-
-   /**
-    * Removes and returns the fired timers of this transform in all time domains, grouped by key.
-    *
-    * <p>Once the input watermark has reached {@link BoundedWindow#TIMESTAMP_MAX_VALUE}, the
-    * processing-time domains are extracted with that value as the firing time instead of the
-    * clock.
-    */
-   private Map<Object, FiredTimers> extractFiredTimers() {
-     Map<Object, List<TimerData>> eventTimeTimers = inputWatermark.extractFiredEventTimeTimers();
-     Map<Object, List<TimerData>> processingTimers;
-     Map<Object, List<TimerData>> synchronizedTimers;
-     if (inputWatermark.get().equals(BoundedWindow.TIMESTAMP_MAX_VALUE)) {
-       processingTimers = synchronizedProcessingInputWatermark.extractFiredDomainTimers(
-           TimeDomain.PROCESSING_TIME, BoundedWindow.TIMESTAMP_MAX_VALUE);
-       // Extract the SYNCHRONIZED_PROCESSING_TIME domain here. Asking for PROCESSING_TIME a
-       // second time returned nothing and left the synchronized timers unfired.
-       synchronizedTimers = synchronizedProcessingInputWatermark.extractFiredDomainTimers(
-           TimeDomain.SYNCHRONIZED_PROCESSING_TIME, BoundedWindow.TIMESTAMP_MAX_VALUE);
-     } else {
-       processingTimers = synchronizedProcessingInputWatermark.extractFiredDomainTimers(
-           TimeDomain.PROCESSING_TIME, clock.now());
-       synchronizedTimers = synchronizedProcessingInputWatermark.extractFiredDomainTimers(
-           TimeDomain.SYNCHRONIZED_PROCESSING_TIME, getSynchronizedProcessingInputTime());
-     }
-     Map<Object, Map<TimeDomain, List<TimerData>>> groupedTimers = new HashMap<>();
-     groupFiredTimers(groupedTimers, eventTimeTimers, processingTimers, synchronizedTimers);
-
-     Map<Object, FiredTimers> keyFiredTimers = new HashMap<>();
-     for (Map.Entry<Object, Map<TimeDomain, List<TimerData>>> firedTimers :
-         groupedTimers.entrySet()) {
-       keyFiredTimers.put(firedTimers.getKey(), new FiredTimers(firedTimers.getValue()));
-     }
-     return keyFiredTimers;
-   }
-
-   /**
-    * Merges the provided per-key timer lists into {@code groupedToMutate}, indexing each list by
-    * the domain of its first timer. Every list is expected to be non-empty and to hold timers
-    * of a single domain.
-    */
-   @SafeVarargs
-   private final void groupFiredTimers(
-       Map<Object, Map<TimeDomain, List<TimerData>>> groupedToMutate,
-       Map<Object, List<TimerData>>... timersToGroup) {
-     for (Map<Object, List<TimerData>> subGroup : timersToGroup) {
-       for (Map.Entry<Object, List<TimerData>> newTimers : subGroup.entrySet()) {
-         Map<TimeDomain, List<TimerData>> grouped = groupedToMutate.get(newTimers.getKey());
-         if (grouped == null) {
-           grouped = new HashMap<>();
-           groupedToMutate.put(newTimers.getKey(), grouped);
-         }
-         grouped.put(newTimers.getValue().get(0).getDomain(), newTimers.getValue());
-       }
-     }
-   }
-
-   /** Applies the timer update to both the event-time and the processing-time timers. */
-   private void updateTimers(TimerUpdate update) {
-     inputWatermark.updateTimers(update);
-     synchronizedProcessingInputWatermark.updateTimers(update);
-   }
-
-   @Override
-   public String toString() {
-     return MoreObjects.toStringHelper(TransformWatermarks.class)
-         .add("inputWatermark", inputWatermark)
-         .add("outputWatermark", outputWatermark)
-         .add("inputProcessingTime", synchronizedProcessingInputWatermark)
-         .add("outputProcessingTime", synchronizedProcessingOutputWatermark)
-         .toString();
-   }
- }
-
- /**
- * A collection of newly set, deleted, and completed timers.
- *
- * <p>setTimers and deletedTimers are collections of {@link TimerData} that have been added to the
- * {@link TimerInternals} of an executed step. completedTimers are timers that were delivered as
- * the input to the executed step.
- */
- public static class TimerUpdate {
- private final Object key;
- private final Iterable<? extends TimerData> completedTimers;
-
- private final Iterable<? extends TimerData> setTimers;
- private final Iterable<? extends TimerData> deletedTimers;
-
- /**
- * Returns a TimerUpdate for a null key with no timers.
- */
- public static TimerUpdate empty() {
- return new TimerUpdate(
- null,
- Collections.<TimerData>emptyList(),
- Collections.<TimerData>emptyList(),
- Collections.<TimerData>emptyList());
- }
-
- /**
- * Creates a new {@link TimerUpdate} builder with the provided completed timers that needs the
- * set and deleted timers to be added to it.
- */
- public static TimerUpdateBuilder builder(Object key) {
- return new TimerUpdateBuilder(key);
- }
-
- /**
- * A {@link TimerUpdate} builder that needs to be provided with set timers and deleted timers.
- */
- public static final class TimerUpdateBuilder {
- private final Object key;
- private final Collection<TimerData> completedTimers;
- private final Collection<TimerData> setTimers;
- private final Collection<TimerData> deletedTimers;
-
- private TimerUpdateBuilder(Object key) {
- this.key = key;
- this.completedTimers = new HashSet<>();
- this.setTimers = new HashSet<>();
- this.deletedTimers = new HashSet<>();
- }
-
- /**
- * Adds all of the provided timers to the collection of completed timers, and returns this
- * {@link TimerUpdateBuilder}.
- */
- public TimerUpdateBuilder withCompletedTimers(Iterable<TimerData> completedTimers) {
- Iterables.addAll(this.completedTimers, completedTimers);
- return this;
- }
-
- /**
- * Adds the provided timer to the collection of set timers, removing it from deleted timers if
- * it has previously been deleted. Returns this {@link TimerUpdateBuilder}.
- */
- public TimerUpdateBuilder setTimer(TimerData setTimer) {
- deletedTimers.remove(setTimer);
- setTimers.add(setTimer);
- return this;
- }
-
- /**
- * Adds the provided timer to the collection of deleted timers, removing it from set timers if
- * it has previously been set. Returns this {@link TimerUpdateBuilder}.
- */
- public TimerUpdateBuilder deletedTimer(TimerData deletedTimer) {
- deletedTimers.add(deletedTimer);
- setTimers.remove(deletedTimer);
- return this;
- }
-
- /**
- * Returns a new {@link TimerUpdate} with the most recently set completedTimers, setTimers,
- * and deletedTimers.
- */
- public TimerUpdate build() {
- return new TimerUpdate(
- key,
- ImmutableSet.copyOf(completedTimers),
- ImmutableSet.copyOf(setTimers),
- ImmutableSet.copyOf(deletedTimers));
- }
- }
-
- private TimerUpdate(
- Object key,
- Iterable<? extends TimerData> completedTimers,
- Iterable<? extends TimerData> setTimers,
- Iterable<? extends TimerData> deletedTimers) {
- this.key = key;
- this.completedTimers = completedTimers;
- this.setTimers = setTimers;
- this.deletedTimers = deletedTimers;
- }
-
- @VisibleForTesting
- Object getKey() {
- return key;
- }
-
- @VisibleForTesting
- Iterable<? extends TimerData> getCompletedTimers() {
- return completedTimers;
- }
-
- @VisibleForTesting
- Iterable<? extends TimerData> getSetTimers() {
- return setTimers;
- }
-
- @VisibleForTesting
- Iterable<? extends TimerData> getDeletedTimers() {
- return deletedTimers;
- }
-
- /**
- * Returns a {@link TimerUpdate} that is like this one, but with the specified completed timers.
- */
- public TimerUpdate withCompletedTimers(Iterable<TimerData> completedTimers) {
- return new TimerUpdate(this.key, completedTimers, setTimers, deletedTimers);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(key, completedTimers, setTimers, deletedTimers);
- }
-
- @Override
- public boolean equals(Object other) {
- if (other == null || !(other instanceof TimerUpdate)) {
- return false;
- }
- TimerUpdate that = (TimerUpdate) other;
- return Objects.equals(this.key, that.key)
- && Objects.equals(this.completedTimers, that.completedTimers)
- && Objects.equals(this.setTimers, that.setTimers)
- && Objects.equals(this.deletedTimers, that.deletedTimers);
- }
- }
-
- /**
- * A pair of {@link TimerData} and key which can be delivered to the appropriate
- * {@link AppliedPTransform}. A timer fires at the transform that set it with a specific key when
- * the time domain in which it lives progresses past a specified time, as determined by the
- * {@link InMemoryWatermarkManager}.
- */
- public static class FiredTimers {
- private final Map<TimeDomain, ? extends Collection<TimerData>> timers;
-
- private FiredTimers(Map<TimeDomain, ? extends Collection<TimerData>> timers) {
- this.timers = timers;
- }
-
- /**
- * Gets all of the timers that have fired within the provided {@link TimeDomain}. If no timers
- * fired within the provided domain, return an empty collection.
- *
- * <p>Timers within a {@link TimeDomain} are guaranteed to be in order of increasing timestamp.
- */
- public Collection<TimerData> getTimers(TimeDomain domain) {
- Collection<TimerData> domainTimers = timers.get(domain);
- if (domainTimers == null) {
- return Collections.emptyList();
- }
- return domainTimers;
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(FiredTimers.class).add("timers", timers).toString();
- }
- }
-
- private static class WindowedValueByTimestampComparator extends Ordering<WindowedValue<?>> {
- @Override
- public int compare(WindowedValue<?> o1, WindowedValue<?> o2) {
- return o1.getTimestamp().compareTo(o2.getTimestamp());
- }
- }
-
- public Set<AppliedPTransform<?, ?, ?>> getCompletedTransforms() {
- Set<AppliedPTransform<?, ?, ?>> result = new HashSet<>();
- for (Map.Entry<AppliedPTransform<?, ?, ?>, TransformWatermarks> wms :
- transformToWatermarks.entrySet()) {
- if (wms.getValue().getOutputWatermark().equals(THE_END_OF_TIME.get())) {
- result.add(wms.getKey());
- }
- }
- return result;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessBundle.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessBundle.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessBundle.java
deleted file mode 100644
index 112ba17..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessBundle.java
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import static com.google.common.base.Preconditions.checkState;
-
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.UncommittedBundle;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.common.base.MoreObjects;
-import com.google.common.collect.ImmutableList;
-
-import org.joda.time.Instant;
-
-import javax.annotation.Nullable;
-
-/**
- * A {@link UncommittedBundle} that buffers elements in memory.
- */
-public final class InProcessBundle<T> implements UncommittedBundle<T> {
- private final PCollection<T> pcollection;
- private final boolean keyed;
- private final Object key;
- private boolean committed = false;
- private ImmutableList.Builder<WindowedValue<T>> elements;
-
- /**
- * Create a new {@link InProcessBundle} for the specified {@link PCollection} without a key.
- */
- public static <T> InProcessBundle<T> unkeyed(PCollection<T> pcollection) {
- return new InProcessBundle<T>(pcollection, false, null);
- }
-
- /**
- * Create a new {@link InProcessBundle} for the specified {@link PCollection} with the specified
- * key.
- *
- * See {@link CommittedBundle#getKey()} and {@link CommittedBundle#isKeyed()} for more
- * information.
- */
- public static <T> InProcessBundle<T> keyed(PCollection<T> pcollection, Object key) {
- return new InProcessBundle<T>(pcollection, true, key);
- }
-
- private InProcessBundle(PCollection<T> pcollection, boolean keyed, Object key) {
- this.pcollection = pcollection;
- this.keyed = keyed;
- this.key = key;
- this.elements = ImmutableList.builder();
- }
-
- @Override
- public PCollection<T> getPCollection() {
- return pcollection;
- }
-
- @Override
- public InProcessBundle<T> add(WindowedValue<T> element) {
- checkState(!committed, "Can't add element %s to committed bundle %s", element, this);
- elements.add(element);
- return this;
- }
-
- @Override
- public CommittedBundle<T> commit(final Instant synchronizedCompletionTime) {
- checkState(!committed, "Can't commit already committed bundle %s", this);
- committed = true;
- final Iterable<WindowedValue<T>> committedElements = elements.build();
- return new CommittedBundle<T>() {
- @Override
- @Nullable
- public Object getKey() {
- return key;
- }
-
- @Override
- public boolean isKeyed() {
- return keyed;
- }
-
- @Override
- public Iterable<WindowedValue<T>> getElements() {
- return committedElements;
- }
-
- @Override
- public PCollection<T> getPCollection() {
- return pcollection;
- }
-
- @Override
- public Instant getSynchronizedProcessingOutputWatermark() {
- return synchronizedCompletionTime;
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(this)
- .omitNullValues()
- .add("pcollection", pcollection)
- .add("key", key)
- .add("elements", committedElements)
- .toString();
- }
- };
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessBundleOutputManager.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessBundleOutputManager.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessBundleOutputManager.java
deleted file mode 100644
index 406e2d4..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessBundleOutputManager.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.UncommittedBundle;
-import com.google.cloud.dataflow.sdk.util.DoFnRunners.OutputManager;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-
-import java.util.Map;
-
-/**
- * An {@link OutputManager} that outputs to {@link CommittedBundle Bundles} used by the
- * {@link InProcessPipelineRunner}.
- */
-public class InProcessBundleOutputManager implements OutputManager {
- private final Map<TupleTag<?>, UncommittedBundle<?>> bundles;
-
- public static InProcessBundleOutputManager create(
- Map<TupleTag<?>, UncommittedBundle<?>> outputBundles) {
- return new InProcessBundleOutputManager(outputBundles);
- }
-
- public InProcessBundleOutputManager(Map<TupleTag<?>, UncommittedBundle<?>> bundles) {
- this.bundles = bundles;
- }
-
- @SuppressWarnings("unchecked")
- @Override
- public <T> void output(TupleTag<T> tag, WindowedValue<T> output) {
- @SuppressWarnings("rawtypes")
- UncommittedBundle bundle = bundles.get(tag);
- bundle.add(output);
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessCreate.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessCreate.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessCreate.java
deleted file mode 100644
index 9023b7b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessCreate.java
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.io.BoundedSource;
-import com.google.cloud.dataflow.sdk.io.Read;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.Create.Values;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.util.CoderUtils;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Optional;
-import com.google.common.base.Throwables;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Iterators;
-import com.google.common.collect.PeekingIterator;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.NoSuchElementException;
-
-import javax.annotation.Nullable;
-
-/**
- * An in-process implementation of the {@link Values Create.Values} {@link PTransform}, implemented
- * using a {@link BoundedSource}.
- *
- * The coder is inferred via the {@link Values#getDefaultOutputCoder(PInput)} method on the original
- * transform.
- */
-class InProcessCreate<T> extends ForwardingPTransform<PInput, PCollection<T>> {
- private final Create.Values<T> original;
-
- public static <T> InProcessCreate<T> from(Create.Values<T> original) {
- return new InProcessCreate<>(original);
- }
-
- private InProcessCreate(Values<T> original) {
- this.original = original;
- }
-
- @Override
- public PCollection<T> apply(PInput input) {
- Coder<T> elementCoder;
- try {
- elementCoder = original.getDefaultOutputCoder(input);
- } catch (CannotProvideCoderException e) {
- throw new IllegalArgumentException(
- "Unable to infer a coder and no Coder was specified. "
- + "Please set a coder by invoking Create.withCoder() explicitly.",
- e);
- }
- InMemorySource<T> source;
- try {
- source = new InMemorySource<>(original.getElements(), elementCoder);
- } catch (IOException e) {
- throw Throwables.propagate(e);
- }
- PCollection<T> result = input.getPipeline().apply(Read.from(source));
- result.setCoder(elementCoder);
- return result;
- }
-
- @Override
- public PTransform<PInput, PCollection<T>> delegate() {
- return original;
- }
-
- @VisibleForTesting
- static class InMemorySource<T> extends BoundedSource<T> {
- private final Collection<byte[]> allElementsBytes;
- private final long totalSize;
- private final Coder<T> coder;
-
- public InMemorySource(Iterable<T> elements, Coder<T> elemCoder)
- throws CoderException, IOException {
- allElementsBytes = new ArrayList<>();
- long totalSize = 0L;
- for (T element : elements) {
- byte[] bytes = CoderUtils.encodeToByteArray(elemCoder, element);
- allElementsBytes.add(bytes);
- totalSize += bytes.length;
- }
- this.totalSize = totalSize;
- this.coder = elemCoder;
- }
-
- /**
- * Create a new source with the specified bytes. The new source owns the input element bytes,
- * which must not be modified after this constructor is called.
- */
- private InMemorySource(Collection<byte[]> elementBytes, long totalSize, Coder<T> coder) {
- this.allElementsBytes = ImmutableList.copyOf(elementBytes);
- this.totalSize = totalSize;
- this.coder = coder;
- }
-
- @Override
- public List<? extends BoundedSource<T>> splitIntoBundles(
- long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
- ImmutableList.Builder<InMemorySource<T>> resultBuilder = ImmutableList.builder();
- long currentSourceSize = 0L;
- List<byte[]> currentElems = new ArrayList<>();
- for (byte[] elemBytes : allElementsBytes) {
- currentElems.add(elemBytes);
- currentSourceSize += elemBytes.length;
- if (currentSourceSize >= desiredBundleSizeBytes) {
- resultBuilder.add(new InMemorySource<>(currentElems, currentSourceSize, coder));
- currentElems.clear();
- currentSourceSize = 0L;
- }
- }
- if (!currentElems.isEmpty()) {
- resultBuilder.add(new InMemorySource<>(currentElems, currentSourceSize, coder));
- }
- return resultBuilder.build();
- }
-
- @Override
- public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
- return totalSize;
- }
-
- @Override
- public boolean producesSortedKeys(PipelineOptions options) throws Exception {
- return false;
- }
-
- @Override
- public BoundedSource.BoundedReader<T> createReader(PipelineOptions options) throws IOException {
- return new BytesReader();
- }
-
- @Override
- public void validate() {}
-
- @Override
- public Coder<T> getDefaultOutputCoder() {
- return coder;
- }
-
- private class BytesReader extends BoundedReader<T> {
- private final PeekingIterator<byte[]> iter;
- /**
- * Use an optional to distinguish between null next element (as Optional.absent()) and no next
- * element (next is null).
- */
- @Nullable private Optional<T> next;
-
- public BytesReader() {
- this.iter = Iterators.peekingIterator(allElementsBytes.iterator());
- }
-
- @Override
- public BoundedSource<T> getCurrentSource() {
- return InMemorySource.this;
- }
-
- @Override
- public boolean start() throws IOException {
- return advance();
- }
-
- @Override
- public boolean advance() throws IOException {
- boolean hasNext = iter.hasNext();
- if (hasNext) {
- next = Optional.fromNullable(CoderUtils.decodeFromByteArray(coder, iter.next()));
- } else {
- next = null;
- }
- return hasNext;
- }
-
- @Override
- @Nullable
- public T getCurrent() throws NoSuchElementException {
- if (next == null) {
- throw new NoSuchElementException();
- }
- return next.orNull();
- }
-
- @Override
- public void close() throws IOException {}
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessEvaluationContext.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessEvaluationContext.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessEvaluationContext.java
deleted file mode 100644
index 4aeb0d3..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/inprocess/InProcessEvaluationContext.java
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.runners.inprocess;
-
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.runners.inprocess.GroupByKeyEvaluatorFactory.InProcessGroupByKeyOnly;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InMemoryWatermarkManager.FiredTimers;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InMemoryWatermarkManager.TransformWatermarks;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.CommittedBundle;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.PCollectionViewWriter;
-import com.google.cloud.dataflow.sdk.runners.inprocess.InProcessPipelineRunner.UncommittedBundle;
-import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Trigger;
-import com.google.cloud.dataflow.sdk.util.ExecutionContext;
-import com.google.cloud.dataflow.sdk.util.SideInputReader;
-import com.google.cloud.dataflow.sdk.util.TimerInternals.TimerData;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.cloud.dataflow.sdk.util.common.CounterSet;
-import com.google.cloud.dataflow.sdk.util.state.CopyOnAccessInMemoryStateInternals;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.PValue;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Iterables;
-
-import java.util.Collection;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ConcurrentMap;
-
-import javax.annotation.Nullable;
-
-/**
- * The evaluation context for a specific pipeline being executed by the
- * {@link InProcessPipelineRunner}. Contains state shared within the execution across all
- * transforms.
- *
- * <p>{@link InProcessEvaluationContext} contains shared state for an execution of the
- * {@link InProcessPipelineRunner} that can be used while evaluating a {@link PTransform}. This
- * consists of views into underlying state and watermark implementations, access to read and write
- * {@link PCollectionView PCollectionViews}, and constructing {@link CounterSet CounterSets} and
- * {@link ExecutionContext ExecutionContexts}. This includes executing callbacks asynchronously when
- * state changes to the appropriate point (e.g. when a {@link PCollectionView} is requested and
- * known to be empty).
- *
- * <p>{@link InProcessEvaluationContext} also handles results by committing finalizing bundles based
- * on the current global state and updating the global state appropriately. This includes updating
- * the per-{@link StepAndKey} state, updating global watermarks, and executing any callbacks that
- * can be executed.
- */
-class InProcessEvaluationContext {
- /** The step name for each {@link AppliedPTransform} in the {@link Pipeline}. */
- private final Map<AppliedPTransform<?, ?, ?>, String> stepNames;
-
- /** The options that were used to create this {@link Pipeline}. */
- private final InProcessPipelineOptions options;
-
- /** The current processing time and event time watermarks and timers. */
- private final InMemoryWatermarkManager watermarkManager;
-
- /** Executes callbacks based on the progression of the watermark. */
- private final WatermarkCallbackExecutor callbackExecutor;
-
- /** The stateInternals of the world, by applied PTransform and key. */
- private final ConcurrentMap<StepAndKey, CopyOnAccessInMemoryStateInternals<?>>
- applicationStateInternals;
-
- private final InProcessSideInputContainer sideInputContainer;
-
- private final CounterSet mergedCounters;
-
- public static InProcessEvaluationContext create(
- InProcessPipelineOptions options,
- Collection<AppliedPTransform<?, ?, ?>> rootTransforms,
- Map<PValue, Collection<AppliedPTransform<?, ?, ?>>> valueToConsumers,
- Map<AppliedPTransform<?, ?, ?>, String> stepNames,
- Collection<PCollectionView<?>> views) {
- return new InProcessEvaluationContext(
- options, rootTransforms, valueToConsumers, stepNames, views);
- }
-
- private InProcessEvaluationContext(
- InProcessPipelineOptions options,
- Collection<AppliedPTransform<?, ?, ?>> rootTransforms,
- Map<PValue, Collection<AppliedPTransform<?, ?, ?>>> valueToConsumers,
- Map<AppliedPTransform<?, ?, ?>, String> stepNames,
- Collection<PCollectionView<?>> views) {
- this.options = checkNotNull(options);
- checkNotNull(rootTransforms);
- checkNotNull(valueToConsumers);
- checkNotNull(stepNames);
- checkNotNull(views);
- this.stepNames = stepNames;
-
- this.watermarkManager =
- InMemoryWatermarkManager.create(
- NanosOffsetClock.create(), rootTransforms, valueToConsumers);
- this.sideInputContainer = InProcessSideInputContainer.create(this, views);
-
- this.applicationStateInternals = new ConcurrentHashMap<>();
- this.mergedCounters = new CounterSet();
-
- this.callbackExecutor = WatermarkCallbackExecutor.create();
- }
-
- /**
- * Handle the provided {@link InProcessTransformResult}, produced after evaluating the provided
- * {@link CommittedBundle} (potentially null, if the result of a root {@link PTransform}).
- *
- * <p>The result is the output of running the transform contained in the
- * {@link InProcessTransformResult} on the contents of the provided bundle.
- *
- * @param completedBundle the bundle that was processed to produce the result. Potentially
- * {@code null} if the transform that produced the result is a root
- * transform
- * @param completedTimers the timers that were delivered to produce the {@code completedBundle},
- * or an empty iterable if no timers were delivered
- * @param result the result of evaluating the input bundle
- * @return the committed bundles contained within the handled {@code result}
- */
- public synchronized Iterable<? extends CommittedBundle<?>> handleResult(
- @Nullable CommittedBundle<?> completedBundle,
- Iterable<TimerData> completedTimers,
- InProcessTransformResult result) {
- Iterable<? extends CommittedBundle<?>> committedBundles =
- commitBundles(result.getOutputBundles());
- // Update watermarks and timers
- watermarkManager.updateWatermarks(
- completedBundle,
- result.getTransform(),
- result.getTimerUpdate().withCompletedTimers(completedTimers),
- committedBundles,
- result.getWatermarkHold());
- fireAllAvailableCallbacks();
- // Update counters
- if (result.getCounters() != null) {
- mergedCounters.merge(result.getCounters());
- }
- // Update state internals
- CopyOnAccessInMemoryStateInternals<?> theirState = result.getState();
- if (theirState != null) {
- CopyOnAccessInMemoryStateInternals<?> committedState = theirState.commit();
- StepAndKey stepAndKey =
- StepAndKey.of(
- result.getTransform(), completedBundle == null ? null : completedBundle.getKey());
- if (!committedState.isEmpty()) {
- applicationStateInternals.put(stepAndKey, committedState);
- } else {
- applicationStateInternals.remove(stepAndKey);
- }
- }
- return committedBundles;
- }
-
- private Iterable<? extends CommittedBundle<?>> commitBundles(
- Iterable<? extends UncommittedBundle<?>> bundles) {
- ImmutableList.Builder<CommittedBundle<?>> completed = ImmutableList.builder();
- for (UncommittedBundle<?> inProgress : bundles) {
- AppliedPTransform<?, ?, ?> producing =
- inProgress.getPCollection().getProducingTransformInternal();
- TransformWatermarks watermarks = watermarkManager.getWatermarks(producing);
- CommittedBundle<?> committed =
- inProgress.commit(watermarks.getSynchronizedProcessingOutputTime());
- // Empty bundles don't impact watermarks and shouldn't trigger downstream execution, so
- // filter them out
- if (!Iterables.isEmpty(committed.getElements())) {
- completed.add(committed);
- }
- }
- return completed.build();
- }
-
- private void fireAllAvailableCallbacks() {
- for (AppliedPTransform<?, ?, ?> transform : stepNames.keySet()) {
- fireAvailableCallbacks(transform);
- }
- }
-
- private void fireAvailableCallbacks(AppliedPTransform<?, ?, ?> producingTransform) {
- TransformWatermarks watermarks = watermarkManager.getWatermarks(producingTransform);
- callbackExecutor.fireForWatermark(producingTransform, watermarks.getOutputWatermark());
- }
-
- /**
- * Create a {@link UncommittedBundle} for use by a source.
- */
- public <T> UncommittedBundle<T> createRootBundle(PCollection<T> output) {
- return InProcessBundle.unkeyed(output);
- }
-
- /**
- * Create a {@link UncommittedBundle} whose elements belong to the specified {@link
- * PCollection}.
- */
- public <T> UncommittedBundle<T> createBundle(CommittedBundle<?> input, PCollection<T> output) {
- return input.isKeyed()
- ? InProcessBundle.keyed(output, input.getKey())
- : InProcessBundle.unkeyed(output);
- }
-
- /**
- * Create a {@link UncommittedBundle} with the specified keys at the specified step. For use by
- * {@link InProcessGroupByKeyOnly} {@link PTransform PTransforms}.
- */
- public <T> UncommittedBundle<T> createKeyedBundle(
- CommittedBundle<?> input, Object key, PCollection<T> output) {
- return InProcessBundle.keyed(output, key);
- }
-
- /**
- * Create a {@link PCollectionViewWriter}, whose elements will be used in the provided
- * {@link PCollectionView}.
- */
- public <ElemT, ViewT> PCollectionViewWriter<ElemT, ViewT> createPCollectionViewWriter(
- PCollection<Iterable<ElemT>> input, final PCollectionView<ViewT> output) {
- return new PCollectionViewWriter<ElemT, ViewT>() {
- @Override
- public void add(Iterable<WindowedValue<ElemT>> values) {
- sideInputContainer.write(output, values);
- }
- };
- }
-
- /**
- * Schedule a callback to be executed after output would be produced for the given window
- * if there had been input.
- *
- * <p>Output would be produced when the watermark for a {@link PValue} passes the point at
- * which the trigger for the specified window (with the specified windowing strategy) must have
- * fired from the perspective of that {@link PValue}, as specified by the value of
- * {@link Trigger#getWatermarkThatGuaranteesFiring(BoundedWindow)} for the trigger of the
- * {@link WindowingStrategy}. When the callback has fired, either values will have been produced
- * for a key in that window, the window is empty, or all elements in the window are late. The
- * callback will be executed regardless of whether values have been produced.
- */
- public void scheduleAfterOutputWouldBeProduced(
- PValue value,
- BoundedWindow window,
- WindowingStrategy<?, ?> windowingStrategy,
- Runnable runnable) {
- AppliedPTransform<?, ?, ?> producing = getProducing(value);
- callbackExecutor.callOnGuaranteedFiring(producing, window, windowingStrategy, runnable);
-
- fireAvailableCallbacks(lookupProducing(value));
- }
-
- /**
-  * Resolves the transform that produces {@code value}: the value's own producing transform
-  * when it reports one, otherwise the result of {@link #lookupProducing(PValue)}.
-  */
- private AppliedPTransform<?, ?, ?> getProducing(PValue value) {
-   return value.getProducingTransformInternal() == null
-       ? lookupProducing(value)
-       : value.getProducingTransformInternal();
- }
-
- /**
-  * Scans the registered steps for the first transform whose output either equals
-  * {@code value} or contains it once expanded. Returns {@code null} when no step matches.
-  */
- private AppliedPTransform<?, ?, ?> lookupProducing(PValue value) {
-   for (AppliedPTransform<?, ?, ?> candidate : stepNames.keySet()) {
-     if (!candidate.getOutput().equals(value)
-         && !candidate.getOutput().expand().contains(value)) {
-       continue;
-     }
-     return candidate;
-   }
-   return null;
- }
-
- /**
- * Get the options used by this {@link Pipeline}.
- */
- public InProcessPipelineOptions getPipelineOptions() {
- return options;
- }
-
- /**
-  * Creates an {@link ExecutionContext} for the provided {@link AppliedPTransform} and key.
-  *
-  * <p>The context is built from the clock in the pipeline options, the key, the state
-  * internals stored for the (step, key) pair, and the watermarks of the transform.
-  */
- public InProcessExecutionContext getExecutionContext(
-     AppliedPTransform<?, ?, ?> application, Object key) {
-   StepAndKey stateKey = StepAndKey.of(application, key);
-   InProcessExecutionContext context =
-       new InProcessExecutionContext(
-           options.getClock(),
-           key,
-           (CopyOnAccessInMemoryStateInternals<Object>) applicationStateInternals.get(stateKey),
-           watermarkManager.getWatermarks(application));
-   return context;
- }
-
- /**
-  * Returns every step of this {@link Pipeline}, i.e. the key set of the step-name mapping.
-  */
- public Collection<AppliedPTransform<?, ?, ?>> getSteps() {
-   Collection<AppliedPTransform<?, ?, ?>> steps = stepNames.keySet();
-   return steps;
- }
-
- /**
- * Get the Step Name for the provided application.
- */
- public String getStepName(AppliedPTransform<?, ?, ?> application) {
- return stepNames.get(application);
- }
-
- /**
- * Returns a {@link SideInputReader} capable of reading the provided
- * {@link PCollectionView PCollectionViews}.
- * @param sideInputs the {@link PCollectionView PCollectionViews} the result should be able to
- * read
- * @return a {@link SideInputReader} that can read all of the provided
- * {@link PCollectionView PCollectionViews}
- */
- public SideInputReader createSideInputReader(final List<PCollectionView<?>> sideInputs) {
- return sideInputContainer.createReaderForViews(sideInputs);
- }
-
- /**
- * Create a {@link CounterSet} for this {@link Pipeline}. The {@link CounterSet} is independent
- * of all other {@link CounterSet CounterSets} created by this call.
- *
- * The {@link InProcessEvaluationContext} is responsible for unifying the counters present in
- * all created {@link CounterSet CounterSets} when the transforms that call this method
- * complete.
- */
- public CounterSet createCounterSet() {
- return new CounterSet();
- }
-
- /**
- * Returns all of the counters that have been merged into this context via calls to
- * {@link CounterSet#merge(CounterSet)}.
- */
- public CounterSet getCounters() {
- return mergedCounters;
- }
-
- /**
- * Extracts all timers that have been fired and have not already been extracted.
- *
- * <p>This is a destructive operation. Timers will only appear in the result of this method once
- * for each time they are set.
- */
- public Map<AppliedPTransform<?, ?, ?>, Map<Object, FiredTimers>> extractFiredTimers() {
- return watermarkManager.extractFiredTimers();
- }
-
- /**
- * Returns true if the step will not produce additional output.
- *
- * <p>If the provided transform produces only {@link IsBounded#BOUNDED}
- * {@link PCollection PCollections}, returns true if the watermark is at
- * {@link BoundedWindow#TIMESTAMP_MAX_VALUE positive infinity}.
- *
- * <p>If the provided transform produces any {@link IsBounded#UNBOUNDED}
- * {@link PCollection PCollections}, returns the value of
- * {@link InProcessPipelineOptions#isShutdownUnboundedProducersWithMaxWatermark()}.
- */
- public boolean isDone(AppliedPTransform<?, ?, ?> transform) {
- // if the PTransform's watermark isn't at the max value, it isn't done
- if (watermarkManager
- .getWatermarks(transform)
- .getOutputWatermark()
- .isBefore(BoundedWindow.TIMESTAMP_MAX_VALUE)) {
- return false;
- }
- // If the PTransform has any unbounded outputs, and unbounded producers should not be shut down,
- // the PTransform may produce additional output. It is not done.
- for (PValue output : transform.getOutput().expand()) {
- if (output instanceof PCollection) {
- IsBounded bounded = ((PCollection<?>) output).isBounded();
- if (bounded.equals(IsBounded.UNBOUNDED)
- && !options.isShutdownUnboundedProducersWithMaxWatermark()) {
- return false;
- }
- }
- }
- // The PTransform's watermark was at positive infinity and all of its outputs are known to be
- // done. It is done.
- return true;
- }
-
- /**
-  * Reports whether every registered step is done, stopping at the first step that is not.
-  */
- public boolean isDone() {
-   for (AppliedPTransform<?, ?, ?> step : stepNames.keySet()) {
-     if (isDone(step)) {
-       continue;
-     }
-     return false;
-   }
-   return true;
- }
-}
[37/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/PipelineOptionsFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/PipelineOptionsFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/PipelineOptionsFactory.java
deleted file mode 100644
index 4781d1c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/PipelineOptionsFactory.java
+++ /dev/null
@@ -1,1537 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.google.cloud.dataflow.sdk.options.Validation.Required;
-import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
-import com.google.cloud.dataflow.sdk.runners.PipelineRunnerRegistrar;
-import com.google.cloud.dataflow.sdk.util.StringUtils;
-import com.google.cloud.dataflow.sdk.util.common.ReflectHelpers;
-import com.google.common.base.Function;
-import com.google.common.base.Joiner;
-import com.google.common.base.Optional;
-import com.google.common.base.Preconditions;
-import com.google.common.base.Predicate;
-import com.google.common.base.Strings;
-import com.google.common.base.Throwables;
-import com.google.common.collect.ArrayListMultimap;
-import com.google.common.collect.Collections2;
-import com.google.common.collect.FluentIterable;
-import com.google.common.collect.ImmutableListMultimap;
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.ImmutableSet;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Iterators;
-import com.google.common.collect.ListMultimap;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;
-import com.google.common.collect.SortedSetMultimap;
-import com.google.common.collect.TreeMultimap;
-
-import com.fasterxml.jackson.annotation.JsonIgnore;
-import com.fasterxml.jackson.databind.JavaType;
-import com.fasterxml.jackson.databind.ObjectMapper;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.beans.BeanInfo;
-import java.beans.IntrospectionException;
-import java.beans.Introspector;
-import java.beans.PropertyDescriptor;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.lang.annotation.Annotation;
-import java.lang.reflect.Method;
-import java.lang.reflect.Modifier;
-import java.lang.reflect.Proxy;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.NoSuchElementException;
-import java.util.ServiceLoader;
-import java.util.Set;
-import java.util.SortedMap;
-import java.util.SortedSet;
-import java.util.TreeSet;
-
-import javax.annotation.Nullable;
-
-/**
- * Constructs a {@link PipelineOptions} or any derived interface that is composable to any other
- * derived interface of {@link PipelineOptions} via the {@link PipelineOptions#as} method. Being
- * able to compose one derived interface of {@link PipelineOptions} to another has the following
- * restrictions:
- * <ul>
- * <li>Any property with the same name must have the same return type for all derived interfaces
- * of {@link PipelineOptions}.
- * <li>Every bean property of any interface derived from {@link PipelineOptions} must have a
- * getter and setter method.
- * <li>Every method must conform to being a getter or setter for a JavaBean.
- * <li>The derived interface of {@link PipelineOptions} must be composable with every interface
- * registered with this factory.
- * </ul>
- *
- * <p>See the <a
- * href="http://www.oracle.com/technetwork/java/javase/documentation/spec-136004.html">JavaBeans
- * specification</a> for more details as to what constitutes a property.
- */
-public class PipelineOptionsFactory {
- /**
- * Creates and returns an object that implements {@link PipelineOptions}.
- * This sets the {@link ApplicationNameOptions#getAppName() "appName"} to the calling
- * {@link Class#getSimpleName() classes simple name}.
- *
- * @return An object that implements {@link PipelineOptions}.
- */
- public static PipelineOptions create() {
- return new Builder().as(PipelineOptions.class);
- }
-
- /**
- * Creates and returns an object that implements {@code <T>}.
- * This sets the {@link ApplicationNameOptions#getAppName() "appName"} to the calling
- * {@link Class#getSimpleName() classes simple name}.
- *
- * <p>Note that {@code <T>} must be composable with every registered interface with this factory.
- * See {@link PipelineOptionsFactory#validateWellFormed(Class, Set)} for more details.
- *
- * @return An object that implements {@code <T>}.
- */
- public static <T extends PipelineOptions> T as(Class<T> klass) {
- return new Builder().as(klass);
- }
-
- /**
- * Sets the command line arguments to parse when constructing the {@link PipelineOptions}.
- *
- * <p>Example GNU style command line arguments:
- * <pre>
- * --project=MyProject (simple property, will set the "project" property to "MyProject")
- * --readOnly=true (for boolean properties, will set the "readOnly" property to "true")
- * --readOnly (shorthand for boolean properties, will set the "readOnly" property to "true")
- * --x=1 --x=2 --x=3 (list style simple property, will set the "x" property to [1, 2, 3])
- * --x=1,2,3 (shorthand list style simple property, will set the "x" property to [1, 2, 3])
- * --complexObject='{"key1":"value1",...} (JSON format for all other complex types)
- * </pre>
- *
- * <p>Simple properties are able to bound to {@link String}, {@link Class}, enums and Java
- * primitives {@code boolean}, {@code byte}, {@code short}, {@code int}, {@code long},
- * {@code float}, {@code double} and their primitive wrapper classes.
- *
- * <p>Simple list style properties are able to be bound to {@code boolean[]}, {@code char[]},
- * {@code short[]}, {@code int[]}, {@code long[]}, {@code float[]}, {@code double[]},
- * {@code Class[]}, enum arrays, {@code String[]}, and {@code List<String>}.
- *
- * <p>JSON format is required for all other types.
- *
- * <p>By default, strict parsing is enabled and arguments must conform to be either
- * {@code --booleanArgName} or {@code --argName=argValue}. Strict parsing can be disabled with
- * {@link Builder#withoutStrictParsing()}. Empty or null arguments will be ignored whether
- * or not strict parsing is enabled.
- *
- * <p>Help information can be output to {@link System#out} by specifying {@code --help} as an
- * argument. After help is printed, the application will exit. Specifying only {@code --help}
- * will print out the list of
- * {@link PipelineOptionsFactory#getRegisteredOptions() registered options}
- * by invoking {@link PipelineOptionsFactory#printHelp(PrintStream)}. Specifying
- * {@code --help=PipelineOptionsClassName} will print out detailed usage information about the
- * specifically requested PipelineOptions by invoking
- * {@link PipelineOptionsFactory#printHelp(PrintStream, Class)}.
- */
- public static Builder fromArgs(String[] args) {
- return new Builder().fromArgs(args);
- }
-
- /**
- * After creation we will validate that {@code <T>} conforms to all the
- * validation criteria. See
- * {@link PipelineOptionsValidator#validate(Class, PipelineOptions)} for more details about
- * validation.
- */
- public Builder withValidation() {
- return new Builder().withValidation();
- }
-
- /**
-  * A fluent, immutable {@link PipelineOptions} builder. Each configuration method returns a
-  * new {@code Builder} carrying the updated setting instead of mutating the receiver.
-  */
- public static class Builder {
-   private final String defaultAppName;
-   private final String[] args;
-   private final boolean validation;
-   private final boolean strictParsing;
-
-   // Not directly instantiable: no arguments, validation off, strict parsing on.
-   private Builder() {
-     this(null, false, true);
-   }
-
-   private Builder(String[] args, boolean validation,
-       boolean strictParsing) {
-     // The default application name is derived from the call stack at construction time.
-     this.defaultAppName = findCallersClassName();
-     this.args = args;
-     this.validation = validation;
-     this.strictParsing = strictParsing;
-   }
-
-   /**
-    * Sets the command line arguments to parse when constructing the {@link PipelineOptions}.
-    *
-    * <p>Example GNU style command line arguments:
-    * <pre>
-    *   --project=MyProject (simple property, will set the "project" property to "MyProject")
-    *   --readOnly=true (for boolean properties, will set the "readOnly" property to "true")
-    *   --readOnly (shorthand for boolean properties, will set the "readOnly" property to "true")
-    *   --x=1 --x=2 --x=3 (list style simple property, will set the "x" property to [1, 2, 3])
-    *   --x=1,2,3 (shorthand list style simple property, will set the "x" property to [1, 2, 3])
-    *   --complexObject='{"key1":"value1",...}' (JSON format for all other complex types)
-    * </pre>
-    *
-    * <p>Simple properties can be bound to {@link String}, {@link Class}, enums and the Java
-    * primitives {@code boolean}, {@code byte}, {@code short}, {@code int}, {@code long},
-    * {@code float}, {@code double} and their primitive wrapper classes.
-    *
-    * <p>Simple list style properties can be bound to {@code boolean[]}, {@code char[]},
-    * {@code short[]}, {@code int[]}, {@code long[]}, {@code float[]}, {@code double[]},
-    * {@code Class[]}, enum arrays, {@code String[]}, and {@code List<String>}.
-    *
-    * <p>JSON format is required for all other types.
-    *
-    * <p>By default, strict parsing is enabled and arguments must take the form of either
-    * {@code --booleanArgName} or {@code --argName=argValue}. Strict parsing can be disabled
-    * with {@link Builder#withoutStrictParsing()}. Empty or null arguments are ignored whether
-    * or not strict parsing is enabled.
-    *
-    * <p>Help information can be output to {@link System#out} by specifying {@code --help} as an
-    * argument. After help is printed, the application will exit. Specifying only {@code --help}
-    * prints the list of
-    * {@link PipelineOptionsFactory#getRegisteredOptions() registered options}
-    * by invoking {@link PipelineOptionsFactory#printHelp(PrintStream)}. Specifying
-    * {@code --help=PipelineOptionsClassName} prints detailed usage information about the
-    * specifically requested PipelineOptions by invoking
-    * {@link PipelineOptionsFactory#printHelp(PrintStream, Class)}.
-    */
-   public Builder fromArgs(String[] args) {
-     Preconditions.checkNotNull(args, "Arguments should not be null.");
-     Builder withArgs = new Builder(args, validation, strictParsing);
-     return withArgs;
-   }
-
-   /**
-    * Enables validation: after creation the {@link PipelineOptions} are checked against all
-    * the validation criteria from {@code <T>}. See
-    * {@link PipelineOptionsValidator#validate(Class, PipelineOptions)} for more details about
-    * validation.
-    */
-   public Builder withValidation() {
-     Builder validating = new Builder(args, true, strictParsing);
-     return validating;
-   }
-
-   /**
-    * Disables strict parsing: improperly formatted and unknown arguments are skipped while
-    * the arguments are parsed.
-    */
-   public Builder withoutStrictParsing() {
-     Builder lenient = new Builder(args, validation, false);
-     return lenient;
-   }
-
-   /**
-    * Creates and returns an object that implements {@link PipelineOptions} using the values
-    * configured on this builder during construction.
-    *
-    * @return An object that implements {@link PipelineOptions}.
-    */
-   public PipelineOptions create() {
-     return as(PipelineOptions.class);
-   }
-
-   /**
-    * Creates and returns an object that implements {@code <T>} using the values configured on
-    * this builder during construction.
-    *
-    * <p>Note that {@code <T>} must be composable with every interface registered with this
-    * factory. See {@link PipelineOptionsFactory#validateWellFormed(Class, Set)} for more
-    * details.
-    *
-    * @return An object that implements {@code <T>}.
-    */
-   public <T extends PipelineOptions> T as(Class<T> klass) {
-     // Start from the parsed command line when arguments were supplied, else from nothing.
-     final Map<String, Object> initialOptions;
-     if (args == null) {
-       initialOptions = Maps.newHashMap();
-     } else {
-       ListMultimap<String, String> parsedArgs = parseCommandLine(args, strictParsing);
-       LOG.debug("Provided Arguments: {}", parsedArgs);
-       printHelpUsageAndExitIfNeeded(parsedArgs, System.out, true /* exit */);
-       initialOptions = parseObjects(klass, parsedArgs, strictParsing);
-     }
-
-     // Back the requested interface with a proxy over the initial option values.
-     ProxyInvocationHandler invocationHandler = new ProxyInvocationHandler(initialOptions);
-     T proxy = invocationHandler.as(klass);
-
-     // Fall back to the default application name when none was provided.
-     ApplicationNameOptions nameOptions = proxy.as(ApplicationNameOptions.class);
-     if (nameOptions.getAppName() == null) {
-       nameOptions.setAppName(defaultAppName);
-     }
-
-     if (validation) {
-       PipelineOptionsValidator.validate(klass, proxy);
-     }
-     return proxy;
-   }
- }
-
- /**
-  * Handles a {@code --help} request found in the parsed options, if any.
-  *
-  * <p>When the value of the {@code help} option is {@code "true"}, the generic
-  * {@link PipelineOptionsFactory#printHelp(PrintStream)} output is printed. Otherwise the value
-  * is treated as a class name: first as an exact class to load, then as a suffix of the full
-  * name (when it contains a {@code '.'}) or as the simple name of a registered options
-  * interface, and {@link PipelineOptionsFactory#printHelp(PrintStream, Class)} is invoked for
-  * the match. When no single match exists, a message and the generic help are printed.
-  *
-  * <p>Visible for testing.
-  * {@code printStream} and {@code exit} are used for testing.
-  *
-  * @param options the parsed command line options
-  * @param printStream the stream that help output is written to
-  * @param exit whether to call {@link System#exit(int)} after printing help
-  * @return {@code true} if help was printed (and {@code exit} is false), {@code false} if no
-  *     help was requested
-  */
- @SuppressWarnings("unchecked")
- static boolean printHelpUsageAndExitIfNeeded(ListMultimap<String, String> options,
-     PrintStream printStream, boolean exit) {
-   if (!options.containsKey("help")) {
-     return false;
-   }
-   final String helpOption = Iterables.getOnlyElement(options.get("help"));
-
-   // A bare --help is recorded as "true": print the generic help.
-   if (Boolean.TRUE.toString().equals(helpOption)) {
-     printHelp(printStream);
-     if (!exit) {
-       return true;
-     }
-     System.exit(0);
-   }
-
-   // Otherwise try to print help for the specifically requested options class.
-   try {
-     Class<?> requested = Class.forName(helpOption);
-     if (!PipelineOptions.class.isAssignableFrom(requested)) {
-       throw new ClassNotFoundException("PipelineOptions of type " + requested + " not found.");
-     }
-     printHelp(printStream, (Class<? extends PipelineOptions>) requested);
-   } catch (ClassNotFoundException e) {
-     // No exact match: fall back to the registered options that match by name.
-     Predicate<Class<? extends PipelineOptions>> matchesRequestedName =
-         new Predicate<Class<? extends PipelineOptions>>() {
-           @Override
-           public boolean apply(Class<? extends PipelineOptions> input) {
-             return helpOption.contains(".")
-                 ? input.getName().endsWith(helpOption)
-                 : input.getSimpleName().equals(helpOption);
-           }
-         };
-     Iterable<Class<? extends PipelineOptions>> matches =
-         Iterables.filter(getRegisteredOptions(), matchesRequestedName);
-     try {
-       printHelp(printStream, Iterables.getOnlyElement(matches));
-     } catch (NoSuchElementException exception) {
-       printStream.format("Unable to find option %s.%n", helpOption);
-       printHelp(printStream);
-     } catch (IllegalArgumentException exception) {
-       printStream.format("Multiple matches found for %s: %s.%n", helpOption,
-           Iterables.transform(matches, ReflectHelpers.CLASS_NAME));
-       printHelp(printStream);
-     }
-   }
-   if (!exit) {
-     return true;
-   }
-   System.exit(0);
-   return false;
- }
-
- /**
-  * Determines the simple name of the calling class from the current thread's stack.
-  *
-  * <p>Frames are skipped up to and including the first one that belongs to the
-  * PipelineOptionsFactory/Builder classes; the first later frame that does not belong to
-  * those classes is taken as the caller. Returns {@code "unknown"} when no such frame exists
-  * or its class cannot be loaded.
-  */
- private static String findCallersClassName() {
-   boolean pastFactoryFrame = false;
-   for (StackTraceElement frame : Thread.currentThread().getStackTrace()) {
-     boolean isFactoryFrame = PIPELINE_OPTIONS_FACTORY_CLASSES.contains(frame.getClassName());
-     if (!pastFactoryFrame) {
-       // Still searching for the first PipelineOptionsFactory/Builder frame.
-       pastFactoryFrame = isFactoryFrame;
-       continue;
-     }
-     if (isFactoryFrame) {
-       continue;
-     }
-     // First frame after the factory frames that is not itself a factory frame.
-     try {
-       return Class.forName(frame.getClassName()).getSimpleName();
-     } catch (ClassNotFoundException e) {
-       break;
-     }
-   }
-
-   return "unknown";
- }
-
- /**
-  * Immutable record pairing a generated proxy class with the property descriptors that
-  * describe it.
-  *
-  * @param <T> The type of the proxyClass.
-  */
- static class Registration<T extends PipelineOptions> {
-   private final List<PropertyDescriptor> propertyDescriptors;
-   private final Class<T> proxyClass;
-
-   public Registration(Class<T> proxyClass, List<PropertyDescriptor> beanInfo) {
-     this.propertyDescriptors = beanInfo;
-     this.proxyClass = proxyClass;
-   }
-
-   /** Returns the generated proxy class. */
-   Class<T> getProxyClass() {
-     return proxyClass;
-   }
-
-   /** Returns the property descriptors supplied at construction. */
-   List<PropertyDescriptor> getPropertyDescriptors() {
-     return propertyDescriptors;
-   }
- }
-
- // NOTE(review): byte/Byte are absent from this set although the fromArgs Javadoc lists
- // byte among the simple property types - confirm against the parsing code.
- private static final Set<Class<?>> SIMPLE_TYPES = ImmutableSet.<Class<?>>builder()
- .add(boolean.class)
- .add(Boolean.class)
- .add(char.class)
- .add(Character.class)
- .add(short.class)
- .add(Short.class)
- .add(int.class)
- .add(Integer.class)
- .add(long.class)
- .add(Long.class)
- .add(float.class)
- .add(Float.class)
- .add(double.class)
- .add(Double.class)
- .add(String.class)
- .add(Class.class).build();
- private static final Logger LOG = LoggerFactory.getLogger(PipelineOptionsFactory.class);
- // Typed zero-length array passed to toArray when building the combined proxy class.
- @SuppressWarnings("rawtypes")
- private static final Class<?>[] EMPTY_CLASS_ARRAY = new Class[0];
- private static final ObjectMapper MAPPER = new ObjectMapper();
- // Populated in the static initializer, keyed by each runner class's simple name.
- private static final Map<String, Class<? extends PipelineRunner<?>>> SUPPORTED_PIPELINE_RUNNERS;
-
- /** Classes that are used as the boundary in the stack trace to find the callers class name. */
- private static final Set<String> PIPELINE_OPTIONS_FACTORY_CLASSES =
- ImmutableSet.of(PipelineOptionsFactory.class.getName(), Builder.class.getName());
-
- /** Methods that are ignored when validating the proxy class. */
- private static final Set<Method> IGNORED_METHODS;
-
- /** A predicate that accepts only methods that are not synthetic, per {@link Method#isSynthetic()}. */
- private static final Predicate<Method> NOT_SYNTHETIC_PREDICATE =
- new Predicate<Method>() {
- @Override
- public boolean apply(Method input) {
- return !input.isSynthetic();
- }
- };
-
- /** The set of options that have been registered and visible to the user. */
- private static final Set<Class<? extends PipelineOptions>> REGISTERED_OPTIONS =
- Sets.newConcurrentHashSet();
-
- /** A cache storing a mapping from a given interface to its registration record. */
- private static final Map<Class<? extends PipelineOptions>, Registration<?>> INTERFACE_CACHE =
- Maps.newConcurrentMap();
-
- /** A cache storing a mapping from a set of interfaces to its registration record. */
- private static final Map<Set<Class<? extends PipelineOptions>>, Registration<?>> COMBINED_CACHE =
- Maps.newConcurrentMap();
-
- /** The width at which options should be output. */
- private static final int TERMINAL_WIDTH = 80;
-
- /**
-  * Finds the appropriate {@code ClassLoader} to be used by the
-  * {@link ServiceLoader#load} call, which by default would use the context
-  * {@code ClassLoader}, which can be null. Candidates are tried in this order: the context
-  * ClassLoader, the ClassLoader of this class, and finally the system ClassLoader.
-  */
- static ClassLoader findClassLoader() {
-   ClassLoader contextLoader = Thread.currentThread().getContextClassLoader();
-   if (contextLoader != null) {
-     return contextLoader;
-   }
-   ClassLoader factoryLoader = PipelineOptionsFactory.class.getClassLoader();
-   if (factoryLoader != null) {
-     return factoryLoader;
-   }
-   return ClassLoader.getSystemClassLoader();
- }
-
- // Static initialization: resolves the methods to ignore during validation, discovers the
- // available pipeline runners, and registers the discovered PipelineOptions interfaces.
- static {
- try {
- IGNORED_METHODS = ImmutableSet.<Method>builder()
- .add(Object.class.getMethod("getClass"))
- .add(Object.class.getMethod("wait"))
- .add(Object.class.getMethod("wait", long.class))
- .add(Object.class.getMethod("wait", long.class, int.class))
- .add(Object.class.getMethod("notify"))
- .add(Object.class.getMethod("notifyAll"))
- .add(Proxy.class.getMethod("getInvocationHandler", Object.class))
- .build();
- } catch (NoSuchMethodException | SecurityException e) {
- // A failed lookup is logged and aborts class initialization.
- LOG.error("Unable to find expected method", e);
- throw new ExceptionInInitializerError(e);
- }
-
- ClassLoader classLoader = findClassLoader();
-
- // Store the list of all available pipeline runners.
- ImmutableMap.Builder<String, Class<? extends PipelineRunner<?>>> builder =
- ImmutableMap.builder();
- // Registrars found by the ServiceLoader are collected into a TreeSet ordered by
- // ObjectsClassComparator before being iterated.
- Set<PipelineRunnerRegistrar> pipelineRunnerRegistrars =
- Sets.newTreeSet(ObjectsClassComparator.INSTANCE);
- pipelineRunnerRegistrars.addAll(
- Lists.newArrayList(ServiceLoader.load(PipelineRunnerRegistrar.class, classLoader)));
- for (PipelineRunnerRegistrar registrar : pipelineRunnerRegistrars) {
- for (Class<? extends PipelineRunner<?>> klass : registrar.getPipelineRunners()) {
- // Runners are keyed by their simple class name.
- builder.put(klass.getSimpleName(), klass);
- }
- }
- SUPPORTED_PIPELINE_RUNNERS = builder.build();
-
- // Load and register the list of all classes that extend PipelineOptions.
- // PipelineOptions itself is registered first, before any registrar-provided interface.
- register(PipelineOptions.class);
- Set<PipelineOptionsRegistrar> pipelineOptionsRegistrars =
- Sets.newTreeSet(ObjectsClassComparator.INSTANCE);
- pipelineOptionsRegistrars.addAll(
- Lists.newArrayList(ServiceLoader.load(PipelineOptionsRegistrar.class, classLoader)));
- for (PipelineOptionsRegistrar registrar : pipelineOptionsRegistrars) {
- for (Class<? extends PipelineOptions> klass : registrar.getPipelineOptions()) {
- register(klass);
- }
- }
- }
-
- /**
- * This registers the interface with this factory. This interface must conform to the following
- * restrictions:
- * <ul>
- * <li>Any property with the same name must have the same return type for all derived
- * interfaces of {@link PipelineOptions}.
- * <li>Every bean property of any interface derived from {@link PipelineOptions} must have a
- * getter and setter method.
- * <li>Every method must conform to being a getter or setter for a JavaBean.
- * <li>The derived interface of {@link PipelineOptions} must be composable with every interface
- * registered with this factory.
- * </ul>
- *
- * @param iface The interface object to manually register.
- */
- public static synchronized void register(Class<? extends PipelineOptions> iface) {
- Preconditions.checkNotNull(iface);
- Preconditions.checkArgument(iface.isInterface(), "Only interface types are supported.");
-
- if (REGISTERED_OPTIONS.contains(iface)) {
- return;
- }
- validateWellFormed(iface, REGISTERED_OPTIONS);
- REGISTERED_OPTIONS.add(iface);
- }
-
- /**
-  * Validates the interface and returns its registration record. The interface must conform
-  * to the following:
-  * <ul>
-  *   <li>Any property with the same name must have the same return type for all derived
-  *       interfaces of {@link PipelineOptions}.
-  *   <li>Every bean property of any interface derived from {@link PipelineOptions} must have a
-  *       getter and setter method.
-  *   <li>Every method must conform to being a getter or setter for a JavaBean.
-  *   <li>The derived interface of {@link PipelineOptions} must be composable with every
-  *       interface that is part of {@code validatedPipelineOptionsInterfaces}.
-  *   <li>Only getters may be annotated with {@link JsonIgnore @JsonIgnore}.
-  *   <li>If any getter is annotated with {@link JsonIgnore @JsonIgnore}, then all getters for
-  *       this property must be annotated with {@link JsonIgnore @JsonIgnore}.
-  * </ul>
-  *
-  * <p>Results are cached twice: once for the combined set of interfaces and once for the
-  * interface on its own, so validation runs only on a cache miss.
-  *
-  * @param iface The interface to validate.
-  * @param validatedPipelineOptionsInterfaces The set of validated pipeline options interfaces
-  *     to validate against.
-  * @return A registration record containing the proxy class and bean info for iface.
-  */
- static synchronized <T extends PipelineOptions> Registration<T> validateWellFormed(
-     Class<T> iface, Set<Class<? extends PipelineOptions>> validatedPipelineOptionsInterfaces) {
-   Preconditions.checkArgument(iface.isInterface(), "Only interface types are supported.");
-
-   @SuppressWarnings("unchecked")
-   Set<Class<? extends PipelineOptions>> allInterfaces =
-       FluentIterable.from(validatedPipelineOptionsInterfaces).append(iface).toSet();
-   // Combined view: the already-validated interfaces together with the new one.
-   boolean combinedViewCached = COMBINED_CACHE.containsKey(allInterfaces);
-   if (!combinedViewCached) {
-     @SuppressWarnings("unchecked")
-     Class<T> combinedProxyClass =
-         (Class<T>) Proxy.getProxyClass(PipelineOptionsFactory.class.getClassLoader(),
-             allInterfaces.toArray(EMPTY_CLASS_ARRAY));
-     try {
-       List<PropertyDescriptor> combinedDescriptors =
-           validateClass(iface, validatedPipelineOptionsInterfaces, combinedProxyClass);
-       COMBINED_CACHE.put(allInterfaces,
-           new Registration<T>(combinedProxyClass, combinedDescriptors));
-     } catch (IntrospectionException e) {
-       throw Throwables.propagate(e);
-     }
-   }
-
-   // Local view: the interface on its own.
-   boolean localViewCached = INTERFACE_CACHE.containsKey(iface);
-   if (!localViewCached) {
-     @SuppressWarnings({"rawtypes", "unchecked"})
-     Class<T> localProxyClass = (Class<T>) Proxy.getProxyClass(
-         PipelineOptionsFactory.class.getClassLoader(), new Class[] {iface});
-     try {
-       List<PropertyDescriptor> localDescriptors =
-           validateClass(iface, validatedPipelineOptionsInterfaces, localProxyClass);
-       INTERFACE_CACHE.put(iface,
-           new Registration<T>(localProxyClass, localDescriptors));
-     } catch (IntrospectionException e) {
-       throw Throwables.propagate(e);
-     }
-   }
-   @SuppressWarnings("unchecked")
-   Registration<T> registration = (Registration<T>) INTERFACE_CACHE.get(iface);
-   return registration;
- }
-
- /**
-  * Returns an unmodifiable view of the options interfaces registered with this factory.
-  */
- public static Set<Class<? extends PipelineOptions>> getRegisteredOptions() {
-   Set<Class<? extends PipelineOptions>> readOnlyView =
-       Collections.unmodifiableSet(REGISTERED_OPTIONS);
-   return readOnlyView;
- }
-
- /**
- * Outputs the set of registered options with the PipelineOptionsFactory
- * with a description for each one if available to the output stream. This output
- * is pretty printed and meant to be human readable. This method will attempt to
- * format its output to be compatible with a terminal window.
- */
- public static void printHelp(PrintStream out) {
- Preconditions.checkNotNull(out);
- out.println("The set of registered options are:");
- Set<Class<? extends PipelineOptions>> sortedOptions =
- new TreeSet<>(ClassNameComparator.INSTANCE);
- sortedOptions.addAll(REGISTERED_OPTIONS);
- for (Class<? extends PipelineOptions> kls : sortedOptions) {
- out.format(" %s%n", kls.getName());
- }
- out.format("%nUse --help=<OptionsName> for detailed help. For example:%n"
- + " --help=DataflowPipelineOptions <short names valid for registered options>%n"
- + " --help=com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions%n");
- }
-
- /**
- * Outputs the set of options available to be set for the passed in {@link PipelineOptions}
- * interface. The output is in a human readable format. The format is:
- * <pre>
- * OptionGroup:
- * ... option group description ...
- *
- * --option1={@code <type>} or list of valid enum choices
- * Default: value (if available, see {@link Default})
- * ... option description ... (if available, see {@link Description})
- * Required groups (if available, see {@link Required})
- * --option2={@code <type>} or list of valid enum choices
- * Default: value (if available, see {@link Default})
- * ... option description ... (if available, see {@link Description})
- * Required groups (if available, see {@link Required})
- * </pre>
- * This method will attempt to format its output to be compatible with a terminal window.
- */
- public static void printHelp(PrintStream out, Class<? extends PipelineOptions> iface) {
- Preconditions.checkNotNull(out);
- Preconditions.checkNotNull(iface);
- validateWellFormed(iface, REGISTERED_OPTIONS);
-
- Iterable<Method> methods =
- Iterables.filter(
- ReflectHelpers.getClosureOfMethodsOnInterface(iface), NOT_SYNTHETIC_PREDICATE);
- ListMultimap<Class<?>, Method> ifaceToMethods = ArrayListMultimap.create();
- for (Method method : methods) {
- // Process only methods that are not marked as hidden.
- if (method.getAnnotation(Hidden.class) == null) {
- ifaceToMethods.put(method.getDeclaringClass(), method);
- }
- }
- SortedSet<Class<?>> ifaces = new TreeSet<>(ClassNameComparator.INSTANCE);
- // Keep interfaces that are not marked as hidden.
- ifaces.addAll(Collections2.filter(ifaceToMethods.keySet(), new Predicate<Class<?>>() {
- @Override
- public boolean apply(Class<?> input) {
- return input.getAnnotation(Hidden.class) == null;
- }
- }));
- for (Class<?> currentIface : ifaces) {
- Map<String, Method> propertyNamesToGetters =
- getPropertyNamesToGetters(ifaceToMethods.get(currentIface));
-
- // Don't output anything if there are no defined options
- if (propertyNamesToGetters.isEmpty()) {
- continue;
- }
- SortedSetMultimap<String, String> requiredGroupNameToProperties =
- getRequiredGroupNamesToProperties(propertyNamesToGetters);
-
- out.format("%s:%n", currentIface.getName());
- prettyPrintDescription(out, currentIface.getAnnotation(Description.class));
-
- out.println();
-
- List<String> lists = Lists.newArrayList(propertyNamesToGetters.keySet());
- Collections.sort(lists, String.CASE_INSENSITIVE_ORDER);
- for (String propertyName : lists) {
- Method method = propertyNamesToGetters.get(propertyName);
- String printableType = method.getReturnType().getSimpleName();
- if (method.getReturnType().isEnum()) {
- printableType = Joiner.on(" | ").join(method.getReturnType().getEnumConstants());
- }
- out.format(" --%s=<%s>%n", propertyName, printableType);
- Optional<String> defaultValue = getDefaultValueFromAnnotation(method);
- if (defaultValue.isPresent()) {
- out.format(" Default: %s%n", defaultValue.get());
- }
- prettyPrintDescription(out, method.getAnnotation(Description.class));
- prettyPrintRequiredGroups(out, method.getAnnotation(Validation.Required.class),
- requiredGroupNameToProperties);
- }
- out.println();
- }
- }
-
- /**
- * Output the requirement groups that the property is a member of, including all properties that
- * satisfy the group requirement, breaking up long lines on white space characters and attempting
- * to honor a line limit of {@code TERMINAL_WIDTH}.
- */
- private static void prettyPrintRequiredGroups(PrintStream out, Required annotation,
- SortedSetMultimap<String, String> requiredGroupNameToProperties) {
- if (annotation == null || annotation.groups() == null) {
- return;
- }
- for (String group : annotation.groups()) {
- SortedSet<String> groupMembers = requiredGroupNameToProperties.get(group);
- String requirement;
- if (groupMembers.size() == 1) {
- requirement = Iterables.getOnlyElement(groupMembers) + " is required.";
- } else {
- requirement = "At least one of " + groupMembers + " is required";
- }
- terminalPrettyPrint(out, requirement.split("\\s+"));
- }
- }
-
- /**
- * Outputs the value of the description, breaking up long lines on white space characters and
- * attempting to honor a line limit of {@code TERMINAL_WIDTH}.
- */
- private static void prettyPrintDescription(PrintStream out, Description description) {
- if (description == null || description.value() == null) {
- return;
- }
-
- String[] words = description.value().split("\\s+");
- terminalPrettyPrint(out, words);
- }
-
- private static void terminalPrettyPrint(PrintStream out, String[] words) {
- final String spacing = " ";
-
- if (words.length == 0) {
- return;
- }
-
- out.print(spacing);
- int lineLength = spacing.length();
- for (int i = 0; i < words.length; ++i) {
- out.print(" ");
- out.print(words[i]);
- lineLength += 1 + words[i].length();
-
- // If the next word takes us over the terminal width, then goto the next line.
- if (i + 1 != words.length && words[i + 1].length() + lineLength + 1 > TERMINAL_WIDTH) {
- out.println();
- out.print(spacing);
- lineLength = spacing.length();
- }
- }
- out.println();
- }
-
- /**
- * Returns a string representation of the {@link Default} value on the passed in method.
- */
- private static Optional<String> getDefaultValueFromAnnotation(Method method) {
- for (Annotation annotation : method.getAnnotations()) {
- if (annotation instanceof Default.Class) {
- return Optional.of(((Default.Class) annotation).value().getSimpleName());
- } else if (annotation instanceof Default.String) {
- return Optional.of(((Default.String) annotation).value());
- } else if (annotation instanceof Default.Boolean) {
- return Optional.of(Boolean.toString(((Default.Boolean) annotation).value()));
- } else if (annotation instanceof Default.Character) {
- return Optional.of(Character.toString(((Default.Character) annotation).value()));
- } else if (annotation instanceof Default.Byte) {
- return Optional.of(Byte.toString(((Default.Byte) annotation).value()));
- } else if (annotation instanceof Default.Short) {
- return Optional.of(Short.toString(((Default.Short) annotation).value()));
- } else if (annotation instanceof Default.Integer) {
- return Optional.of(Integer.toString(((Default.Integer) annotation).value()));
- } else if (annotation instanceof Default.Long) {
- return Optional.of(Long.toString(((Default.Long) annotation).value()));
- } else if (annotation instanceof Default.Float) {
- return Optional.of(Float.toString(((Default.Float) annotation).value()));
- } else if (annotation instanceof Default.Double) {
- return Optional.of(Double.toString(((Default.Double) annotation).value()));
- } else if (annotation instanceof Default.Enum) {
- return Optional.of(((Default.Enum) annotation).value());
- } else if (annotation instanceof Default.InstanceFactory) {
- return Optional.of(((Default.InstanceFactory) annotation).value().getSimpleName());
- }
- }
- return Optional.absent();
- }
-
- static Map<String, Class<? extends PipelineRunner<?>>> getRegisteredRunners() {
- return SUPPORTED_PIPELINE_RUNNERS;
- }
-
- static List<PropertyDescriptor> getPropertyDescriptors(
- Set<Class<? extends PipelineOptions>> interfaces) {
- return COMBINED_CACHE.get(interfaces).getPropertyDescriptors();
- }
-
- /**
- * Creates a set of Dataflow worker harness options based of a set of known system
- * properties. This is meant to only be used from the Dataflow worker harness as a method to
- * bootstrap the worker harness.
- *
- * <p>For internal use only.
- *
- * @return A {@link DataflowWorkerHarnessOptions} object configured for the
- * Dataflow worker harness.
- */
- public static DataflowWorkerHarnessOptions createFromSystemPropertiesInternal()
- throws IOException {
- return createFromSystemProperties();
- }
-
- /**
- * Creates a set of {@link DataflowWorkerHarnessOptions} based of a set of known system
- * properties. This is meant to only be used from the Dataflow worker harness as a method to
- * bootstrap the worker harness.
- *
- * @return A {@link DataflowWorkerHarnessOptions} object configured for the
- * Dataflow worker harness.
- * @deprecated for internal use only
- */
- @Deprecated
- public static DataflowWorkerHarnessOptions createFromSystemProperties() throws IOException {
- ObjectMapper objectMapper = new ObjectMapper();
- DataflowWorkerHarnessOptions options;
- if (System.getProperties().containsKey("sdk_pipeline_options")) {
- String serializedOptions = System.getProperty("sdk_pipeline_options");
- LOG.info("Worker harness starting with: " + serializedOptions);
- options = objectMapper.readValue(serializedOptions, PipelineOptions.class)
- .as(DataflowWorkerHarnessOptions.class);
- } else {
- options = PipelineOptionsFactory.as(DataflowWorkerHarnessOptions.class);
- }
-
- // These values will not be known at job submission time and must be provided.
- if (System.getProperties().containsKey("worker_id")) {
- options.setWorkerId(System.getProperty("worker_id"));
- }
- if (System.getProperties().containsKey("job_id")) {
- options.setJobId(System.getProperty("job_id"));
- }
-
- return options;
- }
-
- /**
- * This method is meant to emulate the behavior of {@link Introspector#getBeanInfo(Class, int)}
- * to construct the list of {@link PropertyDescriptor}.
- *
- * <p>TODO: Swap back to using Introspector once the proxy class issue with AppEngine is
- * resolved.
- */
- private static List<PropertyDescriptor> getPropertyDescriptors(Class<?> beanClass)
- throws IntrospectionException {
- // The sorting is important to make this method stable.
- SortedSet<Method> methods = Sets.newTreeSet(MethodComparator.INSTANCE);
- methods.addAll(
- Collections2.filter(Arrays.asList(beanClass.getMethods()), NOT_SYNTHETIC_PREDICATE));
- SortedMap<String, Method> propertyNamesToGetters = getPropertyNamesToGetters(methods);
- List<PropertyDescriptor> descriptors = Lists.newArrayList();
-
- List<TypeMismatch> mismatches = new ArrayList<>();
- /*
- * Add all the getter/setter pairs to the list of descriptors removing the getter once
- * it has been paired up.
- */
- for (Method method : methods) {
- String methodName = method.getName();
- if (!methodName.startsWith("set")
- || method.getParameterTypes().length != 1
- || method.getReturnType() != void.class) {
- continue;
- }
- String propertyName = Introspector.decapitalize(methodName.substring(3));
- Method getterMethod = propertyNamesToGetters.remove(propertyName);
-
- // Validate that the getter and setter property types are the same.
- if (getterMethod != null) {
- Class<?> getterPropertyType = getterMethod.getReturnType();
- Class<?> setterPropertyType = method.getParameterTypes()[0];
- if (getterPropertyType != setterPropertyType) {
- TypeMismatch mismatch = new TypeMismatch();
- mismatch.propertyName = propertyName;
- mismatch.getterPropertyType = getterPropertyType;
- mismatch.setterPropertyType = setterPropertyType;
- mismatches.add(mismatch);
- continue;
- }
- }
-
- descriptors.add(new PropertyDescriptor(
- propertyName, getterMethod, method));
- }
- throwForTypeMismatches(mismatches);
-
- // Add the remaining getters with missing setters.
- for (Map.Entry<String, Method> getterToMethod : propertyNamesToGetters.entrySet()) {
- descriptors.add(new PropertyDescriptor(
- getterToMethod.getKey(), getterToMethod.getValue(), null));
- }
- return descriptors;
- }
-
- private static class TypeMismatch {
- private String propertyName;
- private Class<?> getterPropertyType;
- private Class<?> setterPropertyType;
- }
-
- private static void throwForTypeMismatches(List<TypeMismatch> mismatches) {
- if (mismatches.size() == 1) {
- TypeMismatch mismatch = mismatches.get(0);
- throw new IllegalArgumentException(String.format(
- "Type mismatch between getter and setter methods for property [%s]. "
- + "Getter is of type [%s] whereas setter is of type [%s].",
- mismatch.propertyName,
- mismatch.getterPropertyType.getName(),
- mismatch.setterPropertyType.getName()));
- } else if (mismatches.size() > 1) {
- StringBuilder builder = new StringBuilder(
- String.format("Type mismatches between getters and setters detected:"));
- for (TypeMismatch mismatch : mismatches) {
- builder.append(String.format(
- "%n - Property [%s]: Getter is of type [%s] whereas setter is of type [%s].",
- mismatch.propertyName,
- mismatch.getterPropertyType.getName(),
- mismatch.setterPropertyType.getName()));
- }
- throw new IllegalArgumentException(builder.toString());
- }
- }
-
- /**
- * Returns a map of the property name to the getter method it represents.
- * If there are duplicate methods with the same bean name, then it is indeterminate
- * as to which method will be returned.
- */
- private static SortedMap<String, Method> getPropertyNamesToGetters(Iterable<Method> methods) {
- SortedMap<String, Method> propertyNamesToGetters = Maps.newTreeMap();
- for (Method method : methods) {
- String methodName = method.getName();
- if ((!methodName.startsWith("get")
- && !methodName.startsWith("is"))
- || method.getParameterTypes().length != 0
- || method.getReturnType() == void.class) {
- continue;
- }
- String propertyName = Introspector.decapitalize(
- methodName.startsWith("is") ? methodName.substring(2) : methodName.substring(3));
- propertyNamesToGetters.put(propertyName, method);
- }
- return propertyNamesToGetters;
- }
-
- /**
- * Returns a map of required groups of arguments to the properties that satisfy the requirement.
- */
- private static SortedSetMultimap<String, String> getRequiredGroupNamesToProperties(
- Map<String, Method> propertyNamesToGetters) {
- SortedSetMultimap<String, String> result = TreeMultimap.create();
- for (Map.Entry<String, Method> propertyEntry : propertyNamesToGetters.entrySet()) {
- Required requiredAnnotation =
- propertyEntry.getValue().getAnnotation(Validation.Required.class);
- if (requiredAnnotation != null) {
- for (String groupName : requiredAnnotation.groups()) {
- result.put(groupName, propertyEntry.getKey());
- }
- }
- }
- return result;
- }
-
- /**
- * Validates that a given class conforms to the following properties:
- * <ul>
- * <li>Any property with the same name must have the same return type for all derived
- * interfaces of {@link PipelineOptions}.
- * <li>Every bean property of any interface derived from {@link PipelineOptions} must have a
- * getter and setter method.
- * <li>Every method must conform to being a getter or setter for a JavaBean.
- * <li>Only getters may be annotated with {@link JsonIgnore @JsonIgnore}.
- * <li>If any getter is annotated with {@link JsonIgnore @JsonIgnore}, then all getters for
- * this property must be annotated with {@link JsonIgnore @JsonIgnore}.
- * </ul>
- *
- * @param iface The interface to validate.
- * @param validatedPipelineOptionsInterfaces The set of validated pipeline options interfaces to
- * validate against.
- * @param klass The proxy class representing the interface.
- * @return A list of {@link PropertyDescriptor}s representing all valid bean properties of
- * {@code iface}.
- * @throws IntrospectionException if invalid property descriptors.
- */
- private static List<PropertyDescriptor> validateClass(Class<? extends PipelineOptions> iface,
- Set<Class<? extends PipelineOptions>> validatedPipelineOptionsInterfaces,
- Class<?> klass) throws IntrospectionException {
- Set<Method> methods = Sets.newHashSet(IGNORED_METHODS);
- // Ignore static methods, "equals", "hashCode", "toString" and "as" on the generated class.
- // Ignore synthetic methods
- for (Method method : klass.getMethods()) {
- if (Modifier.isStatic(method.getModifiers()) || method.isSynthetic()) {
- methods.add(method);
- }
- }
- try {
- methods.add(klass.getMethod("equals", Object.class));
- methods.add(klass.getMethod("hashCode"));
- methods.add(klass.getMethod("toString"));
- methods.add(klass.getMethod("as", Class.class));
- methods.add(klass.getMethod("cloneAs", Class.class));
- } catch (NoSuchMethodException | SecurityException e) {
- throw Throwables.propagate(e);
- }
-
- // Verify that there are no methods with the same name with two different return types.
- Iterable<Method> interfaceMethods = FluentIterable
- .from(ReflectHelpers.getClosureOfMethodsOnInterface(iface))
- .filter(NOT_SYNTHETIC_PREDICATE)
- .toSortedSet(MethodComparator.INSTANCE);
- SortedSetMultimap<Method, Method> methodNameToMethodMap =
- TreeMultimap.create(MethodNameComparator.INSTANCE, MethodComparator.INSTANCE);
- for (Method method : interfaceMethods) {
- methodNameToMethodMap.put(method, method);
- }
- List<MultipleDefinitions> multipleDefinitions = Lists.newArrayList();
- for (Map.Entry<Method, Collection<Method>> entry
- : methodNameToMethodMap.asMap().entrySet()) {
- Set<Class<?>> returnTypes = FluentIterable.from(entry.getValue())
- .transform(ReturnTypeFetchingFunction.INSTANCE).toSet();
- SortedSet<Method> collidingMethods = FluentIterable.from(entry.getValue())
- .toSortedSet(MethodComparator.INSTANCE);
- if (returnTypes.size() > 1) {
- MultipleDefinitions defs = new MultipleDefinitions();
- defs.method = entry.getKey();
- defs.collidingMethods = collidingMethods;
- multipleDefinitions.add(defs);
- }
- }
- throwForMultipleDefinitions(iface, multipleDefinitions);
-
- // Verify that there is no getter with a mixed @JsonIgnore annotation and verify
- // that no setter has @JsonIgnore.
- Iterable<Method> allInterfaceMethods =
- FluentIterable.from(
- ReflectHelpers.getClosureOfMethodsOnInterfaces(
- validatedPipelineOptionsInterfaces))
- .append(ReflectHelpers.getClosureOfMethodsOnInterface(iface))
- .filter(NOT_SYNTHETIC_PREDICATE)
- .toSortedSet(MethodComparator.INSTANCE);
- SortedSetMultimap<Method, Method> methodNameToAllMethodMap =
- TreeMultimap.create(MethodNameComparator.INSTANCE, MethodComparator.INSTANCE);
- for (Method method : allInterfaceMethods) {
- methodNameToAllMethodMap.put(method, method);
- }
-
- List<PropertyDescriptor> descriptors = getPropertyDescriptors(klass);
-
- List<InconsistentlyIgnoredGetters> incompletelyIgnoredGetters = new ArrayList<>();
- List<IgnoredSetter> ignoredSetters = new ArrayList<>();
-
- for (PropertyDescriptor descriptor : descriptors) {
- if (descriptor.getReadMethod() == null
- || descriptor.getWriteMethod() == null
- || IGNORED_METHODS.contains(descriptor.getReadMethod())
- || IGNORED_METHODS.contains(descriptor.getWriteMethod())) {
- continue;
- }
- SortedSet<Method> getters = methodNameToAllMethodMap.get(descriptor.getReadMethod());
- SortedSet<Method> gettersWithJsonIgnore = Sets.filter(getters, JsonIgnorePredicate.INSTANCE);
-
- Iterable<String> getterClassNames = FluentIterable.from(getters)
- .transform(MethodToDeclaringClassFunction.INSTANCE)
- .transform(ReflectHelpers.CLASS_NAME);
- Iterable<String> gettersWithJsonIgnoreClassNames = FluentIterable.from(gettersWithJsonIgnore)
- .transform(MethodToDeclaringClassFunction.INSTANCE)
- .transform(ReflectHelpers.CLASS_NAME);
-
- if (!(gettersWithJsonIgnore.isEmpty() || getters.size() == gettersWithJsonIgnore.size())) {
- InconsistentlyIgnoredGetters err = new InconsistentlyIgnoredGetters();
- err.descriptor = descriptor;
- err.getterClassNames = getterClassNames;
- err.gettersWithJsonIgnoreClassNames = gettersWithJsonIgnoreClassNames;
- incompletelyIgnoredGetters.add(err);
- }
- if (!incompletelyIgnoredGetters.isEmpty()) {
- continue;
- }
-
- SortedSet<Method> settersWithJsonIgnore =
- Sets.filter(methodNameToAllMethodMap.get(descriptor.getWriteMethod()),
- JsonIgnorePredicate.INSTANCE);
-
- Iterable<String> settersWithJsonIgnoreClassNames = FluentIterable.from(settersWithJsonIgnore)
- .transform(MethodToDeclaringClassFunction.INSTANCE)
- .transform(ReflectHelpers.CLASS_NAME);
-
- if (!settersWithJsonIgnore.isEmpty()) {
- IgnoredSetter ignored = new IgnoredSetter();
- ignored.descriptor = descriptor;
- ignored.settersWithJsonIgnoreClassNames = settersWithJsonIgnoreClassNames;
- ignoredSetters.add(ignored);
- }
- }
- throwForGettersWithInconsistentJsonIgnore(incompletelyIgnoredGetters);
- throwForSettersWithJsonIgnore(ignoredSetters);
-
- List<MissingBeanMethod> missingBeanMethods = new ArrayList<>();
- // Verify that each property has a matching read and write method.
- for (PropertyDescriptor propertyDescriptor : descriptors) {
- if (!(IGNORED_METHODS.contains(propertyDescriptor.getWriteMethod())
- || propertyDescriptor.getReadMethod() != null)) {
- MissingBeanMethod method = new MissingBeanMethod();
- method.property = propertyDescriptor;
- method.methodType = "getter";
- missingBeanMethods.add(method);
- continue;
- }
- if (!(IGNORED_METHODS.contains(propertyDescriptor.getReadMethod())
- || propertyDescriptor.getWriteMethod() != null)) {
- MissingBeanMethod method = new MissingBeanMethod();
- method.property = propertyDescriptor;
- method.methodType = "setter";
- missingBeanMethods.add(method);
- continue;
- }
- methods.add(propertyDescriptor.getReadMethod());
- methods.add(propertyDescriptor.getWriteMethod());
- }
- throwForMissingBeanMethod(iface, missingBeanMethods);
-
- // Verify that no additional methods are on an interface that aren't a bean property.
- SortedSet<Method> unknownMethods = new TreeSet<>(MethodComparator.INSTANCE);
- unknownMethods.addAll(
- Sets.filter(
- Sets.difference(Sets.newHashSet(klass.getMethods()), methods),
- NOT_SYNTHETIC_PREDICATE));
- Preconditions.checkArgument(unknownMethods.isEmpty(),
- "Methods %s on [%s] do not conform to being bean properties.",
- FluentIterable.from(unknownMethods).transform(ReflectHelpers.METHOD_FORMATTER),
- iface.getName());
-
- return descriptors;
- }
-
- private static class MultipleDefinitions {
- private Method method;
- private SortedSet<Method> collidingMethods;
- }
-
- private static void throwForMultipleDefinitions(
- Class<? extends PipelineOptions> iface, List<MultipleDefinitions> definitions) {
- if (definitions.size() == 1) {
- MultipleDefinitions errDef = definitions.get(0);
- throw new IllegalArgumentException(String.format(
- "Method [%s] has multiple definitions %s with different return types for [%s].",
- errDef.method.getName(), errDef.collidingMethods, iface.getName()));
- } else if (definitions.size() > 1) {
- StringBuilder errorBuilder = new StringBuilder(String.format(
- "Interface [%s] has Methods with multiple definitions with different return types:",
- iface.getName()));
- for (MultipleDefinitions errDef : definitions) {
- errorBuilder.append(String.format(
- "%n - Method [%s] has multiple definitions %s",
- errDef.method.getName(),
- errDef.collidingMethods));
- }
- throw new IllegalArgumentException(errorBuilder.toString());
- }
- }
-
- private static class InconsistentlyIgnoredGetters {
- PropertyDescriptor descriptor;
- Iterable<String> getterClassNames;
- Iterable<String> gettersWithJsonIgnoreClassNames;
- }
-
- private static void throwForGettersWithInconsistentJsonIgnore(
- List<InconsistentlyIgnoredGetters> getters) {
- if (getters.size() == 1) {
- InconsistentlyIgnoredGetters getter = getters.get(0);
- throw new IllegalArgumentException(String.format(
- "Expected getter for property [%s] to be marked with @JsonIgnore on all %s, "
- + "found only on %s",
- getter.descriptor.getName(), getter.getterClassNames,
- getter.gettersWithJsonIgnoreClassNames));
- } else if (getters.size() > 1) {
- StringBuilder errorBuilder =
- new StringBuilder("Property getters are inconsistently marked with @JsonIgnore:");
- for (InconsistentlyIgnoredGetters getter : getters) {
- errorBuilder.append(
- String.format("%n - Expected for property [%s] to be marked on all %s, "
- + "found only on %s",
- getter.descriptor.getName(), getter.getterClassNames,
- getter.gettersWithJsonIgnoreClassNames));
- }
- throw new IllegalArgumentException(errorBuilder.toString());
- }
- }
-
- private static class IgnoredSetter {
- PropertyDescriptor descriptor;
- Iterable<String> settersWithJsonIgnoreClassNames;
- }
-
- private static void throwForSettersWithJsonIgnore(List<IgnoredSetter> setters) {
- if (setters.size() == 1) {
- IgnoredSetter setter = setters.get(0);
- throw new IllegalArgumentException(
- String.format("Expected setter for property [%s] to not be marked with @JsonIgnore on %s",
- setter.descriptor.getName(), setter.settersWithJsonIgnoreClassNames));
- } else if (setters.size() > 1) {
- StringBuilder builder = new StringBuilder("Found setters marked with @JsonIgnore:");
- for (IgnoredSetter setter : setters) {
- builder.append(
- String.format("%n - Setter for property [%s] should not be marked with @JsonIgnore "
- + "on %s",
- setter.descriptor.getName(), setter.settersWithJsonIgnoreClassNames));
- }
- throw new IllegalArgumentException(builder.toString());
- }
- }
-
- private static class MissingBeanMethod {
- String methodType;
- PropertyDescriptor property;
- }
-
- private static void throwForMissingBeanMethod(
- Class<? extends PipelineOptions> iface, List<MissingBeanMethod> missingBeanMethods) {
- if (missingBeanMethods.size() == 1) {
- MissingBeanMethod missingBeanMethod = missingBeanMethods.get(0);
- throw new IllegalArgumentException(
- String.format("Expected %s for property [%s] of type [%s] on [%s].",
- missingBeanMethod.methodType, missingBeanMethod.property.getName(),
- missingBeanMethod.property.getPropertyType().getName(), iface.getName()));
- } else if (missingBeanMethods.size() > 1) {
- StringBuilder builder = new StringBuilder(String.format(
- "Found missing property methods on [%s]:", iface.getName()));
- for (MissingBeanMethod method : missingBeanMethods) {
- builder.append(
- String.format("%n - Expected %s for property [%s] of type [%s]", method.methodType,
- method.property.getName(), method.property.getPropertyType().getName()));
- }
- throw new IllegalArgumentException(builder.toString());
- }
- }
-
- /** A {@link Comparator} that uses the classes name to compare them. */
- private static class ClassNameComparator implements Comparator<Class<?>> {
- static final ClassNameComparator INSTANCE = new ClassNameComparator();
- @Override
- public int compare(Class<?> o1, Class<?> o2) {
- return o1.getName().compareTo(o2.getName());
- }
- }
-
- /** A {@link Comparator} that uses the object's classes canonical name to compare them. */
- private static class ObjectsClassComparator implements Comparator<Object> {
- static final ObjectsClassComparator INSTANCE = new ObjectsClassComparator();
- @Override
- public int compare(Object o1, Object o2) {
- return o1.getClass().getCanonicalName().compareTo(o2.getClass().getCanonicalName());
- }
- }
-
- /** A {@link Comparator} that uses the generic method signature to sort them. */
- private static class MethodComparator implements Comparator<Method> {
- static final MethodComparator INSTANCE = new MethodComparator();
- @Override
- public int compare(Method o1, Method o2) {
- return o1.toGenericString().compareTo(o2.toGenericString());
- }
- }
-
- /** A {@link Comparator} that uses the methods name to compare them. */
- static class MethodNameComparator implements Comparator<Method> {
- static final MethodNameComparator INSTANCE = new MethodNameComparator();
- @Override
- public int compare(Method o1, Method o2) {
- return o1.getName().compareTo(o2.getName());
- }
- }
-
- /** A {@link Function} that gets the method's return type. */
- private static class ReturnTypeFetchingFunction implements Function<Method, Class<?>> {
- static final ReturnTypeFetchingFunction INSTANCE = new ReturnTypeFetchingFunction();
- @Override
- public Class<?> apply(Method input) {
- return input.getReturnType();
- }
- }
-
- /** A {@link Function} with returns the declaring class for the method. */
- private static class MethodToDeclaringClassFunction implements Function<Method, Class<?>> {
- static final MethodToDeclaringClassFunction INSTANCE = new MethodToDeclaringClassFunction();
- @Override
- public Class<?> apply(Method input) {
- return input.getDeclaringClass();
- }
- }
-
- /**
- * A {@link Predicate} that returns true if the method is annotated with
- * {@link JsonIgnore @JsonIgnore}.
- */
- static class JsonIgnorePredicate implements Predicate<Method> {
- static final JsonIgnorePredicate INSTANCE = new JsonIgnorePredicate();
- @Override
- public boolean apply(Method input) {
- return input.isAnnotationPresent(JsonIgnore.class);
- }
- }
-
- /**
- * Splits string arguments based upon expected pattern of --argName=value.
- *
- * <p>Example GNU style command line arguments:
- *
- * <pre>
- * --project=MyProject (simple property, will set the "project" property to "MyProject")
- * --readOnly=true (for boolean properties, will set the "readOnly" property to "true")
- * --readOnly (shorthand for boolean properties, will set the "readOnly" property to "true")
- * --x=1 --x=2 --x=3 (list style simple property, will set the "x" property to [1, 2, 3])
- * --x=1,2,3 (shorthand list style simple property, will set the "x" property to [1, 2, 3])
- * --complexObject='{"key1":"value1",...} (JSON format for all other complex types)
- * </pre>
- *
- * <p>Simple properties are able to bound to {@link String}, {@link Class}, enums and Java
- * primitives {@code boolean}, {@code byte}, {@code short}, {@code int}, {@code long},
- * {@code float}, {@code double} and their primitive wrapper classes.
- *
- * <p>Simple list style properties are able to be bound to {@code boolean[]}, {@code char[]},
- * {@code short[]}, {@code int[]}, {@code long[]}, {@code float[]}, {@code double[]},
- * {@code Class[]}, enum arrays, {@code String[]}, and {@code List<String>}.
- *
- * <p>JSON format is required for all other types.
- *
- * <p>If strict parsing is enabled, options must start with '--', and not have an empty argument
- * name or value based upon the positioning of the '='. Empty or null arguments will be ignored
- * whether or not strict parsing is enabled.
- */
- private static ListMultimap<String, String> parseCommandLine(
- String[] args, boolean strictParsing) {
- ImmutableListMultimap.Builder<String, String> builder = ImmutableListMultimap.builder();
- for (String arg : args) {
- if (Strings.isNullOrEmpty(arg)) {
- continue;
- }
- try {
- Preconditions.checkArgument(arg.startsWith("--"),
- "Argument '%s' does not begin with '--'", arg);
- int index = arg.indexOf("=");
- // Make sure that '=' isn't the first character after '--' or the last character
- Preconditions.checkArgument(index != 2,
- "Argument '%s' starts with '--=', empty argument name not allowed", arg);
- if (index > 0) {
- builder.put(arg.substring(2, index), arg.substring(index + 1, arg.length()));
- } else {
- builder.put(arg.substring(2), "true");
- }
- } catch (IllegalArgumentException e) {
- if (strictParsing) {
- throw e;
- } else {
- LOG.warn("Strict parsing is disabled, ignoring option '{}' because {}",
- arg, e.getMessage());
- }
- }
- }
- return builder.build();
- }
-
- /**
- * Using the parsed string arguments, we convert the strings to the expected
- * return type of the methods that are found on the passed-in class.
- *
- * <p>For any return type that is expected to be an array or a collection, we further
- * split up each string on ','.
- *
- * <p>We special case the "runner" option. It is mapped to the class of the {@link PipelineRunner}
- * based off of the {@link PipelineRunner PipelineRunners} simple class name. If the provided
- * runner name is not registered via a {@link PipelineRunnerRegistrar}, we attempt to obtain the
- * class that the name represents using {@link Class#forName(String)} and use the result class if
- * it subclasses {@link PipelineRunner}.
- *
- * <p>If strict parsing is enabled, unknown options or options that cannot be converted to
- * the expected java type using an {@link ObjectMapper} will be ignored.
- */
- private static <T extends PipelineOptions> Map<String, Object> parseObjects(
- Class<T> klass, ListMultimap<String, String> options, boolean strictParsing) {
- Map<String, Method> propertyNamesToGetters = Maps.newHashMap();
- PipelineOptionsFactory.validateWellFormed(klass, REGISTERED_OPTIONS);
- @SuppressWarnings("unchecked")
- Iterable<PropertyDescriptor> propertyDescriptors =
- PipelineOptionsFactory.getPropertyDescriptors(
- FluentIterable.from(getRegisteredOptions()).append(klass).toSet());
- for (PropertyDescriptor descriptor : propertyDescriptors) {
- propertyNamesToGetters.put(descriptor.getName(), descriptor.getReadMethod());
- }
- Map<String, Object> convertedOptions = Maps.newHashMap();
- for (final Map.Entry<String, Collection<String>> entry : options.asMap().entrySet()) {
- try {
- // Search for close matches for missing properties.
- // Either off by one or off by two character errors.
- if (!propertyNamesToGetters.containsKey(entry.getKey())) {
- SortedSet<String> closestMatches = new TreeSet<String>(
- Sets.filter(propertyNamesToGetters.keySet(), new Predicate<String>() {
- @Override
- public boolean apply(@Nullable String input) {
- return StringUtils.getLevenshteinDistance(entry.getKey(), input) <= 2;
- }
- }));
- switch (closestMatches.size()) {
- case 0:
- throw new IllegalArgumentException(
- String.format("Class %s missing a property named '%s'.",
- klass, entry.getKey()));
- case 1:
- throw new IllegalArgumentException(
- String.format("Class %s missing a property named '%s'. Did you mean '%s'?",
- klass, entry.getKey(), Iterables.getOnlyElement(closestMatches)));
- default:
- throw new IllegalArgumentException(
- String.format("Class %s missing a property named '%s'. Did you mean one of %s?",
- klass, entry.getKey(), closestMatches));
- }
- }
-
- Method method = propertyNamesToGetters.get(entry.getKey());
- // Only allow empty argument values for String, String Array, and Collection.
- Class<?> returnType = method.getReturnType();
- JavaType type = MAPPER.getTypeFactory().constructType(method.getGenericReturnType());
- if ("runner".equals(entry.getKey())) {
- String runner = Iterables.getOnlyElement(entry.getValue());
- if (SUPPORTED_PIPELINE_RUNNERS.containsKey(runner)) {
- convertedOptions.put("runner", SUPPORTED_PIPELINE_RUNNERS.get(runner));
- } else {
- try {
- Class<?> runnerClass = Class.forName(runner);
- checkArgument(
- PipelineRunner.class.isAssignableFrom(runnerClass),
- "Class '%s' does not implement PipelineRunner. Supported pipeline runners %s",
- runner,
- Sets.newTreeSet(SUPPORTED_PIPELINE_RUNNERS.keySet()));
- convertedOptions.put("runner", runnerClass);
- } catch (ClassNotFoundException e) {
- String msg =
- String.format(
- "Unknown 'runner' specified '%s', supported pipeline runners %s",
- runner,
- Sets.newTreeSet(SUPPORTED_PIPELINE_RUNNERS.keySet()));
- throw new IllegalArgumentException(msg, e);
- }
- }
- } else if ((returnType.isArray() && (SIMPLE_TYPES.contains(returnType.getComponentType())
- || returnType.getComponentType().isEnum()))
- || Collection.class.isAssignableFrom(returnType)) {
- // Split any strings with ","
- List<String> values = FluentIterable.from(entry.getValue())
- .transformAndConcat(new Function<String, Iterable<String>>() {
- @Override
- public Iterable<String> apply(String input) {
- return Arrays.asList(input.split(","));
- }
- }).toList();
-
- if (returnType.isArray() && !returnType.getComponentType().equals(String.class)) {
- for (String value : values) {
- Preconditions.checkArgument(!value.isEmpty(),
- "Empty argument value is only allowed for String, String Array, and Collection,"
- + " but received: " + returnType);
- }
- }
- convertedOptions.put(entry.getKey(), MAPPER.convertValue(values, type));
- } else if (SIMPLE_TYPES.contains(returnType) || returnType.isEnum()) {
- String value = Iterables.getOnlyElement(entry.getValue());
- Preconditions.checkArgument(returnType.equals(String.class) || !value.isEmpty(),
- "Empty argument value is only allowed for String, String Array, and Collection,"
- + " but received: " + returnType);
- convertedOptions.put(entry.getKey(), MAPPER.convertValue(value, type));
- } else {
- String value = Iterables.getOnlyElement(entry.getValue());
- Preconditions.checkArgument(returnType.equals(String.class) || !value.isEmpty(),
- "Empty argument value is only allowed for String, String Array, and Collection,"
- + " but received: " + returnType);
- try {
- convertedOptions.put(entry.getKey(), MAPPER.readValue(value, type));
- } catch (IOException e) {
- throw new IllegalArgumentException("Unable to parse JSON value " + value, e);
- }
- }
- } catch (IllegalArgumentException e) {
- if (strictParsing) {
- throw e;
- } else {
- LOG.warn("Strict parsing is disabled, ignoring option '{}' with value '{}' because {}",
- entry.getKey(), entry.getValue(), e.getMessage());
- }
- }
- }
- return convertedOptions;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/PipelineOptionsRegistrar.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/PipelineOptionsRegistrar.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/PipelineOptionsRegistrar.java
deleted file mode 100644
index 1678541..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/PipelineOptionsRegistrar.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import com.google.auto.service.AutoService;
-
-import java.util.ServiceLoader;
-
-/**
- * {@link PipelineOptions} creators have the ability to automatically have their
- * {@link PipelineOptions} registered with this SDK by creating a {@link ServiceLoader} entry
- * and a concrete implementation of this interface.
- *
- * <p>Note that automatic registration of any {@link PipelineOptions} requires users
- * conform to the limitations discussed on {@link PipelineOptionsFactory#register(Class)}.
- *
- * <p>It is optional but recommended to use one of the many build time tools such as
- * {@link AutoService} to generate the necessary META-INF files automatically.
- */
-public interface PipelineOptionsRegistrar {
- Iterable<Class<? extends PipelineOptions>> getPipelineOptions();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/PipelineOptionsValidator.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/PipelineOptionsValidator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/PipelineOptionsValidator.java
deleted file mode 100644
index b5612c4..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/PipelineOptionsValidator.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import com.google.cloud.dataflow.sdk.options.Validation.Required;
-import com.google.cloud.dataflow.sdk.util.common.ReflectHelpers;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Collections2;
-import com.google.common.collect.Ordering;
-import com.google.common.collect.SortedSetMultimap;
-import com.google.common.collect.TreeMultimap;
-
-import java.lang.reflect.Method;
-import java.lang.reflect.Proxy;
-import java.util.Collection;
-
-/**
- * Validates that the {@link PipelineOptions} conforms to all the {@link Validation} criteria.
- */
-public class PipelineOptionsValidator {
- /**
- * Validates that the passed {@link PipelineOptions} conforms to all the validation criteria from
- * the passed in interface.
- *
- * <p>Note that the interface requested must conform to the validation criteria specified on
- * {@link PipelineOptions#as(Class)}.
- *
- * @param klass The interface to fetch validation criteria from.
- * @param options The {@link PipelineOptions} to validate.
- * @return The passed in {@code options} viewed as the interface {@code klass}.
- */
- public static <T extends PipelineOptions> T validate(Class<T> klass, PipelineOptions options) {
- Preconditions.checkNotNull(klass);
- Preconditions.checkNotNull(options);
- Preconditions.checkArgument(Proxy.isProxyClass(options.getClass()));
- Preconditions.checkArgument(Proxy.getInvocationHandler(options)
- instanceof ProxyInvocationHandler);
-
- // Ensure the methods for T are registered on the ProxyInvocationHandler
- T asClassOptions = options.as(klass);
-
- ProxyInvocationHandler handler =
- (ProxyInvocationHandler) Proxy.getInvocationHandler(asClassOptions);
-
- SortedSetMultimap<String, Method> requiredGroups = TreeMultimap.create(
- Ordering.natural(), PipelineOptionsFactory.MethodNameComparator.INSTANCE);
- for (Method method : ReflectHelpers.getClosureOfMethodsOnInterface(klass)) {
- Required requiredAnnotation = method.getAnnotation(Validation.Required.class);
- if (requiredAnnotation != null) {
- if (requiredAnnotation.groups().length > 0) {
- for (String requiredGroup : requiredAnnotation.groups()) {
- requiredGroups.put(requiredGroup, method);
- }
- } else {
- Preconditions.checkArgument(handler.invoke(asClassOptions, method, null) != null,
- "Missing required value for [" + method + ", \"" + getDescription(method) + "\"]. ");
- }
- }
- }
-
- for (String requiredGroup : requiredGroups.keySet()) {
- if (!verifyGroup(handler, asClassOptions, requiredGroups.get(requiredGroup))) {
- throw new IllegalArgumentException("Missing required value for group [" + requiredGroup
- + "]. At least one of the following properties "
- + Collections2.transform(
- requiredGroups.get(requiredGroup), ReflectHelpers.METHOD_FORMATTER)
- + " required. Run with --help=" + klass.getSimpleName() + " for more information.");
- }
- }
-
- return asClassOptions;
- }
-
- private static boolean verifyGroup(ProxyInvocationHandler handler, PipelineOptions options,
- Collection<Method> requiredGroup) {
- for (Method m : requiredGroup) {
- if (handler.invoke(options, m, null) != null) {
- return true;
- }
- }
- return false;
- }
-
- private static String getDescription(Method method) {
- Description description = method.getAnnotation(Description.class);
- return description == null ? "" : description.value();
- }
-}
[42/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/FileBasedSource.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/FileBasedSource.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/FileBasedSource.java
deleted file mode 100644
index 5d32a9d..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/FileBasedSource.java
+++ /dev/null
@@ -1,648 +0,0 @@
-/*
- * Copyright (C) 2014 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.util.IOChannelFactory;
-import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Iterables;
-import com.google.common.util.concurrent.Futures;
-import com.google.common.util.concurrent.ListenableFuture;
-import com.google.common.util.concurrent.ListeningExecutorService;
-import com.google.common.util.concurrent.MoreExecutors;
-
-import org.joda.time.Instant;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.SeekableByteChannel;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.ListIterator;
-import java.util.NoSuchElementException;
-import java.util.concurrent.Callable;
-import java.util.concurrent.Executors;
-
-/**
- * A common base class for all file-based {@link Source}s. Extend this class to implement your own
- * file-based custom source.
- *
- * <p>A file-based {@code Source} is a {@code Source} backed by a file pattern defined as a Java
- * glob, a single file, or an offset range for a single file. See {@link OffsetBasedSource} and
- * {@link com.google.cloud.dataflow.sdk.io.range.RangeTracker} for semantics of offset ranges.
- *
- * <p>This source stores a {@code String} that is an {@link IOChannelFactory} specification for a
- * file or file pattern. There should be an {@code IOChannelFactory} defined for the file
- * specification provided. Please refer to {@link IOChannelUtils} and {@link IOChannelFactory} for
- * more information on this.
- *
- * <p>In addition to the methods left abstract from {@code BoundedSource}, subclasses must implement
- * methods to create a sub-source and a reader for a range of a single file -
- * {@link #createForSubrangeOfFile} and {@link #createSingleFileReader}. Please refer to
- * {@link XmlSource} for an example implementation of {@code FileBasedSource}.
- *
- * @param <T> Type of records represented by the source.
- */
-public abstract class FileBasedSource<T> extends OffsetBasedSource<T> {
- private static final Logger LOG = LoggerFactory.getLogger(FileBasedSource.class);
- private static final float FRACTION_OF_FILES_TO_STAT = 0.01f;
-
- // Package-private for testing
- static final int MAX_NUMBER_OF_FILES_FOR_AN_EXACT_STAT = 100;
-
- // Size of the thread pool to be used for performing file operations in parallel.
- // Package-private for testing.
- static final int THREAD_POOL_SIZE = 128;
-
- private final String fileOrPatternSpec;
- private final Mode mode;
-
- /**
- * A given {@code FileBasedSource} represents a file resource of one of these types.
- */
- public enum Mode {
- FILEPATTERN,
- SINGLE_FILE_OR_SUBRANGE
- }
-
- /**
- * Create a {@code FileBasedSource} based on a file or a file pattern specification. This
- * constructor must be used when creating a new {@code FileBasedSource} for a file pattern.
- *
- * <p>See {@link OffsetBasedSource} for a detailed description of {@code minBundleSize}.
- *
- * @param fileOrPatternSpec {@link IOChannelFactory} specification of file or file pattern
- * represented by the {@link FileBasedSource}.
- * @param minBundleSize minimum bundle size in bytes.
- */
- public FileBasedSource(String fileOrPatternSpec, long minBundleSize) {
- super(0, Long.MAX_VALUE, minBundleSize);
- mode = Mode.FILEPATTERN;
- this.fileOrPatternSpec = fileOrPatternSpec;
- }
-
- /**
- * Create a {@code FileBasedSource} based on a single file. This constructor must be used when
- * creating a new {@code FileBasedSource} for a subrange of a single file.
- * Additionally, this constructor must be used to create new {@code FileBasedSource}s when
- * subclasses implement the method {@link #createForSubrangeOfFile}.
- *
- * <p>See {@link OffsetBasedSource} for detailed descriptions of {@code minBundleSize},
- * {@code startOffset}, and {@code endOffset}.
- *
- * @param fileName {@link IOChannelFactory} specification of the file represented by the
- * {@link FileBasedSource}.
- * @param minBundleSize minimum bundle size in bytes.
- * @param startOffset starting byte offset.
- * @param endOffset ending byte offset. If the specified value {@code >= #getMaxEndOffset()} it
- * implies {@code #getMaxEndOffset()}.
- */
- public FileBasedSource(String fileName, long minBundleSize,
- long startOffset, long endOffset) {
- super(startOffset, endOffset, minBundleSize);
- mode = Mode.SINGLE_FILE_OR_SUBRANGE;
- this.fileOrPatternSpec = fileName;
- }
-
- public final String getFileOrPatternSpec() {
- return fileOrPatternSpec;
- }
-
- public final Mode getMode() {
- return mode;
- }
-
- @Override
- public final FileBasedSource<T> createSourceForSubrange(long start, long end) {
- Preconditions.checkArgument(mode != Mode.FILEPATTERN,
- "Cannot split a file pattern based source based on positions");
- Preconditions.checkArgument(start >= getStartOffset(), "Start offset value " + start
- + " of the subrange cannot be smaller than the start offset value " + getStartOffset()
- + " of the parent source");
- Preconditions.checkArgument(end <= getEndOffset(), "End offset value " + end
- + " of the subrange cannot be larger than the end offset value " + getEndOffset()
- + " of the parent source");
-
- FileBasedSource<T> source = createForSubrangeOfFile(fileOrPatternSpec, start, end);
- if (start > 0 || end != Long.MAX_VALUE) {
- Preconditions.checkArgument(source.getMode() == Mode.SINGLE_FILE_OR_SUBRANGE,
- "Source created for the range [" + start + "," + end + ")"
- + " must be a subrange source");
- }
- return source;
- }
-
- /**
- * Creates and returns a new {@code FileBasedSource} of the same type as the current
- * {@code FileBasedSource} backed by a given file and an offset range. When current source is
- * being split, this method is used to generate new sub-sources. When creating the source
- * subclasses must call the constructor {@link #FileBasedSource(String, long, long, long)} of
- * {@code FileBasedSource} with corresponding parameter values passed here.
- *
- * @param fileName file backing the new {@code FileBasedSource}.
- * @param start starting byte offset of the new {@code FileBasedSource}.
- * @param end ending byte offset of the new {@code FileBasedSource}. May be Long.MAX_VALUE,
- * in which case it will be inferred using {@link #getMaxEndOffset}.
- */
- protected abstract FileBasedSource<T> createForSubrangeOfFile(
- String fileName, long start, long end);
-
- /**
- * Creates and returns an instance of a {@code FileBasedReader} implementation for the current
- * source assuming the source represents a single file. File patterns will be handled by
- * {@code FileBasedSource} implementation automatically.
- */
- protected abstract FileBasedReader<T> createSingleFileReader(
- PipelineOptions options);
-
- @Override
- public final long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
- // This implementation of method getEstimatedSizeBytes is provided to simplify subclasses. Here
- // we perform the size estimation of files and file patterns using the interface provided by
- // IOChannelFactory.
-
- IOChannelFactory factory = IOChannelUtils.getFactory(fileOrPatternSpec);
- if (mode == Mode.FILEPATTERN) {
- // TODO Implement a more efficient parallel/batch size estimation mechanism for file patterns.
- long startTime = System.currentTimeMillis();
- long totalSize = 0;
- Collection<String> inputs = factory.match(fileOrPatternSpec);
- if (inputs.size() <= MAX_NUMBER_OF_FILES_FOR_AN_EXACT_STAT) {
- totalSize = getExactTotalSizeOfFiles(inputs, factory);
- LOG.debug("Size estimation of all files of pattern " + fileOrPatternSpec + " took "
- + (System.currentTimeMillis() - startTime) + " ms");
- } else {
- totalSize = getEstimatedSizeOfFilesBySampling(inputs, factory);
- LOG.debug("Size estimation of pattern " + fileOrPatternSpec + " by sampling took "
- + (System.currentTimeMillis() - startTime) + " ms");
- }
- return totalSize;
- } else {
- long start = getStartOffset();
- long end = Math.min(getEndOffset(), getMaxEndOffset(options));
- return end - start;
- }
- }
-
- // Get the exact total size of the given set of files.
- // Invokes multiple requests for size estimation in parallel using a thread pool.
- // TODO: replace this with bulk request API when it is available. Will require updates
- // to IOChannelFactory interface.
- private static long getExactTotalSizeOfFiles(
- Collection<String> files, IOChannelFactory ioChannelFactory) throws Exception {
- List<ListenableFuture<Long>> futures = new ArrayList<>();
- ListeningExecutorService service =
- MoreExecutors.listeningDecorator(Executors.newFixedThreadPool(THREAD_POOL_SIZE));
- long totalSize = 0;
- try {
- for (String file : files) {
- futures.add(createFutureForSizeEstimation(file, ioChannelFactory, service));
- }
-
- for (Long val : Futures.allAsList(futures).get()) {
- totalSize += val;
- }
-
- return totalSize;
- } finally {
- service.shutdown();
- }
- }
-
- private static ListenableFuture<Long> createFutureForSizeEstimation(
- final String file,
- final IOChannelFactory ioChannelFactory,
- ListeningExecutorService service) {
- return service.submit(
- new Callable<Long>() {
- @Override
- public Long call() throws Exception {
- return ioChannelFactory.getSizeBytes(file);
- }
- });
- }
-
- // Estimate the total size of the given set of files through sampling and extrapolation.
- // Currently we use uniform sampling which requires a linear sampling size for a reasonable
- // estimate.
- // TODO: Implement a more efficient sampling mechanism.
- private static long getEstimatedSizeOfFilesBySampling(
- Collection<String> files, IOChannelFactory ioChannelFactory) throws Exception {
- int sampleSize = (int) (FRACTION_OF_FILES_TO_STAT * files.size());
- sampleSize = Math.max(MAX_NUMBER_OF_FILES_FOR_AN_EXACT_STAT, sampleSize);
-
- List<String> selectedFiles = new ArrayList<String>(files);
- Collections.shuffle(selectedFiles);
- selectedFiles = selectedFiles.subList(0, sampleSize);
-
- return files.size() * getExactTotalSizeOfFiles(selectedFiles, ioChannelFactory)
- / selectedFiles.size();
- }
-
- private ListenableFuture<List<? extends FileBasedSource<T>>> createFutureForFileSplit(
- final String file,
- final long desiredBundleSizeBytes,
- final PipelineOptions options,
- ListeningExecutorService service) {
- return service.submit(new Callable<List<? extends FileBasedSource<T>>>() {
- @Override
- public List<? extends FileBasedSource<T>> call() throws Exception {
- return createForSubrangeOfFile(file, 0, Long.MAX_VALUE)
- .splitIntoBundles(desiredBundleSizeBytes, options);
- }
- });
- }
-
- @Override
- public final List<? extends FileBasedSource<T>> splitIntoBundles(
- long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
- // This implementation of method splitIntoBundles is provided to simplify subclasses. Here we
- // split a FileBasedSource based on a file pattern to FileBasedSources based on full single
- // files. For files that can be efficiently seeked, we further split FileBasedSources based on
- // those files to FileBasedSources based on sub ranges of single files.
-
- if (mode == Mode.FILEPATTERN) {
- long startTime = System.currentTimeMillis();
- List<ListenableFuture<List<? extends FileBasedSource<T>>>> futures = new ArrayList<>();
-
- ListeningExecutorService service =
- MoreExecutors.listeningDecorator(Executors.newFixedThreadPool(THREAD_POOL_SIZE));
- try {
- for (final String file : FileBasedSource.expandFilePattern(fileOrPatternSpec)) {
- futures.add(createFutureForFileSplit(file, desiredBundleSizeBytes, options, service));
- }
- List<? extends FileBasedSource<T>> splitResults =
- ImmutableList.copyOf(Iterables.concat(Futures.allAsList(futures).get()));
- LOG.debug(
- "Splitting the source based on file pattern "
- + fileOrPatternSpec
- + " took "
- + (System.currentTimeMillis() - startTime)
- + " ms");
- return splitResults;
- } finally {
- service.shutdown();
- }
- } else {
- if (isSplittable()) {
- List<FileBasedSource<T>> splitResults = new ArrayList<>();
- for (OffsetBasedSource<T> split :
- super.splitIntoBundles(desiredBundleSizeBytes, options)) {
- splitResults.add((FileBasedSource<T>) split);
- }
- return splitResults;
- } else {
- LOG.debug("The source for file " + fileOrPatternSpec
- + " is not split into sub-range based sources since the file is not seekable");
- return ImmutableList.of(this);
- }
- }
- }
-
- /**
- * Determines whether a file represented by this source can be split into bundles.
- *
- * <p>By default, a file is splittable if it is on a file system that supports efficient read
- * seeking. Subclasses may override to provide different behavior.
- */
- protected boolean isSplittable() throws Exception {
- // We split a file-based source into subranges only if the file is efficiently seekable.
- // If a file is not efficiently seekable it would be highly inefficient to create and read a
- // source based on a subrange of that file.
- IOChannelFactory factory = IOChannelUtils.getFactory(fileOrPatternSpec);
- return factory.isReadSeekEfficient(fileOrPatternSpec);
- }
-
- @Override
- public final BoundedReader<T> createReader(PipelineOptions options) throws IOException {
- // Validate the current source prior to creating a reader for it.
- this.validate();
-
- if (mode == Mode.FILEPATTERN) {
- long startTime = System.currentTimeMillis();
- Collection<String> files = FileBasedSource.expandFilePattern(fileOrPatternSpec);
- List<FileBasedReader<T>> fileReaders = new ArrayList<>();
- for (String fileName : files) {
- long endOffset;
- try {
- endOffset = IOChannelUtils.getFactory(fileName).getSizeBytes(fileName);
- } catch (IOException e) {
- LOG.warn("Failed to get size of " + fileName, e);
- endOffset = Long.MAX_VALUE;
- }
- fileReaders.add(
- createForSubrangeOfFile(fileName, 0, endOffset).createSingleFileReader(options));
- }
- LOG.debug("Creating a reader for file pattern " + fileOrPatternSpec + " took "
- + (System.currentTimeMillis() - startTime) + " ms");
- if (fileReaders.size() == 1) {
- return fileReaders.get(0);
- }
- return new FilePatternReader(this, fileReaders);
- } else {
- return createSingleFileReader(options);
- }
- }
-
- @Override
- public String toString() {
- switch (mode) {
- case FILEPATTERN:
- return fileOrPatternSpec;
- case SINGLE_FILE_OR_SUBRANGE:
- return fileOrPatternSpec + " range " + super.toString();
- default:
- throw new IllegalStateException("Unexpected mode: " + mode);
- }
- }
-
- @Override
- public void validate() {
- super.validate();
- switch (mode) {
- case FILEPATTERN:
- Preconditions.checkArgument(getStartOffset() == 0,
- "FileBasedSource is based on a file pattern or a full single file "
- + "but the starting offset proposed " + getStartOffset() + " is not zero");
- Preconditions.checkArgument(getEndOffset() == Long.MAX_VALUE,
- "FileBasedSource is based on a file pattern or a full single file "
- + "but the ending offset proposed " + getEndOffset() + " is not Long.MAX_VALUE");
- break;
- case SINGLE_FILE_OR_SUBRANGE:
- // Nothing more to validate.
- break;
- default:
- throw new IllegalStateException("Unknown mode: " + mode);
- }
- }
-
- @Override
- public final long getMaxEndOffset(PipelineOptions options) throws Exception {
- if (mode == Mode.FILEPATTERN) {
- throw new IllegalArgumentException("Cannot determine the exact end offset of a file pattern");
- }
- if (getEndOffset() == Long.MAX_VALUE) {
- IOChannelFactory factory = IOChannelUtils.getFactory(fileOrPatternSpec);
- return factory.getSizeBytes(fileOrPatternSpec);
- } else {
- return getEndOffset();
- }
- }
-
- protected static final Collection<String> expandFilePattern(String fileOrPatternSpec)
- throws IOException {
- IOChannelFactory factory = IOChannelUtils.getFactory(fileOrPatternSpec);
- Collection<String> matches = factory.match(fileOrPatternSpec);
- LOG.info("Matched {} files for pattern {}", matches.size(), fileOrPatternSpec);
- return matches;
- }
-
- /**
- * A {@link Source.Reader reader} that implements code common to readers of
- * {@code FileBasedSource}s.
- *
- * <h2>Seekability</h2>
- *
- * <p>This reader uses a {@link ReadableByteChannel} created for the file represented by the
- * corresponding source to efficiently move to the correct starting position defined in the
- * source. Subclasses of this reader should implement {@link #startReading} to get access to this
- * channel. If the source corresponding to the reader is for a subrange of a file the
- * {@code ReadableByteChannel} provided is guaranteed to be an instance of the type
- * {@link SeekableByteChannel}, which may be used by subclass to traverse back in the channel to
- * determine the correct starting position.
- *
- * <h2>Reading Records</h2>
- *
- * <p>Sequential reading is implemented using {@link #readNextRecord}.
- *
- * <p>Then {@code FileBasedReader} implements "reading a range [A, B)" in the following way.
- * <ol>
- * <li>{@link #start} opens the file
- * <li>{@link #start} seeks the {@code SeekableByteChannel} to A (reading offset ranges for
- * non-seekable files is not supported) and calls {@code startReading()}
- * <li>{@link #start} calls {@link #advance} once, which, via {@link #readNextRecord},
- * locates the first record which is at a split point AND its offset is at or after A.
- * If this record is at or after B, {@link #advance} returns false and reading is finished.
- * <li>if the previous advance call returned {@code true} sequential reading starts and
- * {@code advance()} will be called repeatedly
- * </ol>
- * {@code advance()} calls {@code readNextRecord()} on the subclass, and stops (returns false) if
- * the new record is at a split point AND the offset of the new record is at or after B.
- *
- * <h2>Thread Safety</h2>
- *
- * <p>Since this class implements {@link Source.Reader} it guarantees thread safety. Abstract
- * methods defined here will not be accessed by more than one thread concurrently.
- */
- public abstract static class FileBasedReader<T> extends OffsetBasedReader<T> {
- private ReadableByteChannel channel = null;
-
- /**
- * Subclasses should not perform IO operations at the constructor. All IO operations should be
- * delayed until the {@link #startReading} method is invoked.
- */
- public FileBasedReader(FileBasedSource<T> source) {
- super(source);
- Preconditions.checkArgument(source.getMode() != Mode.FILEPATTERN,
- "FileBasedReader does not support reading file patterns");
- }
-
- @Override
- public FileBasedSource<T> getCurrentSource() {
- return (FileBasedSource<T>) super.getCurrentSource();
- }
-
- @Override
- protected final boolean startImpl() throws IOException {
- FileBasedSource<T> source = getCurrentSource();
- IOChannelFactory factory = IOChannelUtils.getFactory(source.getFileOrPatternSpec());
- this.channel = factory.open(source.getFileOrPatternSpec());
-
- if (channel instanceof SeekableByteChannel) {
- SeekableByteChannel seekChannel = (SeekableByteChannel) channel;
- seekChannel.position(source.getStartOffset());
- } else {
- // Channel is not seekable. Must not be a subrange.
- Preconditions.checkArgument(source.mode != Mode.SINGLE_FILE_OR_SUBRANGE,
- "Subrange-based sources must only be defined for file types that support seekable "
- + " read channels");
- Preconditions.checkArgument(source.getStartOffset() == 0, "Start offset "
- + source.getStartOffset()
- + " is not zero but channel for reading the file is not seekable.");
- }
-
- startReading(channel);
-
- // Advance once to load the first record.
- return advanceImpl();
- }
-
- @Override
- protected final boolean advanceImpl() throws IOException {
- return readNextRecord();
- }
-
- /**
- * Closes any {@link ReadableByteChannel} created for the current reader. This implementation is
- * idempotent. Any {@code close()} method introduced by a subclass must be idempotent and must
- * call the {@code close()} method in the {@code FileBasedReader}.
- */
- @Override
- public void close() throws IOException {
- if (channel != null) {
- channel.close();
- }
- }
-
- /**
- * Performs any initialization of the subclass of {@code FileBasedReader} that involves IO
- * operations. Will only be invoked once and before that invocation the base class will seek the
- * channel to the source's starting offset.
- *
- * <p>Provided {@link ReadableByteChannel} is for the file represented by the source of this
- * reader. Subclass may use the {@code channel} to build a higher level IO abstraction, e.g., a
- * BufferedReader or an XML parser.
- *
- * <p>If the corresponding source is for a subrange of a file, {@code channel} is guaranteed to
- * be an instance of the type {@link SeekableByteChannel}.
- *
- * <p>After this method is invoked the base class will not be reading data from the channel or
- * adjusting the position of the channel. But the base class is responsible for properly closing
- * the channel.
- *
- * @param channel a byte channel representing the file backing the reader.
- */
- protected abstract void startReading(ReadableByteChannel channel) throws IOException;
-
- /**
- * Reads the next record from the channel provided by {@link #startReading}. Methods
- * {@link #getCurrent}, {@link #getCurrentOffset}, and {@link #isAtSplitPoint()} should return
- * the corresponding information about the record read by the last invocation of this method.
- *
- * <p>Note that this method will be called the same way for reading the first record in the
- * source (file or offset range in the file) and for reading subsequent records. It is up to the
- * subclass to do anything special for locating and reading the first record, if necessary.
- *
- * @return {@code true} if a record was successfully read, {@code false} if the end of the
- * channel was reached before successfully reading a new record.
- */
- protected abstract boolean readNextRecord() throws IOException;
- }
-
- // An internal Reader implementation that concatenates a sequence of FileBasedReaders.
- private class FilePatternReader extends BoundedReader<T> {
- private final FileBasedSource<T> source;
- private final List<FileBasedReader<T>> fileReaders;
- final ListIterator<FileBasedReader<T>> fileReadersIterator;
- FileBasedReader<T> currentReader = null;
-
- public FilePatternReader(FileBasedSource<T> source, List<FileBasedReader<T>> fileReaders) {
- this.source = source;
- this.fileReaders = fileReaders;
- this.fileReadersIterator = fileReaders.listIterator();
- }
-
- @Override
- public boolean start() throws IOException {
- return startNextNonemptyReader();
- }
-
- @Override
- public boolean advance() throws IOException {
- Preconditions.checkState(currentReader != null, "Call start() before advance()");
- if (currentReader.advance()) {
- return true;
- }
- return startNextNonemptyReader();
- }
-
- private boolean startNextNonemptyReader() throws IOException {
- while (fileReadersIterator.hasNext()) {
- currentReader = fileReadersIterator.next();
- if (currentReader.start()) {
- return true;
- }
- currentReader.close();
- }
- return false;
- }
-
- @Override
- public T getCurrent() throws NoSuchElementException {
- // A NoSuchElementException will be thrown by the last FileBasedReader if getCurrent() is
- // called after advance() returns false.
- return currentReader.getCurrent();
- }
-
- @Override
- public Instant getCurrentTimestamp() throws NoSuchElementException {
- // A NoSuchElementException will be thrown by the last FileBasedReader if
- // getCurrentTimestamp() is called after advance() returns false.
- return currentReader.getCurrentTimestamp();
- }
-
- @Override
- public void close() throws IOException {
- // Close all readers that may not yet have been closed.
- // If this reader has not been started, currentReader is null.
- if (currentReader != null) {
- currentReader.close();
- }
- while (fileReadersIterator.hasNext()) {
- fileReadersIterator.next().close();
- }
- }
-
- @Override
- public FileBasedSource<T> getCurrentSource() {
- return source;
- }
-
- @Override
- public FileBasedSource<T> splitAtFraction(double fraction) {
- // Unsupported. TODO: implement.
- LOG.debug("Dynamic splitting of FilePatternReader is unsupported.");
- return null;
- }
-
- @Override
- public Double getFractionConsumed() {
- if (currentReader == null) {
- return 0.0;
- }
- if (fileReaders.isEmpty()) {
- return 1.0;
- }
- int index = fileReadersIterator.previousIndex();
- int numReaders = fileReaders.size();
- if (index == numReaders) {
- return 1.0;
- }
- double before = 1.0 * index / numReaders;
- double after = 1.0 * (index + 1) / numReaders;
- Double fractionOfCurrentReader = currentReader.getFractionConsumed();
- if (fractionOfCurrentReader == null) {
- return before;
- }
- return before + fractionOfCurrentReader * (after - before);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/OffsetBasedSource.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/OffsetBasedSource.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/OffsetBasedSource.java
deleted file mode 100644
index d581b80..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/OffsetBasedSource.java
+++ /dev/null
@@ -1,326 +0,0 @@
-/*
- * Copyright (C) 2014 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import com.google.cloud.dataflow.sdk.io.range.OffsetRangeTracker;
-import com.google.cloud.dataflow.sdk.io.range.RangeTracker;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.common.base.Preconditions;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.NoSuchElementException;
-
-/**
- * A {@link BoundedSource} that uses offsets to define starting and ending positions.
- *
- * <p>{@link OffsetBasedSource} is a common base class for all bounded sources where the input can
- * be represented as a single range, and an input can be efficiently processed in parallel by
- * splitting the range into a set of disjoint ranges whose union is the original range. This class
- * should be used for sources that can be cheaply read starting at any given offset.
- * {@link OffsetBasedSource} stores the range and implements splitting into bundles.
- *
- * <p>Extend {@link OffsetBasedSource} to implement your own offset-based custom source.
- * {@link FileBasedSource}, which is a subclass of this, adds additional functionality useful for
- * custom sources that are based on files. If possible, implementors should start from
- * {@link FileBasedSource} instead of {@link OffsetBasedSource}.
- *
- * <p>Consult {@link RangeTracker} for important semantics common to all sources defined by a range
- * of positions of a certain type, including the semantics of split points
- * ({@link OffsetBasedReader#isAtSplitPoint}).
- *
- * @param <T> Type of records represented by the source.
- * @see BoundedSource
- * @see FileBasedSource
- * @see RangeTracker
- */
-public abstract class OffsetBasedSource<T> extends BoundedSource<T> {
- private final long startOffset;
- private final long endOffset;
- private final long minBundleSize;
-
- /**
- * @param startOffset starting offset (inclusive) of the source. Must be non-negative.
- *
- * @param endOffset ending offset (exclusive) of the source. Use {@link Long#MAX_VALUE} to
- * indicate that the entire source after {@code startOffset} should be read. Must be
- * {@code > startOffset}.
- *
- * @param minBundleSize minimum bundle size in offset units that should be used when splitting the
- * source into sub-sources. This value may not be respected if the total
- * range of the source is smaller than the specified {@code minBundleSize}.
- * Must be non-negative.
- */
- public OffsetBasedSource(long startOffset, long endOffset, long minBundleSize) {
- this.startOffset = startOffset;
- this.endOffset = endOffset;
- this.minBundleSize = minBundleSize;
- }
-
- /**
- * Returns the starting offset of the source.
- */
- public long getStartOffset() {
- return startOffset;
- }
-
- /**
- * Returns the specified ending offset of the source. Any returned value greater than or equal to
- * {@link #getMaxEndOffset(PipelineOptions)} should be treated as
- * {@link #getMaxEndOffset(PipelineOptions)}.
- */
- public long getEndOffset() {
- return endOffset;
- }
-
- /**
- * Returns the minimum bundle size that should be used when splitting the source into sub-sources.
- * This value may not be respected if the total range of the source is smaller than the specified
- * {@code minBundleSize}.
- */
- public long getMinBundleSize() {
- return minBundleSize;
- }
-
- @Override
- public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
- long trueEndOffset = (endOffset == Long.MAX_VALUE) ? getMaxEndOffset(options) : endOffset;
- return getBytesPerOffset() * (trueEndOffset - getStartOffset());
- }
-
- @Override
- public List<? extends OffsetBasedSource<T>> splitIntoBundles(
- long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
- // Split the range into bundles based on the desiredBundleSizeBytes. Final bundle is adjusted to
- // make sure that we do not end up with a too small bundle at the end. If the desired bundle
- // size is smaller than the minBundleSize of the source then minBundleSize will be used instead.
-
- long desiredBundleSizeOffsetUnits = Math.max(
- Math.max(1, desiredBundleSizeBytes / getBytesPerOffset()),
- minBundleSize);
-
- List<OffsetBasedSource<T>> subSources = new ArrayList<>();
- long start = startOffset;
- long maxEnd = Math.min(endOffset, getMaxEndOffset(options));
-
- while (start < maxEnd) {
- long end = start + desiredBundleSizeOffsetUnits;
- end = Math.min(end, maxEnd);
- // Avoid having a too small bundle at the end and ensure that we respect minBundleSize.
- long remaining = maxEnd - end;
- if ((remaining < desiredBundleSizeOffsetUnits / 4) || (remaining < minBundleSize)) {
- end = maxEnd;
- }
- subSources.add(createSourceForSubrange(start, end));
-
- start = end;
- }
- return subSources;
- }
-
- @Override
- public void validate() {
- Preconditions.checkArgument(
- this.startOffset >= 0,
- "Start offset has value %s, must be non-negative", this.startOffset);
- Preconditions.checkArgument(
- this.endOffset >= 0,
- "End offset has value %s, must be non-negative", this.endOffset);
- Preconditions.checkArgument(
- this.startOffset < this.endOffset,
- "Start offset %s must be before end offset %s",
- this.startOffset, this.endOffset);
- Preconditions.checkArgument(
- this.minBundleSize >= 0,
- "minBundleSize has value %s, must be non-negative",
- this.minBundleSize);
- }
-
- @Override
- public String toString() {
- return "[" + startOffset + ", " + endOffset + ")";
- }
-
- /**
- * Returns approximately how many bytes of data correspond to a single offset in this source.
- * Used for translation between this source's range and methods defined in terms of bytes, such
- * as {@link #getEstimatedSizeBytes} and {@link #splitIntoBundles}.
- *
- * <p>Defaults to {@code 1} byte, which is the common case for, e.g., file sources.
- */
- public long getBytesPerOffset() {
- return 1L;
- }
-
- /**
- * Returns the actual ending offset of the current source. The value returned by this function
- * will be used to clip the end of the range {@code [startOffset, endOffset)} such that the
- * range used is {@code [startOffset, min(endOffset, maxEndOffset))}.
- *
- * <p>As an example in which {@link OffsetBasedSource} is used to implement a file source, suppose
- * that this source was constructed with an {@code endOffset} of {@link Long#MAX_VALUE} to
- * indicate that a file should be read to the end. Then {@link #getMaxEndOffset} should determine
- * the actual, exact size of the file in bytes and return it.
- */
- public abstract long getMaxEndOffset(PipelineOptions options) throws Exception;
-
- /**
- * Returns an {@link OffsetBasedSource} for a subrange of the current source. The
- * subrange {@code [start, end)} must be within the range {@code [startOffset, endOffset)} of
- * the current source, i.e. {@code startOffset <= start < end <= endOffset}.
- */
- public abstract OffsetBasedSource<T> createSourceForSubrange(long start, long end);
-
- /**
- * Whether this source should allow dynamic splitting of the offset ranges.
- *
- * <p>True by default. Override this to return false if the source cannot
- * support dynamic splitting correctly. If this returns false,
- * {@link OffsetBasedSource.OffsetBasedReader#splitAtFraction} will refuse all split requests.
- */
- public boolean allowsDynamicSplitting() {
- return true;
- }
-
- /**
- * A {@link Source.Reader} that implements code common to readers of all
- * {@link OffsetBasedSource}s.
- *
- * <p>Subclasses have to implement:
- * <ul>
- * <li>The methods {@link #startImpl} and {@link #advanceImpl} for reading the
- * first or subsequent records.
- * <li>The methods {@link #getCurrent}, {@link #getCurrentOffset}, and optionally
- * {@link #isAtSplitPoint} and {@link #getCurrentTimestamp} to access properties of
- * the last record successfully read by {@link #startImpl} or {@link #advanceImpl}.
- * </ul>
- */
- public abstract static class OffsetBasedReader<T> extends BoundedReader<T> {
- private static final Logger LOG = LoggerFactory.getLogger(OffsetBasedReader.class);
-
- private OffsetBasedSource<T> source;
-
- /** The {@link OffsetRangeTracker} managing the range and current position of the source. */
- private final OffsetRangeTracker rangeTracker;
-
- /**
- * @param source the {@link OffsetBasedSource} to be read by the current reader.
- */
- public OffsetBasedReader(OffsetBasedSource<T> source) {
- this.source = source;
- this.rangeTracker = new OffsetRangeTracker(source.getStartOffset(), source.getEndOffset());
- }
-
- /**
- * Returns the <i>starting</i> offset of the {@link Source.Reader#getCurrent current record},
- * which has been read by the last successful {@link Source.Reader#start} or
- * {@link Source.Reader#advance} call.
- * <p>If no such call has been made yet, the return value is unspecified.
- * <p>See {@link RangeTracker} for description of offset semantics.
- */
- protected abstract long getCurrentOffset() throws NoSuchElementException;
-
- /**
- * Returns whether the current record is at a split point (i.e., whether the current record
- * would be the first record to be read by a source with a specified start offset of
- * {@link #getCurrentOffset}).
- *
- * <p>See detailed documentation about split points in {@link RangeTracker}.
- */
- protected boolean isAtSplitPoint() throws NoSuchElementException {
- return true;
- }
-
- @Override
- public final boolean start() throws IOException {
- return startImpl() && rangeTracker.tryReturnRecordAt(isAtSplitPoint(), getCurrentOffset());
- }
-
- @Override
- public final boolean advance() throws IOException {
- return advanceImpl() && rangeTracker.tryReturnRecordAt(isAtSplitPoint(), getCurrentOffset());
- }
-
- /**
- * Initializes the {@link OffsetBasedSource.OffsetBasedReader} and advances to the first record,
- * returning {@code true} if there is a record available to be read. This method will be
- * invoked exactly once and may perform expensive setup operations that are needed to
- * initialize the reader.
- *
- * <p>This function is the {@code OffsetBasedReader} implementation of
- * {@link BoundedReader#start}. The key difference is that the implementor can ignore the
- * possibility that it should no longer produce the first record, either because it has exceeded
- * the original {@code endOffset} assigned to the reader, or because a concurrent call to
- * {@link #splitAtFraction} has changed the source to shrink the offset range being read.
- *
- * @see BoundedReader#start
- */
- protected abstract boolean startImpl() throws IOException;
-
- /**
- * Advances to the next record and returns {@code true}, or returns false if there is no next
- * record.
- *
- * <p>This function is the {@code OffsetBasedReader} implementation of
- * {@link BoundedReader#advance}. The key difference is that the implementor can ignore the
- * possibility that it should no longer produce the next record, either because it has exceeded
- * the original {@code endOffset} assigned to the reader, or because a concurrent call to
- * {@link #splitAtFraction} has changed the source to shrink the offset range being read.
- *
- * @see BoundedReader#advance
- */
- protected abstract boolean advanceImpl() throws IOException;
-
- @Override
- public synchronized OffsetBasedSource<T> getCurrentSource() {
- return source;
- }
-
- @Override
- public Double getFractionConsumed() {
- return rangeTracker.getFractionConsumed();
- }
-
- @Override
- public final synchronized OffsetBasedSource<T> splitAtFraction(double fraction) {
- if (!getCurrentSource().allowsDynamicSplitting()) {
- return null;
- }
- if (rangeTracker.getStopPosition() == Long.MAX_VALUE) {
- LOG.debug(
- "Refusing to split unbounded OffsetBasedReader {} at fraction {}",
- rangeTracker, fraction);
- return null;
- }
- long splitOffset = rangeTracker.getPositionForFractionConsumed(fraction);
- LOG.debug(
- "Proposing to split OffsetBasedReader {} at fraction {} (offset {})",
- rangeTracker, fraction, splitOffset);
- if (!rangeTracker.trySplitAtPosition(splitOffset)) {
- return null;
- }
- long start = source.getStartOffset();
- long end = source.getEndOffset();
- OffsetBasedSource<T> primary = source.createSourceForSubrange(start, splitOffset);
- OffsetBasedSource<T> residual = source.createSourceForSubrange(splitOffset, end);
- this.source = primary;
- return residual;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/PubsubIO.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/PubsubIO.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/PubsubIO.java
deleted file mode 100644
index 653b31f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/PubsubIO.java
+++ /dev/null
@@ -1,1044 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import static com.google.common.base.MoreObjects.firstNonNull;
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.google.api.client.util.Clock;
-import com.google.api.client.util.DateTime;
-import com.google.api.services.pubsub.Pubsub;
-import com.google.api.services.pubsub.model.AcknowledgeRequest;
-import com.google.api.services.pubsub.model.PublishRequest;
-import com.google.api.services.pubsub.model.PubsubMessage;
-import com.google.api.services.pubsub.model.PullRequest;
-import com.google.api.services.pubsub.model.PullResponse;
-import com.google.api.services.pubsub.model.ReceivedMessage;
-import com.google.api.services.pubsub.model.Subscription;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
-import com.google.cloud.dataflow.sdk.coders.VoidCoder;
-import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
-import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
-import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.windowing.AfterWatermark;
-import com.google.cloud.dataflow.sdk.util.CoderUtils;
-import com.google.cloud.dataflow.sdk.util.Transport;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded;
-import com.google.cloud.dataflow.sdk.values.PDone;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Throwables;
-import com.google.common.collect.ImmutableMap;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import javax.annotation.Nullable;
-
-/**
- * Read and Write {@link PTransform}s for Cloud Pub/Sub streams. These transforms create
- * and consume unbounded {@link PCollection PCollections}.
- *
- * <h3>Permissions</h3>
- * <p>Permission requirements depend on the {@link PipelineRunner} that is used to execute the
- * Dataflow job. Please refer to the documentation of corresponding
- * {@link PipelineRunner PipelineRunners} for more details.
- */
-public class PubsubIO {
- private static final Logger LOG = LoggerFactory.getLogger(PubsubIO.class);
-
- /** The default {@link Coder} used to translate to/from Cloud Pub/Sub messages. */
- public static final Coder<String> DEFAULT_PUBSUB_CODER = StringUtf8Coder.of();
-
- /**
- * Project IDs must contain 6-63 lowercase letters, digits, or dashes.
- * IDs must start with a letter and may not end with a dash.
- * This regex isn't exact - this allows for patterns that would be rejected by
- * the service, but this is sufficient for basic parsing of Pub/Sub resource paths.
- */
- private static final Pattern PROJECT_ID_REGEXP =
- Pattern.compile("[a-z][-a-z0-9:.]{4,61}[a-z0-9]");
-
- private static final Pattern SUBSCRIPTION_REGEXP =
- Pattern.compile("projects/([^/]+)/subscriptions/(.+)");
-
- private static final Pattern TOPIC_REGEXP = Pattern.compile("projects/([^/]+)/topics/(.+)");
-
- private static final Pattern V1BETA1_SUBSCRIPTION_REGEXP =
- Pattern.compile("/subscriptions/([^/]+)/(.+)");
-
- private static final Pattern V1BETA1_TOPIC_REGEXP = Pattern.compile("/topics/([^/]+)/(.+)");
-
- private static final Pattern PUBSUB_NAME_REGEXP = Pattern.compile("[a-zA-Z][-._~%+a-zA-Z0-9]+");
-
- private static final int PUBSUB_NAME_MAX_LENGTH = 255;
-
- private static final String SUBSCRIPTION_RANDOM_TEST_PREFIX = "_random/";
- private static final String SUBSCRIPTION_STARTING_SIGNAL = "_starting_signal/";
- private static final String TOPIC_DEV_NULL_TEST_NAME = "/topics/dev/null";
-
- private static void validateProjectName(String project) {
- Matcher match = PROJECT_ID_REGEXP.matcher(project);
- if (!match.matches()) {
- throw new IllegalArgumentException(
- "Illegal project name specified in Pubsub subscription: " + project);
- }
- }
-
- private static void validatePubsubName(String name) {
- if (name.length() > PUBSUB_NAME_MAX_LENGTH) {
- throw new IllegalArgumentException(
- "Pubsub object name is longer than 255 characters: " + name);
- }
-
- if (name.startsWith("goog")) {
- throw new IllegalArgumentException("Pubsub object name cannot start with goog: " + name);
- }
-
- Matcher match = PUBSUB_NAME_REGEXP.matcher(name);
- if (!match.matches()) {
- throw new IllegalArgumentException("Illegal Pubsub object name specified: " + name
- + " Please see Javadoc for naming rules.");
- }
- }
-
- /**
- * Returns the {@link Instant} that corresponds to the timestamp in the supplied
- * {@link PubsubMessage} under the specified {@code label}. See
- * {@link PubsubIO.Read#timestampLabel(String)} for details about how these messages are
- * parsed.
- *
- * <p>The {@link Clock} parameter is used to virtualize time for testing.
- *
- * @throws IllegalArgumentException if the timestamp label is provided, but there is no
- * corresponding attribute in the message or the value provided is not a valid timestamp
- * string.
- * @see PubsubIO.Read#timestampLabel(String)
- */
- @VisibleForTesting
- protected static Instant assignMessageTimestamp(
- PubsubMessage message, @Nullable String label, Clock clock) {
- if (label == null) {
- return new Instant(clock.currentTimeMillis());
- }
-
- // Extract message attributes, defaulting to empty map if null.
- Map<String, String> attributes = firstNonNull(
- message.getAttributes(), ImmutableMap.<String, String>of());
-
- String timestampStr = attributes.get(label);
- checkArgument(timestampStr != null && !timestampStr.isEmpty(),
- "PubSub message is missing a timestamp in label: %s", label);
-
- long millisSinceEpoch;
- try {
- // Try parsing as milliseconds since epoch. An RFC 3339 string can never be parsed as a
- // long, so Long.parseLong throws NumberFormatException (an IllegalArgumentException)
- // for it; we use that to fall back to RFC 3339 parsing below.
- millisSinceEpoch = Long.parseLong(timestampStr);
- } catch (IllegalArgumentException e) {
- // Try parsing as RFC3339 string. DateTime.parseRfc3339 will throw an IllegalArgumentException
- // if parsing fails, and the caller should handle.
- millisSinceEpoch = DateTime.parseRfc3339(timestampStr).getValue();
- }
- return new Instant(millisSinceEpoch);
- }
-
- /**
- * Class representing a Cloud Pub/Sub Subscription.
- */
- public static class PubsubSubscription implements Serializable {
- private enum Type { NORMAL, FAKE }
-
- private final Type type;
- private final String project;
- private final String subscription;
-
- private PubsubSubscription(Type type, String project, String subscription) {
- this.type = type;
- this.project = project;
- this.subscription = subscription;
- }
-
- /**
- * Creates a class representing a Pub/Sub subscription from the specified subscription path.
- *
- * <p>Cloud Pub/Sub subscription names should be of the form
- * {@code projects/<project>/subscriptions/<subscription>}, where {@code <project>} is the name
- * of the project the subscription belongs to. The {@code <subscription>} component must comply
- * with the following requirements:
- *
- * <ul>
- * <li>Can only contain lowercase letters, numbers, dashes ('-'), underscores ('_') and periods
- * ('.').</li>
- * <li>Must be between 3 and 255 characters.</li>
- * <li>Must begin with a letter.</li>
- * <li>Must end with a letter or a number.</li>
- * <li>Cannot begin with {@code 'goog'} prefix.</li>
- * </ul>
- */
- public static PubsubSubscription fromPath(String path) {
- if (path.startsWith(SUBSCRIPTION_RANDOM_TEST_PREFIX)
- || path.startsWith(SUBSCRIPTION_STARTING_SIGNAL)) {
- return new PubsubSubscription(Type.FAKE, "", path);
- }
-
- String projectName, subscriptionName;
-
- Matcher v1beta1Match = V1BETA1_SUBSCRIPTION_REGEXP.matcher(path);
- if (v1beta1Match.matches()) {
- LOG.warn("Saw subscription in v1beta1 format. Subscriptions should be in the format "
- + "projects/<project_id>/subscriptions/<subscription_name>");
- projectName = v1beta1Match.group(1);
- subscriptionName = v1beta1Match.group(2);
- } else {
- Matcher match = SUBSCRIPTION_REGEXP.matcher(path);
- if (!match.matches()) {
- throw new IllegalArgumentException("Pubsub subscription is not in "
- + "projects/<project_id>/subscriptions/<subscription_name> format: " + path);
- }
- projectName = match.group(1);
- subscriptionName = match.group(2);
- }
-
- validateProjectName(projectName);
- validatePubsubName(subscriptionName);
- return new PubsubSubscription(Type.NORMAL, projectName, subscriptionName);
- }
-
- /**
- * Returns the string representation of this subscription as a path used in the Cloud Pub/Sub
- * v1beta1 API.
- *
- * @deprecated the v1beta1 API for Cloud Pub/Sub is deprecated.
- */
- @Deprecated
- public String asV1Beta1Path() {
- if (type == Type.NORMAL) {
- return "/subscriptions/" + project + "/" + subscription;
- } else {
- return subscription;
- }
- }
-
- /**
- * Returns the string representation of this subscription as a path used in the Cloud Pub/Sub
- * v1beta2 API.
- *
- * @deprecated the v1beta2 API for Cloud Pub/Sub is deprecated.
- */
- @Deprecated
- public String asV1Beta2Path() {
- if (type == Type.NORMAL) {
- return "projects/" + project + "/subscriptions/" + subscription;
- } else {
- return subscription;
- }
- }
-
- /**
- * Returns the string representation of this subscription as a path used in the Cloud Pub/Sub
- * API.
- */
- public String asPath() {
- if (type == Type.NORMAL) {
- return "projects/" + project + "/subscriptions/" + subscription;
- } else {
- return subscription;
- }
- }
- }
-
- /**
- * Class representing a Cloud Pub/Sub Topic.
- */
- public static class PubsubTopic implements Serializable {
- private enum Type { NORMAL, FAKE }
-
- private final Type type;
- private final String project;
- private final String topic;
-
- private PubsubTopic(Type type, String project, String topic) {
- this.type = type;
- this.project = project;
- this.topic = topic;
- }
-
- /**
- * Creates a class representing a Cloud Pub/Sub topic from the specified topic path.
- *
- * <p>Cloud Pub/Sub topic names should be of the form
- * {@code projects/<project>/topics/<topic>}, where {@code <project>} is the name of
- * the publishing project. The {@code <topic>} component must comply with
- * the following requirements:
- *
- * <ul>
- * <li>Can only contain lowercase letters, numbers, dashes ('-'), underscores ('_') and periods
- * ('.').</li>
- * <li>Must be between 3 and 255 characters.</li>
- * <li>Must begin with a letter.</li>
- * <li>Must end with a letter or a number.</li>
- * <li>Cannot begin with 'goog' prefix.</li>
- * </ul>
- */
- public static PubsubTopic fromPath(String path) {
- if (path.equals(TOPIC_DEV_NULL_TEST_NAME)) {
- return new PubsubTopic(Type.FAKE, "", path);
- }
-
- String projectName, topicName;
-
- Matcher v1beta1Match = V1BETA1_TOPIC_REGEXP.matcher(path);
- if (v1beta1Match.matches()) {
- LOG.warn("Saw topic in v1beta1 format. Topics should be in the format "
- + "projects/<project_id>/topics/<topic_name>");
- projectName = v1beta1Match.group(1);
- topicName = v1beta1Match.group(2);
- } else {
- Matcher match = TOPIC_REGEXP.matcher(path);
- if (!match.matches()) {
- throw new IllegalArgumentException(
- "Pubsub topic is not in projects/<project_id>/topics/<topic_name> format: " + path);
- }
- projectName = match.group(1);
- topicName = match.group(2);
- }
-
- validateProjectName(projectName);
- validatePubsubName(topicName);
- return new PubsubTopic(Type.NORMAL, projectName, topicName);
- }
-
- /**
- * Returns the string representation of this topic as a path used in the Cloud Pub/Sub
- * v1beta1 API.
- *
- * @deprecated the v1beta1 API for Cloud Pub/Sub is deprecated.
- */
- @Deprecated
- public String asV1Beta1Path() {
- if (type == Type.NORMAL) {
- return "/topics/" + project + "/" + topic;
- } else {
- return topic;
- }
- }
-
- /**
- * Returns the string representation of this topic as a path used in the Cloud Pub/Sub
- * v1beta2 API.
- *
- * @deprecated the v1beta2 API for Cloud Pub/Sub is deprecated.
- */
- @Deprecated
- public String asV1Beta2Path() {
-   // Non-NORMAL topics have no path form; hand back the stored name unchanged.
-   if (type != Type.NORMAL) {
-     return topic;
-   }
-   return "projects/" + project + "/topics/" + topic;
- }
-
- /**
- * Returns the string representation of this topic as a path used in the Cloud Pub/Sub
- * API.
- */
- public String asPath() {
-   // Only NORMAL topics are rendered as a path; any other type returns the stored name as-is.
-   return (type == Type.NORMAL) ? "projects/" + project + "/topics/" + topic : topic;
- }
- }
-
- /**
- * A {@link PTransform} that continuously reads from a Cloud Pub/Sub stream and
- * returns a {@link PCollection} of {@link String Strings} containing the items from
- * the stream.
- *
- * <p>When running with a {@link PipelineRunner} that only supports bounded
- * {@link PCollection PCollections} (such as {@link DirectPipelineRunner} or
- * {@link DataflowPipelineRunner} without {@code --streaming}), only a bounded portion of the
- * input Pub/Sub stream can be processed. As such, either {@link Bound#maxNumRecords(int)} or
- * {@link Bound#maxReadTime(Duration)} must be set.
- */
- public static class Read {
- /**
- * Creates and returns a transform for reading from Cloud Pub/Sub with the specified transform
- * name.
- */
- public static Bound<String> named(String name) {
-   // Start from the default string-decoding reader, then derive the named copy.
-   final Bound<String> stringReader = new Bound<>(DEFAULT_PUBSUB_CODER);
-   return stringReader.named(name);
- }
-
- /**
- * Creates and returns a transform for reading from a Cloud Pub/Sub topic. Mutually exclusive
- * with {@link #subscription(String)}.
- *
- * <p>See {@link PubsubIO.PubsubTopic#fromPath(String)} for more details on the format
- * of the {@code topic} string.
- *
- * <p>Dataflow will start reading data published on this topic from the time the pipeline is
- * started. Any data published on the topic before the pipeline is started will not be read by
- * Dataflow.
- */
- public static Bound<String> topic(String topic) {
-   // Start from the default string-decoding reader, then derive the copy bound to the topic.
-   final Bound<String> stringReader = new Bound<>(DEFAULT_PUBSUB_CODER);
-   return stringReader.topic(topic);
- }
-
- /**
- * Creates and returns a transform for reading from a specific Cloud Pub/Sub subscription.
- * Mutually exclusive with {@link #topic(String)}.
- *
- * <p>See {@link PubsubIO.PubsubSubscription#fromPath(String)} for more details on the format
- * of the {@code subscription} string.
- */
- public static Bound<String> subscription(String subscription) {
-   // Start from the default string-decoding reader, then derive the subscription-bound copy.
-   final Bound<String> stringReader = new Bound<>(DEFAULT_PUBSUB_CODER);
-   return stringReader.subscription(subscription);
- }
-
- /**
- * Creates and returns a transform reading from Cloud Pub/Sub where record timestamps are
- * expected to be provided as Pub/Sub message attributes. The {@code timestampLabel}
- * parameter specifies the name of the attribute that contains the timestamp.
- *
- * <p>The timestamp value is expected to be represented in the attribute as either:
- *
- * <ul>
- * <li>a numerical value representing the number of milliseconds since the Unix epoch. For
- * example, if using the Joda time classes, {@link Instant#getMillis()} returns the correct
- * value for this attribute.
- * <li>a String in RFC 3339 format. For example, {@code 2015-10-29T23:41:41.123Z}. The
- * sub-second component of the timestamp is optional, and digits beyond the first three
- * (i.e., time units smaller than milliseconds) will be ignored.
- * </ul>
- *
- * <p>If {@code timestampLabel} is not provided, the system will generate record timestamps
- * the first time it sees each record. All windowing will be done relative to these timestamps.
- *
- * <p>By default, windows are emitted based on an estimate of when this source is likely
- * done producing data for a given timestamp (referred to as the Watermark; see
- * {@link AfterWatermark} for more details). Any late data will be handled by the trigger
- * specified with the windowing strategy – by default it will be output immediately.
- *
- * <p>Note that the system can guarantee that no late data will ever be seen when it assigns
- * timestamps by arrival time (i.e. {@code timestampLabel} is not provided).
- *
- * @see <a href="https://www.ietf.org/rfc/rfc3339.txt">RFC 3339</a>
- */
- public static Bound<String> timestampLabel(String timestampLabel) {
-   // Start from the default string-decoding reader, then derive the copy with the label set.
-   final Bound<String> stringReader = new Bound<>(DEFAULT_PUBSUB_CODER);
-   return stringReader.timestampLabel(timestampLabel);
- }
-
- /**
- * Creates and returns a transform for reading from Cloud Pub/Sub where unique record
- * identifiers are expected to be provided as Pub/Sub message attributes. The {@code idLabel}
- * parameter specifies the attribute name. The value of the attribute can be any string
- * that uniquely identifies this record.
- *
- * <p>If {@code idLabel} is not provided, Dataflow cannot guarantee that no duplicate data will
- * be delivered on the Pub/Sub stream. In this case, deduplication of the stream will be
- * strictly best effort.
- */
- public static Bound<String> idLabel(String idLabel) {
-   // Start from the default string-decoding reader, then derive the copy with the label set.
-   final Bound<String> stringReader = new Bound<>(DEFAULT_PUBSUB_CODER);
-   return stringReader.idLabel(idLabel);
- }
-
- /**
- * Creates and returns a transform for reading from Cloud Pub/Sub that uses the given
- * {@link Coder} to decode Pub/Sub messages into a value of type {@code T}.
- *
- * <p>By default, uses {@link StringUtf8Coder}, which just
- * returns the text lines as Java strings.
- *
- * @param <T> the type of the decoded elements, and the elements
- * of the resulting PCollection.
- */
- public static <T> Bound<T> withCoder(Coder<T> coder) {
-   // No other setting is configured yet; only the coder is supplied.
-   final Bound<T> reader = new Bound<>(coder);
-   return reader;
- }
-
- /**
- * Creates and returns a transform for reading from Cloud Pub/Sub with a maximum number of
- * records that will be read. The transform produces a <i>bounded</i> {@link PCollection}.
- *
- * <p>Either this option or {@link #maxReadTime(Duration)} must be set in order to create a
- * bounded source.
- */
- public static Bound<String> maxNumRecords(int maxNumRecords) {
-   // Start from the default string-decoding reader, then derive the record-limited copy.
-   final Bound<String> stringReader = new Bound<>(DEFAULT_PUBSUB_CODER);
-   return stringReader.maxNumRecords(maxNumRecords);
- }
-
- /**
- * Creates and returns a transform for reading from Cloud Pub/Sub with a maximum
- * duration during which records will be read. The transform produces a <i>bounded</i>
- * {@link PCollection}.
- *
- * <p>Either this option or {@link #maxNumRecords(int)} must be set in order to create a bounded
- * source.
- */
- public static Bound<String> maxReadTime(Duration maxReadTime) {
-   // Start from the default string-decoding reader, then derive the time-limited copy.
-   final Bound<String> stringReader = new Bound<>(DEFAULT_PUBSUB_CODER);
-   return stringReader.maxReadTime(maxReadTime);
- }
-
- /**
- * A {@link PTransform} that reads from a Cloud Pub/Sub source and returns
- * an unbounded {@link PCollection} containing the items from the stream.
- */
- public static class Bound<T> extends PTransform<PInput, PCollection<T>> {
- /** The Cloud Pub/Sub topic to read from. */
- @Nullable private final PubsubTopic topic;
-
- /** The Cloud Pub/Sub subscription to read from. */
- @Nullable private final PubsubSubscription subscription;
-
- /** The name of the message attribute to read timestamps from. */
- @Nullable private final String timestampLabel;
-
- /** The name of the message attribute to read unique message IDs from. */
- @Nullable private final String idLabel;
-
- /** The coder used to decode each record. */
- @Nullable private final Coder<T> coder;
-
- /** Stop after reading this many records. */
- private final int maxNumRecords;
-
- /** Stop after reading for this much time. */
- @Nullable private final Duration maxReadTime;
-
- private Bound(Coder<T> coder) {
- this(null, null, null, null, coder, null, 0, null);
- }
-
- private Bound(String name, PubsubSubscription subscription, PubsubTopic topic,
- String timestampLabel, Coder<T> coder, String idLabel, int maxNumRecords,
- Duration maxReadTime) {
- super(name);
- this.subscription = subscription;
- this.topic = topic;
- this.timestampLabel = timestampLabel;
- this.coder = coder;
- this.idLabel = idLabel;
- this.maxNumRecords = maxNumRecords;
- this.maxReadTime = maxReadTime;
- }
-
- /**
- * Returns a transform that's like this one but with the given step name.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> named(String name) {
- return new Bound<>(
- name, subscription, topic, timestampLabel, coder, idLabel, maxNumRecords, maxReadTime);
- }
-
- /**
- * Returns a transform that's like this one but reading from the
- * given subscription.
- *
- * <p>See {@link PubsubIO.PubsubSubscription#fromPath(String)} for more details on the format
- * of the {@code subscription} string.
- *
- * <p>Multiple readers reading from the same subscription will each receive
- * some arbitrary portion of the data. Most likely, separate readers should
- * use their own subscriptions.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> subscription(String subscription) {
-   // Parse the path first, then copy every other setting into the new transform.
-   final PubsubSubscription parsed = PubsubSubscription.fromPath(subscription);
-   return new Bound<>(
-       name, parsed, topic, timestampLabel, coder, idLabel, maxNumRecords, maxReadTime);
- }
-
- /**
- * Returns a transform that's like this one but that reads from the specified topic.
- *
- * <p>See {@link PubsubIO.PubsubTopic#fromPath(String)} for more details on the
- * format of the {@code topic} string.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> topic(String topic) {
-   // Parse the path first, then copy every other setting into the new transform.
-   final PubsubTopic parsed = PubsubTopic.fromPath(topic);
-   return new Bound<>(
-       name, subscription, parsed, timestampLabel, coder, idLabel, maxNumRecords, maxReadTime);
- }
-
- /**
- * Returns a transform that's like this one but that reads message timestamps
- * from the given message attribute. See {@link PubsubIO.Read#timestampLabel(String)} for
- * more details on the format of the timestamp attribute.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> timestampLabel(String timestampLabel) {
- return new Bound<>(
- name, subscription, topic, timestampLabel, coder, idLabel, maxNumRecords, maxReadTime);
- }
-
- /**
- * Returns a transform that's like this one but that reads unique message IDs
- * from the given message attribute. See {@link PubsubIO.Read#idLabel(String)} for more
- * details on the format of the ID attribute.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> idLabel(String idLabel) {
- return new Bound<>(
- name, subscription, topic, timestampLabel, coder, idLabel, maxNumRecords, maxReadTime);
- }
-
- /**
- * Returns a transform that's like this one but that uses the given
- * {@link Coder} to decode each record into a value of type {@code X}.
- *
- * <p>Does not modify this object.
- *
- * @param <X> the type of the decoded elements, and the
- * elements of the resulting PCollection.
- */
- public <X> Bound<X> withCoder(Coder<X> coder) {
- return new Bound<>(
- name, subscription, topic, timestampLabel, coder, idLabel, maxNumRecords, maxReadTime);
- }
-
- /**
- * Returns a transform that's like this one but will only read up to the specified
- * maximum number of records from Cloud Pub/Sub. The transform produces a <i>bounded</i>
- * {@link PCollection}. See {@link PubsubIO.Read#maxNumRecords(int)} for more details.
- */
- public Bound<T> maxNumRecords(int maxNumRecords) {
- return new Bound<>(
- name, subscription, topic, timestampLabel, coder, idLabel, maxNumRecords, maxReadTime);
- }
-
- /**
- * Returns a transform that's like this one but will only read during the specified
- * duration from Cloud Pub/Sub. The transform produces a <i>bounded</i> {@link PCollection}.
- * See {@link PubsubIO.Read#maxReadTime(Duration)} for more details.
- */
- public Bound<T> maxReadTime(Duration maxReadTime) {
- return new Bound<>(
- name, subscription, topic, timestampLabel, coder, idLabel, maxNumRecords, maxReadTime);
- }
-
- @Override
- public PCollection<T> apply(PInput input) {
-   // Exactly one of topic / subscription has to be configured.
-   final boolean hasTopic = topic != null;
-   final boolean hasSubscription = subscription != null;
-   if (!hasTopic && !hasSubscription) {
-     throw new IllegalStateException("need to set either the topic or the subscription for "
-         + "a PubsubIO.Read transform");
-   }
-   if (hasTopic && hasSubscription) {
-     throw new IllegalStateException("Can't set both the topic and the subscription for a "
-         + "PubsubIO.Read transform");
-   }
-
-   // Without a positive record limit or a read-time limit the output is a primitive
-   // unbounded collection.
-   if (getMaxNumRecords() <= 0 && getMaxReadTime() == null) {
-     return PCollection.<T>createPrimitiveOutputInternal(
-             input.getPipeline(), WindowingStrategy.globalDefault(), IsBounded.UNBOUNDED)
-         .setCoder(coder);
-   }
-
-   // Bounded read: a single null seed element drives one PubsubReader invocation.
-   return input.getPipeline().begin()
-       .apply(Create.of((Void) null)).setCoder(VoidCoder.of())
-       .apply(ParDo.of(new PubsubReader())).setCoder(coder);
- }
-
- @Override
- protected Coder<T> getDefaultOutputCoder() {
- return coder;
- }
-
- /** Returns the topic to read from, or {@code null} if none has been set. */
- public PubsubTopic getTopic() {
- return topic;
- }
-
- /** Returns the subscription to read from, or {@code null} if none has been set. */
- public PubsubSubscription getSubscription() {
- return subscription;
- }
-
- /**
-  * Returns the name of the message attribute to read timestamps from, or {@code null} if
-  * none has been set.
-  */
- public String getTimestampLabel() {
- return timestampLabel;
- }
-
- /** Returns the coder used to decode each record. */
- public Coder<T> getCoder() {
- return coder;
- }
-
- /**
-  * Returns the name of the message attribute to read unique message IDs from, or
-  * {@code null} if none has been set.
-  */
- public String getIdLabel() {
- return idLabel;
- }
-
- /** Returns the record limit for this read; the value is 0 unless one has been set. */
- public int getMaxNumRecords() {
- return maxNumRecords;
- }
-
- /** Returns the read-time limit for this read, or {@code null} if none has been set. */
- public Duration getMaxReadTime() {
- return maxReadTime;
- }
-
- /**
-  * Performs a bounded read: pulls messages from a subscription until the record limit or
-  * the read-time limit of the enclosing transform is reached, then emits them.
-  *
-  * <p>When the transform is configured with a topic rather than a subscription, a temporary
-  * subscription is created for the duration of the read and deleted afterwards.
-  *
-  * <p>Messages are acknowledged as they are pulled, before they are emitted.
-  */
- private class PubsubReader extends DoFn<Void, T> {
-   private static final int DEFAULT_PULL_SIZE = 100;
-
-   @Override
-   public void processElement(ProcessContext c) throws IOException {
-     Pubsub pubsubClient =
-         Transport.newPubsubClient(c.getPipelineOptions().as(DataflowPipelineOptions.class))
-             .build();
-
-     // Remember whether the subscription is ours, so that cleanup only ever deletes a
-     // subscription created here and never one supplied by the user.
-     final boolean ownsSubscription = getSubscription() == null;
-     final String subscription = ownsSubscription
-         ? createTemporarySubscription(pubsubClient)
-         : getSubscription().asPath();
-
-     Instant endTime = (getMaxReadTime() == null)
-         ? new Instant(Long.MAX_VALUE) : Instant.now().plus(getMaxReadTime());
-
-     // Only a positive value is a record limit (the same convention apply() uses to decide
-     // whether the read is bounded); zero or negative means "limited by time only".
-     final int recordLimit = Math.max(getMaxNumRecords(), 0);
-
-     List<PubsubMessage> messages = new ArrayList<>();
-
-     RuntimeException deleteFailure = null;
-     try {
-       while ((recordLimit == 0 || messages.size() < recordLimit)
-           && Instant.now().isBefore(endTime)) {
-         PullRequest pullRequest = new PullRequest().setReturnImmediately(false);
-         if (recordLimit > 0) {
-           // Never ask for more than is still needed to reach the limit.
-           pullRequest.setMaxMessages(recordLimit - messages.size());
-         } else {
-           pullRequest.setMaxMessages(DEFAULT_PULL_SIZE);
-         }
-
-         PullResponse pullResponse =
-             pubsubClient.projects().subscriptions().pull(subscription, pullRequest).execute();
-         List<String> ackIds = new ArrayList<>();
-         if (pullResponse.getReceivedMessages() != null) {
-           for (ReceivedMessage received : pullResponse.getReceivedMessages()) {
-             messages.add(received.getMessage());
-             ackIds.add(received.getAckId());
-           }
-         }
-
-         if (!ackIds.isEmpty()) {
-           AcknowledgeRequest ackRequest = new AcknowledgeRequest().setAckIds(ackIds);
-           pubsubClient.projects()
-               .subscriptions()
-               .acknowledge(subscription, ackRequest)
-               .execute();
-         }
-       }
-     } catch (IOException e) {
-       throw new RuntimeException("Unexpected exception while reading from Pubsub: ", e);
-     } finally {
-       if (ownsSubscription) {
-         try {
-           pubsubClient.projects().subscriptions().delete(subscription).execute();
-         } catch (IOException e) {
-           // Do not throw from the finally block: that would mask a failure from the read
-           // loop. Record the problem and raise it after the block instead.
-           deleteFailure = new RuntimeException("Failed to delete subscription: ", e);
-           LOG.error("Failed to delete subscription: ", e);
-         }
-       }
-     }
-     if (deleteFailure != null) {
-       throw deleteFailure;
-     }
-
-     for (PubsubMessage message : messages) {
-       c.outputWithTimestamp(
-           CoderUtils.decodeFromByteArray(getCoder(), message.decodeData()),
-           assignMessageTimestamp(message, getTimestampLabel(), Clock.SYSTEM));
-     }
-   }
-
-   /**
-    * Creates a subscription on the configured topic, named after the topic with a random
-    * suffix, and returns its path.
-    *
-    * @throws RuntimeException if the subscription cannot be created
-    */
-   private String createTemporarySubscription(Pubsub pubsubClient) {
-     String topic = getTopic().asPath();
-     // For a path of the form projects/<project>/topics/<topic>, element 1 is the project
-     // and element 3 is the topic name.
-     String[] split = topic.split("/");
-     String subscription =
-         "projects/" + split[1] + "/subscriptions/" + split[3] + "_dataflow_"
-         + new Random().nextLong();
-     Subscription subInfo = new Subscription().setAckDeadlineSeconds(60).setTopic(topic);
-     try {
-       pubsubClient.projects().subscriptions().create(subscription, subInfo).execute();
-     } catch (Exception e) {
-       throw new RuntimeException("Failed to create subscription: ", e);
-     }
-     return subscription;
-   }
- }
- }
-
- /** Disallow construction of utility class. */
- private Read() {}
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /** Disallow construction of utility class. */
- private PubsubIO() {}
-
- /**
- * A {@link PTransform} that continuously writes a
- * {@link PCollection} of {@link String Strings} to a Cloud Pub/Sub stream.
- */
- // TODO: Support non-String encodings.
- public static class Write {
- /**
- * Creates a transform that writes to Pub/Sub with the given step name.
- */
- public static Bound<String> named(String name) {
-   // Start from the default string-encoding writer, then derive the named copy.
-   final Bound<String> stringWriter = new Bound<>(DEFAULT_PUBSUB_CODER);
-   return stringWriter.named(name);
- }
-
- /**
- * Creates a transform that publishes to the specified topic.
- *
- * <p>See {@link PubsubIO.PubsubTopic#fromPath(String)} for more details on the format of the
- * {@code topic} string.
- */
- public static Bound<String> topic(String topic) {
-   // Start from the default string-encoding writer, then derive the copy bound to the topic.
-   final Bound<String> stringWriter = new Bound<>(DEFAULT_PUBSUB_CODER);
-   return stringWriter.topic(topic);
- }
-
- /**
- * Creates a transform that writes to Pub/Sub, adds each record's timestamp to the published
- * messages in an attribute with the specified name. The value of the attribute will be a number
- * representing the number of milliseconds since the Unix epoch. For example, if using the Joda
- * time classes, {@link Instant#Instant(long)} can be used to parse this value.
- *
- * <p>If the output from this sink is being read by another Dataflow source, then
- * {@link PubsubIO.Read#timestampLabel(String)} can be used to ensure the other source reads
- * these timestamps from the appropriate attribute.
- */
- public static Bound<String> timestampLabel(String timestampLabel) {
-   // Start from the default string-encoding writer, then derive the copy with the label set.
-   final Bound<String> stringWriter = new Bound<>(DEFAULT_PUBSUB_CODER);
-   return stringWriter.timestampLabel(timestampLabel);
- }
-
- /**
- * Creates a transform that writes to Pub/Sub, adding each record's unique identifier to the
- * published messages in an attribute with the specified name. The value of the attribute is an
- * opaque string.
- *
- * <p>If the output from this sink is being read by another Dataflow source, then
- * {@link PubsubIO.Read#idLabel(String)} can be used to ensure that the other source reads
- * these unique identifiers from the appropriate attribute.
- */
- public static Bound<String> idLabel(String idLabel) {
-   // Start from the default string-encoding writer, then derive the copy with the label set.
-   final Bound<String> stringWriter = new Bound<>(DEFAULT_PUBSUB_CODER);
-   return stringWriter.idLabel(idLabel);
- }
-
- /**
- * Creates a transform that uses the given {@link Coder} to encode each of the
- * elements of the input collection into an output message.
- *
- * <p>By default, uses {@link StringUtf8Coder}, which writes input Java strings directly as
- * records.
- *
- * @param <T> the type of the elements of the input PCollection
- */
- public static <T> Bound<T> withCoder(Coder<T> coder) {
-   // No other setting is configured yet; only the coder is supplied.
-   final Bound<T> writer = new Bound<>(coder);
-   return writer;
- }
-
- /**
- * A {@link PTransform} that writes an unbounded {@link PCollection} of {@link String Strings}
- * to a Cloud Pub/Sub stream.
- */
- public static class Bound<T> extends PTransform<PCollection<T>, PDone> {
- /** The Cloud Pub/Sub topic to publish to. */
- @Nullable private final PubsubTopic topic;
- /** The name of the message attribute to publish message timestamps in. */
- @Nullable private final String timestampLabel;
- /** The name of the message attribute to publish unique message IDs in. */
- @Nullable private final String idLabel;
- private final Coder<T> coder;
-
- private Bound(Coder<T> coder) {
- this(null, null, null, null, coder);
- }
-
- private Bound(
- String name, PubsubTopic topic, String timestampLabel, String idLabel, Coder<T> coder) {
- super(name);
- this.topic = topic;
- this.timestampLabel = timestampLabel;
- this.idLabel = idLabel;
- this.coder = coder;
- }
-
- /**
- * Returns a new transform that's like this one but with the specified step
- * name.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> named(String name) {
- return new Bound<>(name, topic, timestampLabel, idLabel, coder);
- }
-
- /**
- * Returns a new transform that's like this one but that writes to the specified
- * topic.
- *
- * <p>See {@link PubsubIO.PubsubTopic#fromPath(String)} for more details on the format of the
- * {@code topic} string.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> topic(String topic) {
-   // Parse the path first, then copy every other setting into the new transform.
-   final PubsubTopic parsed = PubsubTopic.fromPath(topic);
-   return new Bound<>(name, parsed, timestampLabel, idLabel, coder);
- }
-
- /**
- * Returns a new transform that's like this one but that publishes record timestamps
- * to a message attribute with the specified name. See
- * {@link PubsubIO.Write#timestampLabel(String)} for more details.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> timestampLabel(String timestampLabel) {
- return new Bound<>(name, topic, timestampLabel, idLabel, coder);
- }
-
- /**
- * Returns a new transform that's like this one but that publishes unique record IDs
- * to a message attribute with the specified name. See {@link PubsubIO.Write#idLabel(String)}
- * for more details.
- *
- * <p>Does not modify this object.
- */
- public Bound<T> idLabel(String idLabel) {
- return new Bound<>(name, topic, timestampLabel, idLabel, coder);
- }
-
- /**
- * Returns a new transform that's like this one
- * but that uses the given {@link Coder} to encode each of
- * the elements of the input {@link PCollection} into an
- * output record.
- *
- * <p>Does not modify this object.
- *
- * @param <X> the type of the elements of the input {@link PCollection}
- */
- public <X> Bound<X> withCoder(Coder<X> coder) {
- return new Bound<>(name, topic, timestampLabel, idLabel, coder);
- }
-
- @Override
- public PDone apply(PCollection<T> input) {
-   // With a topic configured, attach the writer and signal completion on the same pipeline.
-   if (topic != null) {
-     input.apply(ParDo.of(new PubsubWriter()));
-     return PDone.in(input.getPipeline());
-   }
-   throw new IllegalStateException("need to set the topic of a PubsubIO.Write transform");
- }
-
- @Override
- protected Coder<Void> getDefaultOutputCoder() {
- return VoidCoder.of();
- }
-
- /** Returns the topic to publish to, or {@code null} if none has been set. */
- public PubsubTopic getTopic() {
- return topic;
- }
-
- /**
-  * Returns the name of the message attribute to publish timestamps in, or {@code null} if
-  * none has been set.
-  */
- public String getTimestampLabel() {
- return timestampLabel;
- }
-
- /**
-  * Returns the name of the message attribute to publish unique message IDs in, or
-  * {@code null} if none has been set.
-  */
- public String getIdLabel() {
- return idLabel;
- }
-
- /** Returns the coder used to encode each element. */
- public Coder<T> getCoder() {
- return coder;
- }
-
- /**
-  * Encodes each input element as a Pub/Sub message and publishes the messages to the
-  * configured topic in batches of up to {@code MAX_PUBLISH_BATCH_SIZE}; whatever is still
-  * buffered is published when the bundle finishes.
-  */
- private class PubsubWriter extends DoFn<T, Void> {
-   private static final int MAX_PUBLISH_BATCH_SIZE = 100;
-   private transient List<PubsubMessage> output;
-   private transient Pubsub pubsubClient;
-
-   @Override
-   public void startBundle(Context c) {
-     // Each bundle gets a fresh buffer and a fresh client.
-     this.output = new ArrayList<>();
-     DataflowPipelineOptions options = c.getPipelineOptions().as(DataflowPipelineOptions.class);
-     this.pubsubClient = Transport.newPubsubClient(options).build();
-   }
-
-   @Override
-   public void processElement(ProcessContext c) throws IOException {
-     byte[] payload = CoderUtils.encodeToByteArray(getCoder(), c.element());
-     PubsubMessage message = new PubsubMessage().encodeData(payload);
-
-     // When a timestamp label is configured, attach the element timestamp in epoch millis.
-     String label = getTimestampLabel();
-     if (label != null) {
-       Map<String, String> attributes = message.getAttributes();
-       if (attributes == null) {
-         attributes = new HashMap<>();
-         message.setAttributes(attributes);
-       }
-       attributes.put(label, String.valueOf(c.timestamp().getMillis()));
-     }
-     output.add(message);
-
-     // Keep buffering until a full batch has accumulated.
-     if (output.size() < MAX_PUBLISH_BATCH_SIZE) {
-       return;
-     }
-     publish();
-   }
-
-   @Override
-   public void finishBundle(Context c) throws IOException {
-     if (output.isEmpty()) {
-       return;
-     }
-     publish();
-   }
-
-   /** Publishes all buffered messages in one request, then empties the buffer. */
-   private void publish() throws IOException {
-     PublishRequest publishRequest = new PublishRequest().setMessages(output);
-     String topicPath = getTopic().asPath();
-     pubsubClient.projects().topics().publish(topicPath, publishRequest).execute();
-     output.clear();
-   }
- }
- }
-
- /** Disallow construction of utility class. */
- private Write() {}
- }
-}
[18/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/CoGbkResult.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/CoGbkResult.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/CoGbkResult.java
deleted file mode 100644
index aac57bc..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/CoGbkResult.java
+++ /dev/null
@@ -1,463 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.join;
-
-import static com.google.cloud.dataflow.sdk.util.Structs.addObject;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.IterableCoder;
-import com.google.cloud.dataflow.sdk.coders.StandardCoder;
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.util.common.Reiterator;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.cloud.dataflow.sdk.values.TupleTagList;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Iterators;
-import com.google.common.collect.PeekingIterator;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Objects;
-
-/**
- * A row result of a {@link CoGroupByKey}. This is a tuple of {@link Iterable}s produced for
- * a given key, and these can be accessed in different ways.
- */
-public class CoGbkResult {
- /**
- * A map of integer union tags to a list of union objects.
- * Note: the key and the embedded union tag are the same, so it is redundant
- * to store it multiple times, but for now it makes encoding easier.
- */
- private final List<Iterable<?>> valueMap;
-
- private final CoGbkResultSchema schema;
-
- private static final int DEFAULT_IN_MEMORY_ELEMENT_COUNT = 10_000;
-
- private static final Logger LOG = LoggerFactory.getLogger(CoGbkResult.class);
-
- /**
- * A row in the {@link PCollection} resulting from a {@link CoGroupByKey} transform.
- * Currently, this row must fit into memory.
- *
- * @param schema the set of tuple tags used to refer to input tables and
- * result values
- * @param taggedValues the raw results from a group-by-key
- */
- public CoGbkResult(
- CoGbkResultSchema schema,
- Iterable<RawUnionValue> taggedValues) {
- this(schema, taggedValues, DEFAULT_IN_MEMORY_ELEMENT_COUNT);
- }
-
- /**
-  * Builds a result from the raw tagged values of a group-by-key.
-  *
-  * <p>Values are demultiplexed by union tag into in-memory lists. If the iterator over
-  * {@code taggedValues} is a {@link Reiterator}, only the first {@code inMemoryElementCount}
-  * values are held in memory and the rest are re-read lazily on every iteration; for any
-  * other iterator all values are held in memory.
-  *
-  * @param schema the set of tuple tags used to refer to input tables and result values
-  * @param taggedValues the raw results from a group-by-key
-  * @param inMemoryElementCount the number of leading values to keep in memory when the
-  *     remainder can be re-iterated
-  * @throws IllegalStateException if a value carries a union tag that is negative or has no
-  *     corresponding tuple tag in {@code schema}
-  */
- @SuppressWarnings("unchecked")
- public CoGbkResult(
-     CoGbkResultSchema schema,
-     Iterable<RawUnionValue> taggedValues,
-     int inMemoryElementCount) {
-   this.schema = schema;
-   valueMap = new ArrayList<>();
-   for (int unionTag = 0; unionTag < schema.size(); unionTag++) {
-     valueMap.add(new ArrayList<>());
-   }
-
-   // Demultiplex the first inMemoryElementCount tagged union values
-   // according to their tag.
-   final Iterator<RawUnionValue> taggedIter = taggedValues.iterator();
-   int elementCount = 0;
-   while (taggedIter.hasNext()) {
-     if (elementCount++ >= inMemoryElementCount && taggedIter instanceof Reiterator) {
-       // Let the tails be lazy.
-       break;
-     }
-     RawUnionValue value = taggedIter.next();
-     // Make sure the given union tag has a corresponding tuple tag in the
-     // schema. Negative tags are rejected here as well, so they fail with the same
-     // descriptive error instead of an index error from the list lookup below.
-     int unionTag = value.getUnionTag();
-     if (unionTag < 0 || schema.size() <= unionTag) {
-       throw new IllegalStateException("union tag " + unionTag +
-           " has no corresponding tuple tag in the result schema");
-     }
-     List<Object> valueList = (List<Object>) valueMap.get(unionTag);
-     valueList.add(value.getValue());
-   }
-
-   if (taggedIter.hasNext()) {
-     // If we get here, there were more elements than we can afford to
-     // keep in memory, so we copy the re-iterable of remaining items
-     // and append filtered views to each of the lists computed earlier.
-     LOG.info("CoGbkResult has more than {} elements,"
-         + " reiteration (which may be slow) is required.", inMemoryElementCount);
-     // The loop above only stops early when the iterator is a Reiterator.
-     final Reiterator<RawUnionValue> tail = (Reiterator<RawUnionValue>) taggedIter;
-     // This is a trinary-state array recording whether a given tag is present in the tail. The
-     // initial value is null (unknown) for all tags, and the first iteration through the entire
-     // list will set these values to true or false to avoid needlessly iterating if filtering
-     // against a given tag would not match anything.
-     final Boolean[] containsTag = new Boolean[schema.size()];
-     for (int unionTag = 0; unionTag < schema.size(); unionTag++) {
-       updateUnionTag(tail, containsTag, unionTag, unionTag);
-     }
-   }
- }
-
- /**
-  * Replaces the in-memory list stored for {@code unionTag} with an iterable that yields
-  * that list followed by the values from a fresh copy of {@code tail}, as filtered by a
-  * {@code UnionValueIterator} for {@code unionTag0}.
-  */
- private <T> void updateUnionTag(
-     final Reiterator<RawUnionValue> tail, final Boolean[] containsTag,
-     int unionTag, final int unionTag0) {
-   @SuppressWarnings("unchecked")
-   final Iterable<T> inMemoryHead = (Iterable<T>) valueMap.get(unionTag);
-   final Iterable<T> headThenTail =
-       new Iterable<T>() {
-         @Override
-         public Iterator<T> iterator() {
-           // Every traversal re-reads the tail from its own copy of the reiterator.
-           Iterator<T> headIter = inMemoryHead.iterator();
-           Iterator<T> tailIter =
-               new UnionValueIterator<T>(unionTag0, tail.copy(), containsTag);
-           return Iterators.concat(headIter, tailIter);
-         }
-       };
-   valueMap.set(unionTag, headThenTail);
- }
-
- public boolean isEmpty() {
-   // Stop at the first tag that has at least one value.
-   boolean sawValue = false;
-   Iterator<Iterable<?>> perTag = valueMap.iterator();
-   while (!sawValue && perTag.hasNext()) {
-     sawValue = perTag.next().iterator().hasNext();
-   }
-   return !sawValue;
- }
-
- /**
- * Returns the schema used by this {@link CoGbkResult}.
- */
- public CoGbkResultSchema getSchema() {
- return schema;
- }
-
- @Override
- public String toString() {
- return valueMap.toString();
- }
-
- /**
- * Returns the values from the table represented by the given
- * {@code TupleTag<V>} as an {@code Iterable<V>} (which may be empty if there
- * are no results).
- *
- * <p>If tag was not part of the original {@link CoGroupByKey},
- * throws an IllegalArgumentException.
- */
- public <V> Iterable<V> getAll(TupleTag<V> tag) {
-   final int position = schema.getIndex(tag);
-   if (position >= 0) {
-     @SuppressWarnings("unchecked")
-     Iterable<V> values = (Iterable<V>) valueMap.get(position);
-     return values;
-   }
-   // A negative index means the schema does not know this tag.
-   throw new IllegalArgumentException("TupleTag " + tag + " is not in the schema");
- }
-
- /**
- * If there is a singleton value for the given tag, returns it.
- * Otherwise, throws an IllegalArgumentException.
- *
- * <p>If tag was not part of the original {@link CoGroupByKey},
- * throws an IllegalArgumentException.
- */
- public <V> V getOnly(TupleTag<V> tag) {
- return innerGetOnly(tag, null, false);
- }
-
- /**
- * If there is a singleton value for the given tag, returns it. If there is
- * no value for the given tag, returns the defaultValue.
- *
- * <p>If tag was not part of the original {@link CoGroupByKey},
- * throws an IllegalArgumentException.
- */
- public <V> V getOnly(TupleTag<V> tag, V defaultValue) {
- return innerGetOnly(tag, defaultValue, true);
- }
-
- /**
-  * A {@link Coder} for {@link CoGbkResult}s.
-  *
-  * <p>A result is encoded as one nested iterable per union tag, in tag order, each using the
-  * component coder that the {@link UnionCoder} holds for that tag.
-  */
- public static class CoGbkResultCoder extends StandardCoder<CoGbkResult> {
-
-   private final CoGbkResultSchema schema;
-   private final UnionCoder unionCoder;
-
-   /**
-    * Returns a {@link CoGbkResultCoder} for the given schema and {@link UnionCoder}.
-    */
-   public static CoGbkResultCoder of(
-       CoGbkResultSchema schema,
-       UnionCoder unionCoder) {
-     return new CoGbkResultCoder(schema, unionCoder);
-   }
-
-   /**
-    * JSON factory: rebuilds the coder from its component encodings and schema.
-    *
-    * @param components the component coders; must hold exactly one element, which is used
-    *     as the {@link UnionCoder}
-    * @param schema the schema of the results this coder handles
-    * @throws IllegalArgumentException if {@code components} does not hold exactly one coder
-    */
-   @JsonCreator
-   public static CoGbkResultCoder of(
-       @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
-       List<Coder<?>> components,
-       @JsonProperty(PropertyNames.CO_GBK_RESULT_SCHEMA) CoGbkResultSchema schema) {
-     // The template form defers building the message until the check actually fails.
-     Preconditions.checkArgument(components.size() == 1,
-         "Expecting 1 component, got %s", components.size());
-     return new CoGbkResultCoder(schema, (UnionCoder) components.get(0));
-   }
-
-   private CoGbkResultCoder(
-       CoGbkResultSchema tupleTags,
-       UnionCoder unionCoder) {
-     this.schema = tupleTags;
-     this.unionCoder = unionCoder;
-   }
-
-
-   @Override
-   public List<? extends Coder<?>> getCoderArguments() {
-     return null;
-   }
-
-   /** Returns the single component of this coder, the {@link UnionCoder}. */
-   @Override
-   public List<? extends Coder<?>> getComponents() {
-     return Arrays.<Coder<?>>asList(unionCoder);
-   }
-
-   /** Adds the result schema to the cloud representation produced by the superclass. */
-   @Override
-   public CloudObject asCloudObject() {
-     CloudObject result = super.asCloudObject();
-     addObject(result, PropertyNames.CO_GBK_RESULT_SCHEMA, schema.asCloudObject());
-     return result;
-   }
-
-   /**
-    * Writes the values of each union tag, in tag order, as a nested iterable.
-    *
-    * @throws CoderException if {@code value} is null or its schema differs from this
-    *     coder's schema
-    */
-   @Override
-   @SuppressWarnings("unchecked")
-   public void encode(
-       CoGbkResult value,
-       OutputStream outStream,
-       Context context) throws CoderException,
-       IOException {
-     // Report a null value as a coding error rather than failing on the dereference below.
-     if (value == null) {
-       throw new CoderException("cannot encode a null CoGbkResult");
-     }
-     if (!schema.equals(value.getSchema())) {
-       throw new CoderException("input schema does not match coder schema");
-     }
-     for (int unionTag = 0; unionTag < schema.size(); unionTag++) {
-       tagListCoder(unionTag).encode(value.valueMap.get(unionTag), outStream, Context.NESTED);
-     }
-   }
-
-   /** Reads one nested iterable per union tag, in tag order, and wraps them in a result. */
-   @Override
-   public CoGbkResult decode(
-       InputStream inStream,
-       Context context)
-       throws CoderException, IOException {
-     List<Iterable<?>> valueMap = new ArrayList<>();
-     for (int unionTag = 0; unionTag < schema.size(); unionTag++) {
-       valueMap.add(tagListCoder(unionTag).decode(inStream, Context.NESTED));
-     }
-     return new CoGbkResult(schema, valueMap);
-   }
-
-   /** Returns an iterable coder over the union coder's component for {@code unionTag}. */
-   @SuppressWarnings("rawtypes")
-   private IterableCoder tagListCoder(int unionTag) {
-     return IterableCoder.of(unionCoder.getComponents().get(unionTag));
-   }
-
-   @Override
-   public boolean equals(Object object) {
-     if (this == object) {
-       return true;
-     }
-     if (!(object instanceof CoGbkResultCoder)) {
-       return false;
-     }
-     CoGbkResultCoder other = (CoGbkResultCoder) object;
-     return schema.equals(other.schema) && unionCoder.equals(other.unionCoder);
-   }
-
-   @Override
-   public int hashCode() {
-     // Hash the same two fields that equals() compares.
-     return Objects.hash(schema, unionCoder);
-   }
-
-   @Override
-   public void verifyDeterministic() throws NonDeterministicException {
-     verifyDeterministic(
-         "CoGbkResult requires the union coder to be deterministic", unionCoder);
-   }
- }
-
-
- //////////////////////////////////////////////////////////////////////////////
- // Methods for directly constructing a CoGbkResult
- //
- // (for example, creating test data for a transform that consumes a
- // CoGbkResult)
-
- /**
- * Returns a new CoGbkResult that contains just the given tag and given data.
- */
- public static <V> CoGbkResult of(TupleTag<V> tag, List<V> data) {
- return CoGbkResult.empty().and(tag, data);
- }
-
- /**
- * Returns a new {@link CoGbkResult} based on this, with the given tag and given data
- * added to it.
- */
- public <V> CoGbkResult and(TupleTag<V> tag, List<V> data) {
- if (nextTestUnionId != schema.size()) {
- throw new IllegalArgumentException(
- "Attempting to call and() on a CoGbkResult apparently not created by"
- + " of().");
- }
- List<Iterable<?>> valueMap = new ArrayList<>(this.valueMap);
- valueMap.add(data);
- return new CoGbkResult(
- new CoGbkResultSchema(schema.getTupleTagList().and(tag)), valueMap,
- nextTestUnionId + 1);
- }
-
- /**
- * Returns an empty {@link CoGbkResult}.
- */
- public static <V> CoGbkResult empty() {
- return new CoGbkResult(new CoGbkResultSchema(TupleTagList.empty()),
- new ArrayList<Iterable<?>>());
- }
-
- //////////////////////////////////////////////////////////////////////////////
-
- private int nextTestUnionId = 0;
-
- private CoGbkResult(
- CoGbkResultSchema schema,
- List<Iterable<?>> valueMap,
- int nextTestUnionId) {
- this(schema, valueMap);
- this.nextTestUnionId = nextTestUnionId;
- }
-
- private CoGbkResult(
- CoGbkResultSchema schema,
- List<Iterable<?>> valueMap) {
- this.schema = schema;
- this.valueMap = valueMap;
- }
-
- private <V> V innerGetOnly(
- TupleTag<V> tag,
- V defaultValue,
- boolean useDefault) {
- int index = schema.getIndex(tag);
- if (index < 0) {
- throw new IllegalArgumentException("TupleTag " + tag
- + " is not in the schema");
- }
- @SuppressWarnings("unchecked")
- Iterator<V> unions = (Iterator<V>) valueMap.get(index).iterator();
- if (!unions.hasNext()) {
- if (useDefault) {
- return defaultValue;
- } else {
- throw new IllegalArgumentException("TupleTag " + tag
- + " corresponds to an empty result, and no default was provided");
- }
- }
- V value = unions.next();
- if (unions.hasNext()) {
- throw new IllegalArgumentException("TupleTag " + tag
- + " corresponds to a non-singleton result");
- }
- return value;
- }
-
- /**
- * Lazily filters and recasts an {@code Iterator<RawUnionValue>} into an
- * {@code Iterator<V>}, where V is the type of the raw union value's contents.
- */
- private static class UnionValueIterator<V> implements Iterator<V> {
-
- private final int tag;
- private final PeekingIterator<RawUnionValue> unions;
- private final Boolean[] containsTag;
-
- private UnionValueIterator(int tag, Iterator<RawUnionValue> unions, Boolean[] containsTag) {
- this.tag = tag;
- this.unions = Iterators.peekingIterator(unions);
- this.containsTag = containsTag;
- }
-
- @Override
- public boolean hasNext() {
- if (containsTag[tag] == Boolean.FALSE) {
- return false;
- }
- advance();
- if (unions.hasNext()) {
- return true;
- } else {
- // Now that we've iterated over all the values, we can resolve all the "unknown" null
- // values to false.
- for (int i = 0; i < containsTag.length; i++) {
- if (containsTag[i] == null) {
- containsTag[i] = false;
- }
- }
- return false;
- }
- }
-
- @Override
- @SuppressWarnings("unchecked")
- public V next() {
- advance();
- return (V) unions.next().getValue();
- }
-
- private void advance() {
- while (unions.hasNext()) {
- int curTag = unions.peek().getUnionTag();
- containsTag[curTag] = true;
- if (curTag == tag) {
- break;
- }
- unions.next();
- }
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/CoGbkResultSchema.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/CoGbkResultSchema.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/CoGbkResultSchema.java
deleted file mode 100644
index 2860ba7..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/CoGbkResultSchema.java
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.join;
-
-import static com.google.cloud.dataflow.sdk.util.Structs.addList;
-
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.cloud.dataflow.sdk.values.TupleTagList;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-
-/**
- * A schema for the results of a {@link CoGroupByKey}. This maintains the full
- * set of {@link TupleTag}s for the results of a {@link CoGroupByKey} and
- * facilitates mapping between {@link TupleTag}s and
- * {@link RawUnionValue} tags (which are used as secondary keys in the
- * {@link CoGroupByKey}).
- */
-public class CoGbkResultSchema implements Serializable {
-
- private final TupleTagList tupleTagList;
-
- @JsonCreator
- public static CoGbkResultSchema of(
- @JsonProperty(PropertyNames.TUPLE_TAGS) List<TupleTag<?>> tags) {
- TupleTagList tupleTags = TupleTagList.empty();
- for (TupleTag<?> tag : tags) {
- tupleTags = tupleTags.and(tag);
- }
- return new CoGbkResultSchema(tupleTags);
- }
-
- /**
- * Maps TupleTags to union tags. This avoids needing to encode the tags
- * themselves.
- */
- private final HashMap<TupleTag<?>, Integer> tagMap = new HashMap<>();
-
- /**
- * Builds a schema from a tuple of {@code TupleTag<?>}s.
- */
- public CoGbkResultSchema(TupleTagList tupleTagList) {
- this.tupleTagList = tupleTagList;
- int index = -1;
- for (TupleTag<?> tag : tupleTagList.getAll()) {
- index++;
- tagMap.put(tag, index);
- }
- }
-
- /**
- * Returns the index for the given tuple tag, if the tag is present in this
- * schema, -1 if it isn't.
- */
- public int getIndex(TupleTag<?> tag) {
- Integer index = tagMap.get(tag);
- return index == null ? -1 : index;
- }
-
- /**
- * Returns the tuple tag at the given index.
- */
- public TupleTag<?> getTag(int index) {
- return tupleTagList.get(index);
- }
-
- /**
- * Returns the number of columns for this schema.
- */
- public int size() {
- return tupleTagList.getAll().size();
- }
-
- /**
- * Returns the TupleTagList tuple associated with this schema.
- */
- public TupleTagList getTupleTagList() {
- return tupleTagList;
- }
-
- public CloudObject asCloudObject() {
- CloudObject result = CloudObject.forClass(getClass());
- List<CloudObject> serializedTags = new ArrayList<>(tupleTagList.size());
- for (TupleTag<?> tag : tupleTagList.getAll()) {
- serializedTags.add(tag.asCloudObject());
- }
- addList(result, PropertyNames.TUPLE_TAGS, serializedTags);
- return result;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj == this) {
- return true;
- }
- if (!(obj instanceof CoGbkResultSchema)) {
- return false;
- }
- CoGbkResultSchema other = (CoGbkResultSchema) obj;
- return tupleTagList.getAll().equals(other.tupleTagList.getAll());
- }
-
- @Override
- public int hashCode() {
- return tupleTagList.getAll().hashCode();
- }
-
- @Override
- public String toString() {
- return "CoGbkResultSchema: " + tupleTagList.getAll();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/CoGroupByKey.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/CoGroupByKey.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/CoGroupByKey.java
deleted file mode 100644
index b840682..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/CoGroupByKey.java
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.join;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.KvCoder;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.Flatten;
-import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.join.CoGbkResult.CoGbkResultCoder;
-import com.google.cloud.dataflow.sdk.transforms.join.KeyedPCollectionTuple.TaggedKeyedPCollection;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionList;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * A {@link PTransform} that performs a {@link CoGroupByKey} on a tuple
- * of tables. A {@link CoGroupByKey} groups results from all
- * tables by like keys into {@link CoGbkResult}s,
- * from which the results for any specific table can be accessed by the
- * {@link com.google.cloud.dataflow.sdk.values.TupleTag}
- * supplied with the initial table.
- *
- * <p>Example of performing a {@link CoGroupByKey} followed by a
- * {@link ParDo} that consumes
- * the results:
- * <pre> {@code
- * PCollection<KV<K, V1>> pt1 = ...;
- * PCollection<KV<K, V2>> pt2 = ...;
- *
- * final TupleTag<V1> t1 = new TupleTag<>();
- * final TupleTag<V2> t2 = new TupleTag<>();
- * PCollection<KV<K, CoGbkResult>> coGbkResultCollection =
- * KeyedPCollectionTuple.of(t1, pt1)
- * .and(t2, pt2)
- * .apply(CoGroupByKey.<K>create());
- *
- * PCollection<T> finalResultCollection =
- * coGbkResultCollection.apply(ParDo.of(
- * new DoFn<KV<K, CoGbkResult>, T>() {
- * @Override
- * public void processElement(ProcessContext c) {
- * KV<K, CoGbkResult> e = c.element();
- * Iterable<V1> pt1Vals = e.getValue().getAll(t1);
- * V2 pt2Val = e.getValue().getOnly(t2);
- * ... Do Something ....
- * c.output(...some T...);
- * }
- * }));
- * } </pre>
- *
- * @param <K> the type of the keys in the input and output
- * {@code PCollection}s
- */
-public class CoGroupByKey<K> extends
- PTransform<KeyedPCollectionTuple<K>,
- PCollection<KV<K, CoGbkResult>>> {
- /**
- * Returns a {@code CoGroupByKey<K>} {@code PTransform}.
- *
- * @param <K> the type of the keys in the input and output
- * {@code PCollection}s
- */
- public static <K> CoGroupByKey<K> create() {
- return new CoGroupByKey<>();
- }
-
- private CoGroupByKey() { }
-
- @Override
- public PCollection<KV<K, CoGbkResult>> apply(
- KeyedPCollectionTuple<K> input) {
- if (input.isEmpty()) {
- throw new IllegalArgumentException(
- "must have at least one input to a KeyedPCollections");
- }
-
- // First build the union coder.
- // TODO: Look at better integration of union types with the
- // schema specified in the input.
- List<Coder<?>> codersList = new ArrayList<>();
- for (TaggedKeyedPCollection<K, ?> entry : input.getKeyedCollections()) {
- codersList.add(getValueCoder(entry.pCollection));
- }
- UnionCoder unionCoder = UnionCoder.of(codersList);
- Coder<K> keyCoder = input.getKeyCoder();
- KvCoder<K, RawUnionValue> kVCoder =
- KvCoder.of(keyCoder, unionCoder);
-
- PCollectionList<KV<K, RawUnionValue>> unionTables =
- PCollectionList.empty(input.getPipeline());
-
- // TODO: Use the schema to order the indices rather than depending
- // on the fact that the schema ordering is identical to the ordering from
- // input.getJoinCollections().
- int index = -1;
- for (TaggedKeyedPCollection<K, ?> entry : input.getKeyedCollections()) {
- index++;
- PCollection<KV<K, RawUnionValue>> unionTable =
- makeUnionTable(index, entry.pCollection, kVCoder);
- unionTables = unionTables.and(unionTable);
- }
-
- PCollection<KV<K, RawUnionValue>> flattenedTable =
- unionTables.apply(Flatten.<KV<K, RawUnionValue>>pCollections());
-
- PCollection<KV<K, Iterable<RawUnionValue>>> groupedTable =
- flattenedTable.apply(GroupByKey.<K, RawUnionValue>create());
-
- CoGbkResultSchema tupleTags = input.getCoGbkResultSchema();
- PCollection<KV<K, CoGbkResult>> result = groupedTable.apply(
- ParDo.of(new ConstructCoGbkResultFn<K>(tupleTags))
- .named("ConstructCoGbkResultFn"));
- result.setCoder(KvCoder.of(keyCoder,
- CoGbkResultCoder.of(tupleTags, unionCoder)));
-
- return result;
- }
-
- //////////////////////////////////////////////////////////////////////////////
-
- /**
- * Returns the value coder for the given PCollection. Assumes that the value
- * coder is an instance of {@code KvCoder<K, V>}.
- */
- private <V> Coder<V> getValueCoder(PCollection<KV<K, V>> pCollection) {
- // Assumes that the PCollection uses a KvCoder.
- Coder<?> entryCoder = pCollection.getCoder();
- if (!(entryCoder instanceof KvCoder<?, ?>)) {
- throw new IllegalArgumentException("PCollection does not use a KvCoder");
- }
- @SuppressWarnings("unchecked")
- KvCoder<K, V> coder = (KvCoder<K, V>) entryCoder;
- return coder.getValueCoder();
- }
-
- /**
- * Returns a UnionTable for the given input PCollection, using the given
- * union index and the given unionTableEncoder.
- */
- private <V> PCollection<KV<K, RawUnionValue>> makeUnionTable(
- final int index,
- PCollection<KV<K, V>> pCollection,
- KvCoder<K, RawUnionValue> unionTableEncoder) {
-
- return pCollection.apply(ParDo.of(
- new ConstructUnionTableFn<K, V>(index)).named("MakeUnionTable" + index))
- .setCoder(unionTableEncoder);
- }
-
- /**
- * A DoFn to construct a UnionTable (i.e., a
- * {@code PCollection<KV<K, RawUnionValue>>} from a
- * {@code PCollection<KV<K, V>>}.
- */
- private static class ConstructUnionTableFn<K, V> extends
- DoFn<KV<K, V>, KV<K, RawUnionValue>> {
-
- private final int index;
-
- public ConstructUnionTableFn(int index) {
- this.index = index;
- }
-
- @Override
- public void processElement(ProcessContext c) {
- KV<K, ?> e = c.element();
- c.output(KV.of(e.getKey(), new RawUnionValue(index, e.getValue())));
- }
- }
-
- /**
- * A DoFn to construct a CoGbkResult from an input grouped union
- * table.
- */
- private static class ConstructCoGbkResultFn<K>
- extends DoFn<KV<K, Iterable<RawUnionValue>>,
- KV<K, CoGbkResult>> {
-
- private final CoGbkResultSchema schema;
-
- public ConstructCoGbkResultFn(CoGbkResultSchema schema) {
- this.schema = schema;
- }
-
- @Override
- public void processElement(ProcessContext c) {
- KV<K, Iterable<RawUnionValue>> e = c.element();
- c.output(KV.of(e.getKey(), new CoGbkResult(schema, e.getValue())));
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/KeyedPCollectionTuple.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/KeyedPCollectionTuple.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/KeyedPCollectionTuple.java
deleted file mode 100644
index abfbe08..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/KeyedPCollectionTuple.java
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.join;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.KvCoder;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.POutput;
-import com.google.cloud.dataflow.sdk.values.PValue;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.cloud.dataflow.sdk.values.TupleTagList;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-
-/**
- * An immutable tuple of keyed {@link PCollection PCollections}
- * with key type K.
- * ({@link PCollection PCollections} containing values of type
- * {@code KV<K, ?>})
- *
- * @param <K> the type of key shared by all constituent PCollections
- */
-public class KeyedPCollectionTuple<K> implements PInput {
- /**
- * Returns an empty {@code KeyedPCollectionTuple<K>} on the given pipeline.
- */
- public static <K> KeyedPCollectionTuple<K> empty(Pipeline pipeline) {
- return new KeyedPCollectionTuple<>(pipeline);
- }
-
- /**
- * Returns a new {@code KeyedPCollectionTuple<K>} with the given tag and initial
- * PCollection.
- */
- public static <K, InputT> KeyedPCollectionTuple<K> of(
- TupleTag<InputT> tag,
- PCollection<KV<K, InputT>> pc) {
- return new KeyedPCollectionTuple<K>(pc.getPipeline()).and(tag, pc);
- }
-
- /**
- * Returns a new {@code KeyedPCollectionTuple<K>} that is the same as this,
- * appended with the given PCollection.
- */
- public <V> KeyedPCollectionTuple<K> and(
- TupleTag< V> tag,
- PCollection<KV<K, V>> pc) {
- if (pc.getPipeline() != getPipeline()) {
- throw new IllegalArgumentException(
- "PCollections come from different Pipelines");
- }
- TaggedKeyedPCollection<K, ?> wrapper =
- new TaggedKeyedPCollection<>(tag, pc);
- Coder<K> myKeyCoder = keyCoder == null ? getKeyCoder(pc) : keyCoder;
- List<TaggedKeyedPCollection<K, ?>>
- newKeyedCollections =
- copyAddLast(
- keyedCollections,
- wrapper);
- return new KeyedPCollectionTuple<>(
- getPipeline(),
- newKeyedCollections,
- schema.getTupleTagList().and(tag),
- myKeyCoder);
- }
-
- public boolean isEmpty() {
- return keyedCollections.isEmpty();
- }
-
- /**
- * Returns a list of {@link TaggedKeyedPCollection TaggedKeyedPCollections} for the
- * {@link PCollection PCollections} contained in this {@link KeyedPCollectionTuple}.
- */
- public List<TaggedKeyedPCollection<K, ?>> getKeyedCollections() {
- return keyedCollections;
- }
-
- /**
- * Like {@link #apply(String, PTransform)} but defaulting to the name
- * provided by the {@link PTransform}.
- */
- public <OutputT extends POutput> OutputT apply(
- PTransform<KeyedPCollectionTuple<K>, OutputT> transform) {
- return Pipeline.applyTransform(this, transform);
- }
-
- /**
- * Applies the given {@link PTransform} to this input {@code KeyedPCollectionTuple} and returns
- * its {@code OutputT}. This uses {@code name} to identify the specific application of
- * the transform. This name is used in various places, including the monitoring UI,
- * logging, and to stably identify this application node in the job graph.
- */
- public <OutputT extends POutput> OutputT apply(
- String name, PTransform<KeyedPCollectionTuple<K>, OutputT> transform) {
- return Pipeline.applyTransform(name, this, transform);
- }
-
- /**
- * Expands the component {@link PCollection PCollections}, stripping off
- * any tag-specific information.
- */
- @Override
- public Collection<? extends PValue> expand() {
- List<PCollection<?>> retval = new ArrayList<>();
- for (TaggedKeyedPCollection<K, ?> taggedPCollection : keyedCollections) {
- retval.add(taggedPCollection.pCollection);
- }
- return retval;
- }
-
- /**
- * Returns the key {@link Coder} for all {@link PCollection PCollections}
- * in this {@link KeyedPCollectionTuple}.
- */
- public Coder<K> getKeyCoder() {
- if (keyCoder == null) {
- throw new IllegalStateException("cannot return null keyCoder");
- }
- return keyCoder;
- }
-
- /**
- * Returns the {@link CoGbkResultSchema} associated with this
- * {@link KeyedPCollectionTuple}.
- */
- public CoGbkResultSchema getCoGbkResultSchema() {
- return schema;
- }
-
- @Override
- public Pipeline getPipeline() {
- return pipeline;
- }
-
- @Override
- public void finishSpecifying() {
- for (TaggedKeyedPCollection<K, ?> taggedPCollection : keyedCollections) {
- taggedPCollection.pCollection.finishSpecifying();
- }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A utility class to help ensure coherence of tag and input PCollection
- * types.
- */
- public static class TaggedKeyedPCollection<K, V> {
-
- final TupleTag<V> tupleTag;
- final PCollection<KV<K, V>> pCollection;
-
- public TaggedKeyedPCollection(
- TupleTag<V> tupleTag,
- PCollection<KV<K, V>> pCollection) {
- this.tupleTag = tupleTag;
- this.pCollection = pCollection;
- }
-
- /**
- * Returns the underlying PCollection of this TaggedKeyedPCollection.
- */
- public PCollection<KV<K, V>> getCollection() {
- return pCollection;
- }
-
- /**
- * Returns the TupleTag of this TaggedKeyedPCollection.
- */
- public TupleTag<V> getTupleTag() {
- return tupleTag;
- }
- }
-
- /**
- * We use a List to properly track the order in which collections are added.
- */
- private final List<TaggedKeyedPCollection<K, ?>> keyedCollections;
-
- private final Coder<K> keyCoder;
-
- private final CoGbkResultSchema schema;
-
- private final Pipeline pipeline;
-
- KeyedPCollectionTuple(Pipeline pipeline) {
- this(pipeline,
- new ArrayList<TaggedKeyedPCollection<K, ?>>(),
- TupleTagList.empty(),
- null);
- }
-
- KeyedPCollectionTuple(
- Pipeline pipeline,
- List<TaggedKeyedPCollection<K, ?>> keyedCollections,
- TupleTagList tupleTagList,
- Coder<K> keyCoder) {
- this.pipeline = pipeline;
- this.keyedCollections = keyedCollections;
- this.schema = new CoGbkResultSchema(tupleTagList);
- this.keyCoder = keyCoder;
- }
-
- private static <K, V> Coder<K> getKeyCoder(PCollection<KV<K, V>> pc) {
- // Need to run coder inference on this PCollection before inspecting it.
- pc.finishSpecifying();
-
- // Assumes that the PCollection uses a KvCoder.
- Coder<?> entryCoder = pc.getCoder();
- if (!(entryCoder instanceof KvCoder<?, ?>)) {
- throw new IllegalArgumentException("PCollection does not use a KvCoder");
- }
- @SuppressWarnings("unchecked")
- KvCoder<K, V> coder = (KvCoder<K, V>) entryCoder;
- return coder.getKeyCoder();
- }
-
- private static <K> List<TaggedKeyedPCollection<K, ?>> copyAddLast(
- List<TaggedKeyedPCollection<K, ?>> keyedCollections,
- TaggedKeyedPCollection<K, ?> taggedCollection) {
- List<TaggedKeyedPCollection<K, ?>> retval =
- new ArrayList<>(keyedCollections);
- retval.add(taggedCollection);
- return retval;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/RawUnionValue.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/RawUnionValue.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/RawUnionValue.java
deleted file mode 100644
index 514853e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/RawUnionValue.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.join;
-
-// TODO: Think about making this a complete dynamic union by adding
-// a schema. Type would then be defined by the corresponding schema entry.
-
-/**
- * This corresponds to an integer union tag and value. The mapping of
- * union tag to type must come from elsewhere.
- */
-public class RawUnionValue {
- private final int unionTag;
- private final Object value;
-
- /**
- * Constructs a partial union from the given union tag and value.
- */
- public RawUnionValue(int unionTag, Object value) {
- this.unionTag = unionTag;
- this.value = value;
- }
-
- public int getUnionTag() {
- return unionTag;
- }
-
- public Object getValue() {
- return value;
- }
-
- @Override
- public String toString() {
- return unionTag + ":" + value;
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/UnionCoder.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/UnionCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/UnionCoder.java
deleted file mode 100644
index 2f1c2be..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/UnionCoder.java
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.join;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.StandardCoder;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.util.VarInt;
-import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObserver;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.List;
-
-/**
- * A UnionCoder encodes RawUnionValues.
- */
-class UnionCoder extends StandardCoder<RawUnionValue> {
- // TODO: Think about how to integrate this with a schema object (i.e.
- // a tuple of tuple tags).
- /**
- * Builds a union coder with the given list of element coders. This list
- * corresponds to a mapping of union tag to Coder. Union tags start at 0.
- */
- public static UnionCoder of(List<Coder<?>> elementCoders) {
- return new UnionCoder(elementCoders);
- }
-
- @JsonCreator
- public static UnionCoder jsonOf(
- @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
- List<Coder<?>> elements) {
- return UnionCoder.of(elements);
- }
-
- private int getIndexForEncoding(RawUnionValue union) {
- if (union == null) {
- throw new IllegalArgumentException("cannot encode a null tagged union");
- }
- int index = union.getUnionTag();
- if (index < 0 || index >= elementCoders.size()) {
- throw new IllegalArgumentException(
- "union value index " + index + " not in range [0.." +
- (elementCoders.size() - 1) + "]");
- }
- return index;
- }
-
- @SuppressWarnings("unchecked")
- @Override
- public void encode(
- RawUnionValue union,
- OutputStream outStream,
- Context context)
- throws IOException, CoderException {
- int index = getIndexForEncoding(union);
- // Write out the union tag.
- VarInt.encode(index, outStream);
-
- // Write out the actual value.
- Coder<Object> coder = (Coder<Object>) elementCoders.get(index);
- coder.encode(
- union.getValue(),
- outStream,
- context);
- }
-
- @Override
- public RawUnionValue decode(InputStream inStream, Context context)
- throws IOException, CoderException {
- int index = VarInt.decodeInt(inStream);
- Object value = elementCoders.get(index).decode(inStream, context);
- return new RawUnionValue(index, value);
- }
-
- @Override
- public List<? extends Coder<?>> getCoderArguments() {
- return null;
- }
-
- @Override
- public List<? extends Coder<?>> getComponents() {
- return elementCoders;
- }
-
- /**
- * Since this coder uses elementCoders.get(index) and coders that are known to run in constant
- * time, we defer the return value to that coder.
- */
- @Override
- public boolean isRegisterByteSizeObserverCheap(RawUnionValue union, Context context) {
- int index = getIndexForEncoding(union);
- @SuppressWarnings("unchecked")
- Coder<Object> coder = (Coder<Object>) elementCoders.get(index);
- return coder.isRegisterByteSizeObserverCheap(union.getValue(), context);
- }
-
- /**
- * Notifies ElementByteSizeObserver about the byte size of the encoded value using this coder.
- */
- @Override
- public void registerByteSizeObserver(
- RawUnionValue union, ElementByteSizeObserver observer, Context context)
- throws Exception {
- int index = getIndexForEncoding(union);
- // Write out the union tag.
- observer.update(VarInt.getLength(index));
- // Write out the actual value.
- @SuppressWarnings("unchecked")
- Coder<Object> coder = (Coder<Object>) elementCoders.get(index);
- coder.registerByteSizeObserver(union.getValue(), observer, context);
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private final List<Coder<?>> elementCoders;
-
- private UnionCoder(List<Coder<?>> elementCoders) {
- this.elementCoders = elementCoders;
- }
-
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- verifyDeterministic(
- "UnionCoder is only deterministic if all element coders are",
- elementCoders);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/package-info.java
deleted file mode 100644
index be8bffa..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/join/package-info.java
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/**
- * Defines the {@link com.google.cloud.dataflow.sdk.transforms.join.CoGroupByKey} transform
- * for joining multiple PCollections.
- */
-package com.google.cloud.dataflow.sdk.transforms.join;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/package-info.java
deleted file mode 100644
index 3c041f6..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/package-info.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/**
- * Defines {@link com.google.cloud.dataflow.sdk.transforms.PTransform}s for transforming
- * data in a pipeline.
- *
- * <p>A {@link com.google.cloud.dataflow.sdk.transforms.PTransform} is an operation that takes an
- * {@code InputT} (some subtype of {@link com.google.cloud.dataflow.sdk.values.PInput})
- * and produces an
- * {@code OutputT} (some subtype of {@link com.google.cloud.dataflow.sdk.values.POutput}).
- *
- * <p>Common PTransforms include root PTransforms like
- * {@link com.google.cloud.dataflow.sdk.io.TextIO.Read} and
- * {@link com.google.cloud.dataflow.sdk.transforms.Create}, processing and
- * conversion operations like {@link com.google.cloud.dataflow.sdk.transforms.ParDo},
- * {@link com.google.cloud.dataflow.sdk.transforms.GroupByKey},
- * {@link com.google.cloud.dataflow.sdk.transforms.join.CoGroupByKey},
- * {@link com.google.cloud.dataflow.sdk.transforms.Combine}, and
- * {@link com.google.cloud.dataflow.sdk.transforms.Count}, and outputting
- * PTransforms like
- * {@link com.google.cloud.dataflow.sdk.io.TextIO.Write}.
- *
- * <p>New PTransforms can be created by composing existing PTransforms.
- * Most PTransforms in this package are composites, and users can also create composite PTransforms
- * for their own application-specific logic.
- *
- */
-package com.google.cloud.dataflow.sdk.transforms;
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterAll.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterAll.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterAll.java
deleted file mode 100644
index bb43010..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterAll.java
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Trigger.OnceTrigger;
-import com.google.cloud.dataflow.sdk.util.ExecutableTrigger;
-import com.google.common.base.Preconditions;
-
-import org.joda.time.Instant;
-
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * A composite {@link Trigger} that fires once, and then finishes, after every one of its
- * sub-triggers has fired.
- *
- * @param <W> {@link BoundedWindow} subclass used to represent the windows used by this
- *            {@code Trigger}
- */
-@Experimental(Experimental.Kind.TRIGGER)
-public class AfterAll<W extends BoundedWindow> extends OnceTrigger<W> {
-
-  /** Builds the composite; at least two sub-triggers are required. */
-  private AfterAll(List<Trigger<W>> subTriggers) {
-    super(subTriggers);
-    Preconditions.checkArgument(subTriggers.size() > 1);
-  }
-
-  /**
-   * Creates an {@code AfterAll} {@code Trigger} composed of the given sub-triggers.
-   */
-  @SafeVarargs
-  public static <W extends BoundedWindow> OnceTrigger<W> of(
-      OnceTrigger<W>... triggers) {
-    List<Trigger<W>> subTriggerList = Arrays.<Trigger<W>>asList(triggers);
-    return new AfterAll<W>(subTriggerList);
-  }
-
-  @Override
-  public void onElement(OnElementContext c) throws Exception {
-    // Only sub-triggers that have not yet finished need to see the element. Every sub-trigger
-    // is a OnceTrigger, so each one either continues or fires-and-finishes; invokeOnElement
-    // records the finished bit in the latter case.
-    for (ExecutableTrigger<W> pending : c.trigger().unfinishedSubTriggers()) {
-      pending.invokeOnElement(c);
-    }
-  }
-
-  @Override
-  public void onMerge(OnMergeContext c) throws Exception {
-    // First pass: let every sub-trigger merge its own state.
-    for (ExecutableTrigger<W> child : c.trigger().subTriggers()) {
-      child.invokeOnMerge(c);
-    }
-
-    // Second pass: this trigger is finished only if each sub-trigger is finished after merging.
-    boolean everyChildFinished = true;
-    for (ExecutableTrigger<W> child : c.trigger().subTriggers()) {
-      if (!c.forTrigger(child).trigger().isFinished()) {
-        everyChildFinished = false;
-      }
-    }
-    c.trigger().setFinished(everyChildFinished);
-  }
-
-  @Override
-  public Instant getWatermarkThatGuaranteesFiring(W window) {
-    // All sub-triggers must fire, so the guarantee is the latest of their individual guarantees.
-    Instant latest = BoundedWindow.TIMESTAMP_MIN_VALUE;
-    for (Trigger<W> child : subTriggers) {
-      Instant childDeadline = child.getWatermarkThatGuaranteesFiring(window);
-      if (latest.isBefore(childDeadline)) {
-        latest = childDeadline;
-      }
-    }
-    return latest;
-  }
-
-  @Override
-  public OnceTrigger<W> getContinuationTrigger(List<Trigger<W>> continuationTriggers) {
-    return new AfterAll<W>(continuationTriggers);
-  }
-
-  /**
-   * {@inheritDoc}
-   *
-   * @return {@code true} if every sub-trigger is either already finished or ready to fire.
-   */
-  @Override
-  public boolean shouldFire(TriggerContext context) throws Exception {
-    for (ExecutableTrigger<W> child : context.trigger().subTriggers()) {
-      if (context.forTrigger(child).trigger().isFinished()) {
-        // A finished sub-trigger has already done its part.
-        continue;
-      }
-      if (!child.invokeShouldFire(context)) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  /**
-   * Invokes {@link #onFire} on every sub-trigger. {@link #shouldFire} is not consulted again
-   * here because all sub-triggers must already be ready to fire.
-   */
-  @Override
-  public void onOnlyFiring(TriggerContext context) throws Exception {
-    for (ExecutableTrigger<W> child : context.trigger().subTriggers()) {
-      child.invokeOnFire(context);
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterDelayFromFirstElement.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterDelayFromFirstElement.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterDelayFromFirstElement.java
deleted file mode 100644
index 71968e9..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterDelayFromFirstElement.java
+++ /dev/null
@@ -1,322 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.coders.InstantCoder;
-import com.google.cloud.dataflow.sdk.transforms.Combine;
-import com.google.cloud.dataflow.sdk.transforms.Min;
-import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Trigger.OnceTrigger;
-import com.google.cloud.dataflow.sdk.util.TimeDomain;
-import com.google.cloud.dataflow.sdk.util.state.AccumulatorCombiningState;
-import com.google.cloud.dataflow.sdk.util.state.CombiningState;
-import com.google.cloud.dataflow.sdk.util.state.MergingStateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.StateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.StateMerging;
-import com.google.cloud.dataflow.sdk.util.state.StateTag;
-import com.google.cloud.dataflow.sdk.util.state.StateTags;
-import com.google.common.collect.ImmutableList;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-
-import java.util.List;
-import java.util.Objects;
-
-import javax.annotation.Nullable;
-
-/**
- * A base class for triggers that happen after a processing time delay from the arrival
- * of the first element in a pane.
- *
- * <p>The target time is computed by running the current processing time through the
- * configured list of timestamp mappers (see {@link #computeTargetTimestamp}). The earliest
- * target time seen is kept in state and a timer is set for it in the configured
- * {@link TimeDomain}.
- *
- * <p>This class is for internal use only and may change at any time.
- */
-@Experimental(Experimental.Kind.TRIGGER)
-public abstract class AfterDelayFromFirstElement<W extends BoundedWindow> extends OnceTrigger<W> {
-
-  /** The empty mapper list: the target timestamp is the input timestamp itself. */
-  protected static final List<SerializableFunction<Instant, Instant>> IDENTITY =
-      ImmutableList.<SerializableFunction<Instant, Instant>>of();
-
-  /** State holding the minimum target timestamp recorded so far for the pane. */
-  protected static final StateTag<Object, AccumulatorCombiningState<Instant,
-                                              Combine.Holder<Instant>, Instant>> DELAYED_UNTIL_TAG =
-      StateTags.makeSystemTagInternal(StateTags.combiningValueFromInputInternal(
-          "delayed", InstantCoder.of(), Min.MinFn.<Instant>naturalOrder()));
-
-  /**
-   * To complete an implementation, return the desired time from the TriggerContext.
-   *
-   * @return the current time, or {@code null} if it is not available; {@link #shouldFire}
-   *     treats {@code null} as "not ready to fire"
-   */
-  @Nullable
-  public abstract Instant getCurrentTime(Trigger<W>.TriggerContext context);
-
-  /**
-   * To complete an implementation, return a new instance like this one, but incorporating
-   * the provided timestamp mapping functions. Generally should be used by calling the
-   * constructor of this class from the constructor of the subclass.
-   */
-  protected abstract AfterDelayFromFirstElement<W> newWith(
-      List<SerializableFunction<Instant, Instant>> transform);
-
-  /**
-   * A list of timestampMappers m1, m2, m3, ... m_n considered to be composed in sequence. The
-   * overall mapping for an instant {@code instant} is {@code m_n(... m3(m2(m1(instant))))},
-   * implemented via {@link #computeTargetTimestamp}.
-   */
-  protected final List<SerializableFunction<Instant, Instant>> timestampMappers;
-
-  /** The time domain in which this trigger sets its timers. */
-  private final TimeDomain timeDomain;
-
-  /**
-   * @param timeDomain the domain used for the timers set by this trigger
-   * @param timestampMappers the mappers applied, in order, to compute the target timestamp
-   */
-  public AfterDelayFromFirstElement(
-      TimeDomain timeDomain,
-      List<SerializableFunction<Instant, Instant>> timestampMappers) {
-    // This trigger has no sub-triggers.
-    super(null);
-    this.timestampMappers = timestampMappers;
-    this.timeDomain = timeDomain;
-  }
-
-  /** Maps the context's current processing time through the configured mappers. */
-  private Instant getTargetTimestamp(OnElementContext c) {
-    return computeTargetTimestamp(c.currentProcessingTime());
-  }
-
-  /**
-   * Aligns timestamps to the smallest multiple of {@code size} since the {@code offset} that is
-   * greater than or equal to the timestamp.
-   *
-   * <p>TODO: Consider sharing this with FixedWindows, and bring over the equivalent of
-   * CalendarWindows.
-   */
-  public AfterDelayFromFirstElement<W> alignedTo(final Duration size, final Instant offset) {
-    return newWith(new AlignFn(size, offset));
-  }
-
-  /**
-   * Aligns the time to be the smallest multiple of {@code size} since the epoch that is greater
-   * than or equal to the timestamp.
-   */
-  public AfterDelayFromFirstElement<W> alignedTo(final Duration size) {
-    return alignedTo(size, new Instant(0));
-  }
-
-  /**
-   * Adds some delay to the original target time.
-   *
-   * @param delay the delay to add
-   * @return An updated time trigger that will wait the additional time before firing.
-   */
-  public AfterDelayFromFirstElement<W> plusDelayOf(final Duration delay) {
-    return newWith(new DelayFn(delay));
-  }
-
-  /**
-   * @deprecated This will be removed in the next major version. Please use only
-   *             {@link #plusDelayOf} and {@link #alignedTo}.
-   */
-  @Deprecated
-  public OnceTrigger<W> mappedTo(SerializableFunction<Instant, Instant> timestampMapper) {
-    return newWith(timestampMapper);
-  }
-
-  /**
-   * Two triggers are compatible when they are of exactly the same class and have equal
-   * timestamp mapper lists.
-   */
-  @Override
-  public boolean isCompatible(Trigger<?> other) {
-    if (!getClass().equals(other.getClass())) {
-      return false;
-    }
-
-    AfterDelayFromFirstElement<?> that = (AfterDelayFromFirstElement<?>) other;
-    return this.timestampMappers.equals(that.timestampMappers);
-  }
-
-  /** Returns a copy of this trigger with {@code timestampMapper} appended to the mapper list. */
-  private AfterDelayFromFirstElement<W> newWith(
-      SerializableFunction<Instant, Instant> timestampMapper) {
-    return newWith(
-        ImmutableList.<SerializableFunction<Instant, Instant>>builder()
-            .addAll(timestampMappers)
-            .add(timestampMapper)
-            .build());
-  }
-
-  @Override
-  public void prefetchOnElement(StateAccessor<?> state) {
-    state.access(DELAYED_UNTIL_TAG).readLater();
-  }
-
-  @Override
-  public void onElement(OnElementContext c) throws Exception {
-    CombiningState<Instant, Instant> delayUntilState = c.state().access(DELAYED_UNTIL_TAG);
-    Instant oldDelayUntil = delayUntilState.read();
-
-    // Since processing time can only advance, resulting in target wake-up times we would
-    // ignore anyhow, we don't bother with it if it is already set.
-    if (oldDelayUntil != null) {
-      return;
-    }
-
-    Instant targetTimestamp = getTargetTimestamp(c);
-    delayUntilState.add(targetTimestamp);
-    c.setTimer(targetTimestamp, timeDomain);
-  }
-
-  @Override
-  public void prefetchOnMerge(MergingStateAccessor<?, W> state) {
-    super.prefetchOnMerge(state);
-    StateMerging.prefetchCombiningValues(state, DELAYED_UNTIL_TAG);
-  }
-
-  @Override
-  public void onMerge(OnMergeContext c) throws Exception {
-    // NOTE: We could try to delete all timers which are still active, but that would require
-    // access to a timer context for each merging window (to call deleteTimer with the timestamp
-    // stored under DELAYED_UNTIL_TAG in that window). Instead we let them fire and be ignored.
-
-    // If the trigger is already finished, there is no way it will become re-activated
-    if (c.trigger().isFinished()) {
-      StateMerging.clear(c.state(), DELAYED_UNTIL_TAG);
-      // NOTE: We do not attempt to delete the timers.
-      return;
-    }
-
-    // Determine the earliest point across all the windows, and delay to that.
-    StateMerging.mergeCombiningValues(c.state(), DELAYED_UNTIL_TAG);
-
-    Instant earliestTargetTime = c.state().access(DELAYED_UNTIL_TAG).read();
-    if (earliestTargetTime != null) {
-      c.setTimer(earliestTargetTime, timeDomain);
-    }
-  }
-
-  @Override
-  public void prefetchShouldFire(StateAccessor<?> state) {
-    state.access(DELAYED_UNTIL_TAG).readLater();
-  }
-
-  @Override
-  public void clear(TriggerContext c) throws Exception {
-    c.state().access(DELAYED_UNTIL_TAG).clear();
-  }
-
-  @Override
-  public Instant getWatermarkThatGuaranteesFiring(W window) {
-    return BoundedWindow.TIMESTAMP_MAX_VALUE;
-  }
-
-  /**
-   * {@inheritDoc}
-   *
-   * @return {@code true} if a target time has been recorded and the current time, when
-   *     available, is strictly after it.
-   */
-  @Override
-  public boolean shouldFire(Trigger<W>.TriggerContext context) throws Exception {
-    Instant delayedUntil = context.state().access(DELAYED_UNTIL_TAG).read();
-    if (delayedUntil == null) {
-      return false;
-    }
-
-    // Read the current time once so the null check and the comparison see the same value.
-    Instant currentTime = getCurrentTime(context);
-    return currentTime != null && currentTime.isAfter(delayedUntil);
-  }
-
-  @Override
-  protected void onOnlyFiring(Trigger<W>.TriggerContext context) throws Exception {
-    clear(context);
-  }
-
-  /** Applies every timestamp mapper, in list order, to {@code time}. */
-  protected Instant computeTargetTimestamp(Instant time) {
-    Instant result = time;
-    for (SerializableFunction<Instant, Instant> timestampMapper : timestampMappers) {
-      result = timestampMapper.apply(result);
-    }
-    return result;
-  }
-
-  /**
-   * A {@link SerializableFunction} to delay the timestamp at which this trigger fires.
-   */
-  private static final class DelayFn implements SerializableFunction<Instant, Instant> {
-    private final Duration delay;
-
-    public DelayFn(Duration delay) {
-      this.delay = delay;
-    }
-
-    @Override
-    public Instant apply(Instant input) {
-      return input.plus(delay);
-    }
-
-    @Override
-    public boolean equals(Object object) {
-      if (object == this) {
-        return true;
-      }
-
-      if (!(object instanceof DelayFn)) {
-        return false;
-      }
-
-      return this.delay.equals(((DelayFn) object).delay);
-    }
-
-    @Override
-    public int hashCode() {
-      return Objects.hash(delay);
-    }
-  }
-
-  /**
-   * A {@link SerializableFunction} to align an instant to the next interval boundary at or after
-   * it.
-   */
-  static final class AlignFn implements SerializableFunction<Instant, Instant> {
-    private final Duration size;
-    private final Instant offset;
-
-    /**
-     * Aligns timestamps to the smallest multiple of {@code size} since the {@code offset} that is
-     * greater than or equal to the timestamp. {@code size} is expected to be positive.
-     */
-    public AlignFn(Duration size, Instant offset) {
-      this.size = size;
-      this.offset = offset;
-    }
-
-    @Override
-    public Instant apply(Instant point) {
-      long sizeMillis = size.getMillis();
-      long millisSinceStart = new Duration(offset, point).getMillis() % sizeMillis;
-      if (millisSinceStart < 0) {
-        // Java's % takes the sign of the dividend, so a point before the offset yields a
-        // negative remainder. Shift it into [0, size) so that the result below is the first
-        // boundary at or after the point rather than one full interval later.
-        millisSinceStart += sizeMillis;
-      }
-      return millisSinceStart == 0 ? point : point.plus(size).minus(millisSinceStart);
-    }
-
-    @Override
-    public boolean equals(Object object) {
-      if (object == this) {
-        return true;
-      }
-
-      if (!(object instanceof AlignFn)) {
-        return false;
-      }
-
-      AlignFn other = (AlignFn) object;
-      return other.size.equals(this.size)
-          && other.offset.equals(this.offset);
-    }
-
-    @Override
-    public int hashCode() {
-      return Objects.hash(size, offset);
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterEach.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterEach.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterEach.java
deleted file mode 100644
index 4b052fa..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterEach.java
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.util.ExecutableTrigger;
-
-import org.joda.time.Instant;
-
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * A composite {@link Trigger} that runs its sub-triggers one after another.
- * At any moment exactly one sub-trigger is executing, and whenever it fires the
- * {@code AfterEach} fires too. Once the executing sub-trigger finishes, the {@code AfterEach}
- * moves on to the next one.
- *
- * <p>{@code AfterEach.inOrder(t1, t2, ...)} finishes when all of the sub-triggers have finished.
- *
- * <p>The following properties hold:
- * <ul>
- *   <li> {@code AfterEach.inOrder(AfterEach.inOrder(a, b), c)} behaves the same as
- *   {@code AfterEach.inOrder(a, b, c)} and {@code AfterEach.inOrder(a, AfterEach.inOrder(b, c))}.
- *   <li> {@code AfterEach.inOrder(Repeatedly.forever(a), b)} behaves the same as
- *   {@code Repeatedly.forever(a)}, since the repeated trigger never finishes.
- * </ul>
- *
- * @param <W> {@link BoundedWindow} subclass used to represent the windows used by this
- *            {@code Trigger}
- */
-@Experimental(Experimental.Kind.TRIGGER)
-public class AfterEach<W extends BoundedWindow> extends Trigger<W> {
-
-  /** Builds the composite; at least two sub-triggers are required. */
-  private AfterEach(List<Trigger<W>> subTriggers) {
-    super(subTriggers);
-    checkArgument(subTriggers.size() > 1);
-  }
-
-  /**
-   * Creates an {@code AfterEach} {@code Trigger} that runs the given sub-triggers in order.
-   */
-  @SafeVarargs
-  public static <W extends BoundedWindow> Trigger<W> inOrder(Trigger<W>... triggers) {
-    List<Trigger<W>> orderedTriggers = Arrays.<Trigger<W>>asList(triggers);
-    return new AfterEach<W>(orderedTriggers);
-  }
-
-  @Override
-  public void onElement(OnElementContext c) throws Exception {
-    if (c.trigger().isMerging()) {
-      // With merging windows every sub-trigger sees the element: even one that is already done
-      // may be revived by a merge and must then have adequate state.
-      for (ExecutableTrigger<W> child : c.trigger().subTriggers()) {
-        child.invokeOnElement(c);
-      }
-      return;
-    }
-
-    // Without merging, only the first unfinished sub-trigger needs to run.
-    c.trigger().firstUnfinishedSubTrigger().invokeOnElement(c);
-  }
-
-  @Override
-  public void onMerge(OnMergeContext context) throws Exception {
-    // A sub-trigger that becomes no-longer-finished through merging automatically takes part
-    // in shouldFire and onFire again. Everything after the first unfinished sub-trigger is
-    // retroactively "not started"; that is also automatic, because those sub-triggers are
-    // cleared whenever this trigger fires.
-    boolean allEarlierFinished = true;
-    for (ExecutableTrigger<W> child : context.trigger().subTriggers()) {
-      if (!allEarlierFinished) {
-        child.invokeClear(context);
-        continue;
-      }
-      child.invokeOnMerge(context);
-      allEarlierFinished = context.forTrigger(child).trigger().isFinished();
-    }
-    updateFinishedState(context);
-  }
-
-  @Override
-  public Instant getWatermarkThatGuaranteesFiring(W window) {
-    // The sequence fires at least once as soon as its first sub-trigger fires at least once.
-    Trigger<W> first = subTriggers.get(0);
-    return first.getWatermarkThatGuaranteesFiring(window);
-  }
-
-  @Override
-  public Trigger<W> getContinuationTrigger(List<Trigger<W>> continuationTriggers) {
-    AfterFirst<W> anyContinuation = new AfterFirst<W>(continuationTriggers);
-    return Repeatedly.forever(anyContinuation);
-  }
-
-  @Override
-  public boolean shouldFire(Trigger<W>.TriggerContext context) throws Exception {
-    // Only the currently executing (first unfinished) sub-trigger decides.
-    return context.trigger().firstUnfinishedSubTrigger().invokeShouldFire(context);
-  }
-
-  @Override
-  public void onFire(Trigger<W>.TriggerContext context) throws Exception {
-    ExecutableTrigger<W> current = context.trigger().firstUnfinishedSubTrigger();
-    current.invokeOnFire(context);
-
-    // In a merging context any sub-trigger may be revived by merging, so they all run in
-    // parallel for each pending pane and must all be reset after firing.
-    if (context.trigger().isMerging()) {
-      for (ExecutableTrigger<W> child : context.trigger().subTriggers()) {
-        child.invokeClear(context);
-      }
-    }
-
-    updateFinishedState(context);
-  }
-
-  /** Marks this trigger finished exactly when no unfinished sub-trigger remains. */
-  private void updateFinishedState(TriggerContext context) {
-    boolean noneLeft = context.trigger().firstUnfinishedSubTrigger() == null;
-    context.trigger().setFinished(noneLeft);
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterFirst.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterFirst.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterFirst.java
deleted file mode 100644
index 29b19bf..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterFirst.java
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Trigger.OnceTrigger;
-import com.google.cloud.dataflow.sdk.util.ExecutableTrigger;
-import com.google.common.base.Preconditions;
-
-import org.joda.time.Instant;
-
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * A composite {@link Trigger} that fires once after at least one of its sub-triggers has
- * fired.
- *
- * @param <W> {@link BoundedWindow} subclass used to represent the windows used by this
- *            {@code Trigger}
- */
-@Experimental(Experimental.Kind.TRIGGER)
-public class AfterFirst<W extends BoundedWindow> extends OnceTrigger<W> {
-
-  /** Builds the composite; at least two sub-triggers are required. */
-  AfterFirst(List<Trigger<W>> subTriggers) {
-    super(subTriggers);
-    Preconditions.checkArgument(subTriggers.size() > 1);
-  }
-
-  /**
-   * Creates an {@code AfterFirst} {@code Trigger} composed of the given sub-triggers.
-   */
-  @SafeVarargs
-  public static <W extends BoundedWindow> OnceTrigger<W> of(
-      OnceTrigger<W>... triggers) {
-    List<Trigger<W>> subTriggerList = Arrays.<Trigger<W>>asList(triggers);
-    return new AfterFirst<W>(subTriggerList);
-  }
-
-  @Override
-  public void onElement(OnElementContext c) throws Exception {
-    // Every sub-trigger observes every element.
-    for (ExecutableTrigger<W> child : c.trigger().subTriggers()) {
-      child.invokeOnElement(c);
-    }
-  }
-
-  @Override
-  public void onMerge(OnMergeContext c) throws Exception {
-    for (ExecutableTrigger<W> child : c.trigger().subTriggers()) {
-      child.invokeOnMerge(c);
-    }
-    updateFinishedStatus(c);
-  }
-
-  @Override
-  public Instant getWatermarkThatGuaranteesFiring(W window) {
-    // One firing sub-trigger is enough, so the guarantee is the earliest of the individual ones.
-    Instant earliest = BoundedWindow.TIMESTAMP_MAX_VALUE;
-    for (Trigger<W> child : subTriggers) {
-      Instant childDeadline = child.getWatermarkThatGuaranteesFiring(window);
-      if (earliest.isAfter(childDeadline)) {
-        earliest = childDeadline;
-      }
-    }
-    return earliest;
-  }
-
-  @Override
-  public OnceTrigger<W> getContinuationTrigger(List<Trigger<W>> continuationTriggers) {
-    return new AfterFirst<W>(continuationTriggers);
-  }
-
-  /**
-   * {@inheritDoc}
-   *
-   * @return {@code true} if any sub-trigger is already finished or is ready to fire.
-   */
-  @Override
-  public boolean shouldFire(Trigger<W>.TriggerContext context) throws Exception {
-    for (ExecutableTrigger<W> child : context.trigger().subTriggers()) {
-      if (context.forTrigger(child).trigger().isFinished()) {
-        return true;
-      }
-      if (child.invokeShouldFire(context)) {
-        return true;
-      }
-    }
-    return false;
-  }
-
-  @Override
-  protected void onOnlyFiring(TriggerContext context) throws Exception {
-    for (ExecutableTrigger<W> child : context.trigger().subTriggers()) {
-      TriggerContext childContext = context.forTrigger(child);
-      if (!child.invokeShouldFire(childContext)) {
-        // Not ready to fire, but the pending pane it was tracking is gone all the same.
-        child.invokeClear(childContext);
-        continue;
-      }
-      // Ready to fire: let the sub-trigger do whatever it needs to do.
-      child.invokeOnFire(childContext);
-    }
-  }
-
-  /** Marks this trigger finished when at least one sub-trigger is finished. */
-  private void updateFinishedStatus(TriggerContext c) {
-    boolean someChildFinished = false;
-    for (ExecutableTrigger<W> child : c.trigger().subTriggers()) {
-      if (c.forTrigger(child).trigger().isFinished()) {
-        someChildFinished = true;
-      }
-    }
-    c.trigger().setFinished(someChildFinished);
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterPane.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterPane.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterPane.java
deleted file mode 100644
index 28c8560..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterPane.java
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.coders.VarLongCoder;
-import com.google.cloud.dataflow.sdk.transforms.Sum;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Trigger.OnceTrigger;
-import com.google.cloud.dataflow.sdk.util.state.AccumulatorCombiningState;
-import com.google.cloud.dataflow.sdk.util.state.MergingStateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.StateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.StateMerging;
-import com.google.cloud.dataflow.sdk.util.state.StateTag;
-import com.google.cloud.dataflow.sdk.util.state.StateTags;
-
-import org.joda.time.Instant;
-
-import java.util.List;
-import java.util.Objects;
-
-/**
- * {@link Trigger}s that fire based on properties of the elements in the current pane.
- *
- * @param <W> {@link BoundedWindow} subclass used to represent the windows used by this
- * {@link Trigger}
- */
-@Experimental(Experimental.Kind.TRIGGER)
-public class AfterPane<W extends BoundedWindow> extends OnceTrigger<W>{
-
-private static final StateTag<Object, AccumulatorCombiningState<Long, long[], Long>>
- ELEMENTS_IN_PANE_TAG =
- StateTags.makeSystemTagInternal(StateTags.combiningValueFromInputInternal(
- "count", VarLongCoder.of(), new Sum.SumLongFn()));
-
- private final int countElems;
-
- private AfterPane(int countElems) {
- super(null);
- this.countElems = countElems;
- }
-
- /**
- * Creates a trigger that fires when the pane contains at least {@code countElems} elements.
- */
- public static <W extends BoundedWindow> AfterPane<W> elementCountAtLeast(int countElems) {
- return new AfterPane<>(countElems);
- }
-
- @Override
- public void onElement(OnElementContext c) throws Exception {
- c.state().access(ELEMENTS_IN_PANE_TAG).add(1L);
- }
-
- @Override
- public void prefetchOnMerge(MergingStateAccessor<?, W> state) {
- super.prefetchOnMerge(state);
- StateMerging.prefetchCombiningValues(state, ELEMENTS_IN_PANE_TAG);
- }
-
- @Override
- public void onMerge(OnMergeContext context) throws Exception {
- // If we've already received enough elements and finished in some window,
- // then this trigger is just finished.
- if (context.trigger().finishedInAnyMergingWindow()) {
- context.trigger().setFinished(true);
- StateMerging.clear(context.state(), ELEMENTS_IN_PANE_TAG);
- return;
- }
-
- // Otherwise, compute the sum of elements in all the active panes.
- StateMerging.mergeCombiningValues(context.state(), ELEMENTS_IN_PANE_TAG);
- }
-
- @Override
- public void prefetchShouldFire(StateAccessor<?> state) {
- state.access(ELEMENTS_IN_PANE_TAG).readLater();
- }
-
- @Override
- public boolean shouldFire(Trigger<W>.TriggerContext context) throws Exception {
- long count = context.state().access(ELEMENTS_IN_PANE_TAG).read();
- return count >= countElems;
- }
-
- @Override
- public void clear(TriggerContext c) throws Exception {
- c.state().access(ELEMENTS_IN_PANE_TAG).clear();
- }
-
- @Override
- public boolean isCompatible(Trigger<?> other) {
- return this.equals(other);
- }
-
- @Override
- public Instant getWatermarkThatGuaranteesFiring(W window) {
- return BoundedWindow.TIMESTAMP_MAX_VALUE;
- }
-
- @Override
- public OnceTrigger<W> getContinuationTrigger(List<Trigger<W>> continuationTriggers) {
- return AfterPane.elementCountAtLeast(1);
- }
-
- @Override
- public String toString() {
- return "AfterPane.elementCountAtLeast(" + countElems + ")";
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj) {
- return true;
- }
- if (!(obj instanceof AfterPane)) {
- return false;
- }
- AfterPane<?> that = (AfterPane<?>) obj;
- return this.countElems == that.countElems;
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(countElems);
- }
-
- @Override
- protected void onOnlyFiring(Trigger<W>.TriggerContext context) throws Exception {
- clear(context);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterProcessingTime.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterProcessingTime.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterProcessingTime.java
deleted file mode 100644
index 7e89902..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterProcessingTime.java
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
-import com.google.cloud.dataflow.sdk.util.TimeDomain;
-
-import org.joda.time.Instant;
-
-import java.util.List;
-import java.util.Objects;
-
-import javax.annotation.Nullable;
-
-/**
- * {@code AfterProcessingTime} triggers fire based on the current processing time. They operate in
- * the real-time domain.
- *
- * <p>The time at which to fire the timer can be adjusted via the methods in {@link TimeTrigger},
- * such as {@link TimeTrigger#plusDelayOf} or {@link TimeTrigger#alignedTo}.
- *
- * @param <W> {@link BoundedWindow} subclass used to represent the windows used
- */
-@Experimental(Experimental.Kind.TRIGGER)
-public class AfterProcessingTime<W extends BoundedWindow> extends AfterDelayFromFirstElement<W> {
-
- @Override
- @Nullable
- public Instant getCurrentTime(Trigger<W>.TriggerContext context) {
- return context.currentProcessingTime();
- }
-
- private AfterProcessingTime(List<SerializableFunction<Instant, Instant>> transforms) {
- super(TimeDomain.PROCESSING_TIME, transforms);
- }
-
- /**
- * Creates a trigger that fires when the current processing time passes the processing time
- * at which this trigger saw the first element in a pane.
- */
- public static <W extends BoundedWindow> AfterProcessingTime<W> pastFirstElementInPane() {
- return new AfterProcessingTime<W>(IDENTITY);
- }
-
- @Override
- protected AfterProcessingTime<W> newWith(
- List<SerializableFunction<Instant, Instant>> transforms) {
- return new AfterProcessingTime<W>(transforms);
- }
-
- @Override
- public Instant getWatermarkThatGuaranteesFiring(W window) {
- return BoundedWindow.TIMESTAMP_MAX_VALUE;
- }
-
- @Override
- protected Trigger<W> getContinuationTrigger(List<Trigger<W>> continuationTriggers) {
- return new AfterSynchronizedProcessingTime<W>();
- }
-
- @Override
- public String toString() {
- return "AfterProcessingTime.pastFirstElementInPane(" + timestampMappers + ")";
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj) {
- return true;
- }
- if (!(obj instanceof AfterProcessingTime)) {
- return false;
- }
- AfterProcessingTime<?> that = (AfterProcessingTime<?>) obj;
- return Objects.equals(this.timestampMappers, that.timestampMappers);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(getClass(), this.timestampMappers);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterSynchronizedProcessingTime.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterSynchronizedProcessingTime.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterSynchronizedProcessingTime.java
deleted file mode 100644
index 0a274c9..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterSynchronizedProcessingTime.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
-import com.google.cloud.dataflow.sdk.util.TimeDomain;
-import com.google.common.base.Objects;
-
-import org.joda.time.Instant;
-
-import java.util.Collections;
-import java.util.List;
-
-import javax.annotation.Nullable;
-
-class AfterSynchronizedProcessingTime<W extends BoundedWindow>
- extends AfterDelayFromFirstElement<W> {
-
- @Override
- @Nullable
- public Instant getCurrentTime(Trigger<W>.TriggerContext context) {
- return context.currentSynchronizedProcessingTime();
- }
-
- public AfterSynchronizedProcessingTime() {
- super(TimeDomain.SYNCHRONIZED_PROCESSING_TIME,
- Collections.<SerializableFunction<Instant, Instant>>emptyList());
- }
-
- @Override
- public Instant getWatermarkThatGuaranteesFiring(W window) {
- return BoundedWindow.TIMESTAMP_MAX_VALUE;
- }
-
- @Override
- protected Trigger<W> getContinuationTrigger(List<Trigger<W>> continuationTriggers) {
- return this;
- }
-
- @Override
- public String toString() {
- return "AfterSynchronizedProcessingTime.pastFirstElementInPane()";
- }
-
- @Override
- public boolean equals(Object obj) {
- return this == obj || obj instanceof AfterSynchronizedProcessingTime;
- }
-
- @Override
- public int hashCode() {
- return Objects.hashCode(AfterSynchronizedProcessingTime.class);
- }
-
- @Override
- protected AfterSynchronizedProcessingTime<W>
- newWith(List<SerializableFunction<Instant, Instant>> transforms) {
- // ignore transforms
- return this;
- }
-
-}
[05/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/worker/StateSampler.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/worker/StateSampler.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/worker/StateSampler.java
deleted file mode 100644
index 00d3b3b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/worker/StateSampler.java
+++ /dev/null
@@ -1,365 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.util.common.worker;
-
-import com.google.cloud.dataflow.sdk.util.common.Counter;
-import com.google.cloud.dataflow.sdk.util.common.CounterSet;
-import com.google.common.util.concurrent.ThreadFactoryBuilder;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.concurrent.Executors;
-import java.util.concurrent.ScheduledExecutorService;
-import java.util.concurrent.ScheduledFuture;
-import java.util.concurrent.TimeUnit;
-
-import javax.annotation.concurrent.ThreadSafe;
-
-/**
- * A StateSampler object may be used to obtain an approximate
- * breakdown of the time spent by an execution context in various
- * states, as a fraction of the total time. The sampling is taken at
- * regular intervals, with adjustment for scheduling delay.
- */
-@ThreadSafe
-public class StateSampler implements AutoCloseable {
-
- /** Different kinds of states. */
- public enum StateKind {
- /** IO, user code, etc. */
- USER,
- /** Reading/writing from/to shuffle service, etc. */
- FRAMEWORK
- }
-
- public static final long DEFAULT_SAMPLING_PERIOD_MS = 200;
-
- private final String prefix;
- private final CounterSet.AddCounterMutator counterSetMutator;
-
- /** Array of counters indexed by their state. */
- private ArrayList<Counter<Long>> countersByState = new ArrayList<>();
-
- /** Map of state name to state. */
- private Map<String, Integer> statesByName = new HashMap<>();
-
- /** Map of state id to kind. */
- private Map<Integer, StateKind> kindsByState = new HashMap<>();
-
- /** The current state. */
- private volatile int currentState;
-
- /** Special value of {@code currentState} that means we do not sample. */
- public static final int DO_NOT_SAMPLE = -1;
-
- /**
- * A counter that increments with each state transition. May be used
- * to detect a context being stuck in a state for some amount of
- * time.
- */
- private volatile long stateTransitionCount;
-
- /**
- * The timestamp (in nanoseconds) corresponding to the last time the
- * state was sampled (and recorded).
- */
- private long stateTimestampNs = 0;
-
- /** Using a fixed number of timers for all StateSampler objects. */
- private static final int NUM_EXECUTOR_THREADS = 16;
-
- private static final ScheduledExecutorService executorService =
- Executors.newScheduledThreadPool(NUM_EXECUTOR_THREADS,
- new ThreadFactoryBuilder().setDaemon(true).build());
-
- private Random rand = new Random();
-
- private List<SamplingCallback> callbacks = new ArrayList<>();
-
- private ScheduledFuture<?> invocationTriggerFuture = null;
-
- private ScheduledFuture<?> invocationFuture = null;
-
- /**
- * Constructs a new {@link StateSampler} that can be used to obtain
- * an approximate breakdown of the time spent by an execution
- * context in various states, as a fraction of the total time.
- *
- * @param prefix the prefix of the counter names for the states
- * @param counterSetMutator the {@link CounterSet.AddCounterMutator}
- * used to create a counter for each distinct state
- * @param samplingPeriodMs the sampling period in milliseconds
- */
- public StateSampler(String prefix,
- CounterSet.AddCounterMutator counterSetMutator,
- final long samplingPeriodMs) {
- this.prefix = prefix;
- this.counterSetMutator = counterSetMutator;
- currentState = DO_NOT_SAMPLE;
- scheduleSampling(samplingPeriodMs);
- }
-
- /**
- * Constructs a new {@link StateSampler} that can be used to obtain
- * an approximate breakdown of the time spent by an execution
- * context in various states, as a fraction of the total time.
- *
- * @param prefix the prefix of the counter names for the states
- * @param counterSetMutator the {@link CounterSet.AddCounterMutator}
- * used to create a counter for each distinct state
- */
- public StateSampler(String prefix,
- CounterSet.AddCounterMutator counterSetMutator) {
- this(prefix, counterSetMutator, DEFAULT_SAMPLING_PERIOD_MS);
- }
-
- /**
- * Called by the constructor to schedule sampling at the given period.
- *
- * <p>Should not be overridden by sub-classes unless they want to change
- * or disable the automatic sampling of state.
- */
- protected void scheduleSampling(final long samplingPeriodMs) {
- // Here "stratified sampling" is used, which makes sure that there's 1 uniformly chosen sampled
- // point in every bucket of samplingPeriodMs, to prevent pathological behavior in case some
- // states happen to occur at a similar period.
- // The current implementation uses a fixed-rate timer with a period samplingPeriodMs as a
- // trampoline to a one-shot random timer which fires with a random delay within
- // samplingPeriodMs.
- stateTimestampNs = System.nanoTime();
- invocationTriggerFuture =
- executorService.scheduleAtFixedRate(
- new Runnable() {
- @Override
- public void run() {
- long delay = rand.nextInt((int) samplingPeriodMs);
- synchronized (StateSampler.this) {
- if (invocationFuture != null) {
- invocationFuture.cancel(false);
- }
- invocationFuture =
- executorService.schedule(
- new Runnable() {
- @Override
- public void run() {
- StateSampler.this.run();
- }
- },
- delay,
- TimeUnit.MILLISECONDS);
- }
- }
- },
- 0,
- samplingPeriodMs,
- TimeUnit.MILLISECONDS);
- }
-
- public synchronized void run() {
- long startTimestampNs = System.nanoTime();
- int state = currentState;
- if (state != DO_NOT_SAMPLE) {
- StateKind kind = null;
- long elapsedMs = TimeUnit.NANOSECONDS.toMillis(startTimestampNs - stateTimestampNs);
- kind = kindsByState.get(state);
- countersByState.get(state).addValue(elapsedMs);
- // Invoke all callbacks.
- for (SamplingCallback c : callbacks) {
- c.run(state, kind, elapsedMs);
- }
- }
- stateTimestampNs = startTimestampNs;
- }
-
- @Override
- public synchronized void close() {
- currentState = DO_NOT_SAMPLE;
- if (invocationTriggerFuture != null) {
- invocationTriggerFuture.cancel(false);
- }
- if (invocationFuture != null) {
- invocationFuture.cancel(false);
- }
- }
-
- /**
- * Returns the state associated with a name; creating a new state if
- * necessary. Using states instead of state names during state
- * transitions is done for efficiency.
- *
- * @name the name for the state
- * @kind kind of the state, see {#code StateKind}
- * @return the state associated with the state name
- */
- public int stateForName(String name, StateKind kind) {
- if (name.isEmpty()) {
- return DO_NOT_SAMPLE;
- }
-
- synchronized (this) {
- Integer state = statesByName.get(name);
- if (state == null) {
- String counterName = prefix + name + "-msecs";
- Counter<Long> counter = counterSetMutator.addCounter(
- Counter.longs(counterName, Counter.AggregationKind.SUM));
- state = countersByState.size();
- statesByName.put(name, state);
- countersByState.add(counter);
- kindsByState.put(state, kind);
- }
- StateKind originalKind = kindsByState.get(state);
- if (originalKind != kind) {
- throw new IllegalArgumentException(
- "for state named " + name
- + ", requested kind " + kind + " different from the original kind " + originalKind);
- }
- return state;
- }
- }
-
- /**
- * An internal class for representing StateSampler information
- * typically used for debugging.
- */
- public static class StateSamplerInfo {
- public final String state;
- public final Long transitionCount;
- public final Long stateDurationMillis;
-
- public StateSamplerInfo(String state, Long transitionCount,
- Long stateDurationMillis) {
- this.state = state;
- this.transitionCount = transitionCount;
- this.stateDurationMillis = stateDurationMillis;
- }
- }
-
- /**
- * Returns information about the current state of this state sampler
- * into a {@link StateSamplerInfo} object, or null if sampling is
- * not turned on.
- *
- * @return information about this state sampler or null if sampling is off
- */
- public synchronized StateSamplerInfo getInfo() {
- return currentState == DO_NOT_SAMPLE ? null
- : new StateSamplerInfo(countersByState.get(currentState).getName(),
- stateTransitionCount, null);
- }
-
- /**
- * Returns the current state of this state sampler.
- */
- public int getCurrentState() {
- return currentState;
- }
-
- /**
- * Sets the current thread state.
- *
- * @param state the new state to transition to
- * @return the previous state
- */
- public int setState(int state) {
- // Updates to stateTransitionCount are always done by the same
- // thread, making the non-atomic volatile update below safe. The
- // count is updated first to avoid incorrectly attributing
- // stuckness occuring in an old state to the new state.
- long previousStateTransitionCount = this.stateTransitionCount;
- this.stateTransitionCount = previousStateTransitionCount + 1;
- int previousState = currentState;
- currentState = state;
- return previousState;
- }
-
- /**
- * Sets the current thread state.
- *
- * @param name the name of the new state to transition to
- * @param kind kind of the new state
- * @return the previous state
- */
- public int setState(String name, StateKind kind) {
- return setState(stateForName(name, kind));
- }
-
- /**
- * Returns an AutoCloseable {@link ScopedState} that will perform a
- * state transition to the given state, and will automatically reset
- * the state to the prior state upon closing.
- *
- * @param state the new state to transition to
- * @return a {@link ScopedState} that automatically resets the state
- * to the prior state
- */
- public ScopedState scopedState(int state) {
- return new ScopedState(this, setState(state));
- }
-
- /**
- * Add a callback to the sampler.
- * The callbacks will be executed sequentially upon {@link StateSampler#run}.
- */
- public synchronized void addSamplingCallback(SamplingCallback callback) {
- callbacks.add(callback);
- }
-
- /** Get the counter prefix associated with this sampler. */
- public String getPrefix() {
- return prefix;
- }
-
- /**
- * A nested class that is used to account for states and state
- * transitions based on lexical scopes.
- *
- * <p>Thread-safe.
- */
- public class ScopedState implements AutoCloseable {
- private StateSampler sampler;
- private int previousState;
-
- private ScopedState(StateSampler sampler, int previousState) {
- this.sampler = sampler;
- this.previousState = previousState;
- }
-
- @Override
- public void close() {
- sampler.setState(previousState);
- }
- }
-
- /**
- * Callbacks which supposed to be called sequentially upon {@link StateSampler#run}.
- * They should be registered via {@link #addSamplingCallback}.
- */
- public static interface SamplingCallback {
- /**
- * The entrance method of the callback, it is called in {@link StateSampler#run},
- * once per sample. This method should be thread safe.
- *
- * @param state The state of the StateSampler at the time of sample.
- * @param kind The kind associated with the state, see {@link StateKind}.
- * @param elapsedMs Milliseconds since last sample.
- */
- public void run(int state, StateKind kind, long elapsedMs);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/worker/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/worker/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/worker/package-info.java
deleted file mode 100644
index c3da9ed..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/common/worker/package-info.java
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/** Defines utilities used to implement the harness that runs user code. **/
-package com.google.cloud.dataflow.sdk.util.common.worker;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/gcsfs/GcsPath.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/gcsfs/GcsPath.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/gcsfs/GcsPath.java
deleted file mode 100644
index f72ba4c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/gcsfs/GcsPath.java
+++ /dev/null
@@ -1,619 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util.gcsfs;
-
-import com.google.api.services.storage.model.StorageObject;
-import com.google.common.base.Preconditions;
-import com.google.common.base.Strings;
-
-import java.io.File;
-import java.io.IOException;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.nio.file.FileSystem;
-import java.nio.file.LinkOption;
-import java.nio.file.Path;
-import java.nio.file.WatchEvent;
-import java.nio.file.WatchKey;
-import java.nio.file.WatchService;
-import java.util.Iterator;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import javax.annotation.Nonnull;
-import javax.annotation.Nullable;
-
-/**
- * Implements the Java NIO {@link Path} API for Google Cloud Storage paths.
- *
- * <p>GcsPath uses a slash ('/') as a directory separator. Below is
- * a summary of how slashes are treated:
- * <ul>
- * <li> A GCS bucket may not contain a slash. An object may contain zero or
- * more slashes.
- * <li> A trailing slash always indicates a directory, which is compliant
- * with POSIX.1-2008.
- * <li> Slashes separate components of a path. Empty components are allowed,
- * these are represented as repeated slashes. An empty component always
- * refers to a directory, and always ends in a slash.
- * <li> {@link #getParent()} always returns a path ending in a slash, as the
- * parent of a GcsPath is always a directory.
- * <li> Use {@link #resolve(String)} to append elements to a GcsPath -- this
- * applies the rules consistently and is highly recommended over any
- * custom string concatenation.
- * </ul>
- *
- * <p>GcsPath treats all GCS objects and buckets as belonging to the same
- * filesystem, so the root of a GcsPath is the GcsPath bucket="", object="".
- *
- * <p>Relative paths are not associated with any bucket. This matches common
- * treatment of Path in which relative paths can be constructed from one
- * filesystem and appended to another filesystem.
- *
- * @see <a href=
- * "http://docs.oracle.com/javase/tutorial/essential/io/pathOps.html"
- * >Java Tutorials: Path Operations</a>
- */
-public class GcsPath implements Path {
-
- public static final String SCHEME = "gs";
-
- /**
- * Creates a GcsPath from a URI of the form {@code gs://[bucket]/[path]}.
- *
- * @throws IllegalArgumentException if the scheme is missing or is not
- * {@code gs}, or if the URI has a port, user info, a query, or a fragment.
- */
- public static GcsPath fromUri(URI uri) {
- Preconditions.checkArgument(SCHEME.equalsIgnoreCase(uri.getScheme()),
- "URI: %s is not a GCS URI", uri);
- // Preconditions message templates substitute only %s placeholders.
- Preconditions.checkArgument(uri.getPort() == -1,
- "GCS URI may not specify port: %s (%s)", uri, uri.getPort());
- Preconditions.checkArgument(
- Strings.isNullOrEmpty(uri.getUserInfo()),
- "GCS URI may not specify userInfo: %s (%s)", uri, uri.getUserInfo());
- Preconditions.checkArgument(
- Strings.isNullOrEmpty(uri.getQuery()),
- "GCS URI may not specify query: %s (%s)", uri, uri.getQuery());
- Preconditions.checkArgument(
- Strings.isNullOrEmpty(uri.getFragment()),
- "GCS URI may not specify fragment: %s (%s)", uri, uri.getFragment());
- return fromUri(uri.toString());
- }
-
- /**
- * Pattern that is used to parse a GCS URL.
- *
- * <p>This is used to separate the components. Verification is handled
- * separately.
- */
- public static final Pattern GCS_URI =
- Pattern.compile("(?<SCHEME>[^:]+)://(?<BUCKET>[^/]+)(/(?<OBJECT>.*))?");
-
- /**
- * Creates a GcsPath from a URI in string form.
- *
- * <p>This does not use URI parsing, which means it may accept patterns that
- * the URI parser would not accept.
- */
- public static GcsPath fromUri(String uri) {
- Matcher m = GCS_URI.matcher(uri);
- Preconditions.checkArgument(m.matches(), "Invalid GCS URI: %s", uri);
-
- Preconditions.checkArgument(m.group("SCHEME").equalsIgnoreCase(SCHEME),
- "URI: %s is not a GCS URI", uri);
- return new GcsPath(null, m.group("BUCKET"), m.group("OBJECT"));
- }
-
- /**
- * Pattern that is used to parse a GCS resource name; the host dots are literal.
- */
- private static final Pattern GCS_RESOURCE_NAME =
- Pattern.compile("storage\\.googleapis\\.com/(?<BUCKET>[^/]+)(/(?<OBJECT>.*))?");
-
- /**
- * Creates a GcsPath from a OnePlatform resource name in string form.
- */
- public static GcsPath fromResourceName(String name) {
- Matcher m = GCS_RESOURCE_NAME.matcher(name);
- Preconditions.checkArgument(m.matches(), "Invalid GCS resource name: %s", name);
-
- return new GcsPath(null, m.group("BUCKET"), m.group("OBJECT"));
- }
-
- /**
- * Creates a GcsPath from a {@linkplain StorageObject}.
- */
- public static GcsPath fromObject(StorageObject object) {
- return new GcsPath(null, object.getBucket(), object.getName());
- }
-
- /**
- * Creates a GcsPath from bucket and object components.
- *
- * <p>A GcsPath without a bucket name is treated as a relative path, which
- * is a path component with no linkage to the root element. This is similar
- * to a Unix path that does not begin with the root marker (a slash).
- * GCS has different naming constraints and APIs for working with buckets and
- * objects, so these two concepts are kept separate to avoid accidental
- * attempts to treat objects as buckets, or vice versa, as much as possible.
- *
- * <p>A GcsPath without an object name is a bucket reference.
- * A bucket is always a directory, which could be used to lookup or add
- * files to a bucket, but could not be opened as a file.
- *
- * <p>A GcsPath containing neither bucket nor object names is treated as
- * the root of the GCS filesystem. A listing on the root element would return
- * the buckets available to the user.
- *
- * <p>If {@code null} is passed as either parameter, it is converted to an
- * empty string internally for consistency. There is no distinction between
- * an empty string and a {@code null}, as neither are allowed by GCS.
- *
- * @param bucket a GCS bucket name, or none ({@code null} or an empty string)
- * if the object is not associated with a bucket
- * (e.g. relative paths or the root node).
- * @param object a GCS object path, or none ({@code null} or an empty string)
- * for no object.
- */
- public static GcsPath fromComponents(@Nullable String bucket,
- @Nullable String object) {
- return new GcsPath(null, bucket, object);
- }
-
- @Nullable
- private FileSystem fs;
- @Nonnull
- private final String bucket;
- @Nonnull
- private final String object;
-
- /**
- * Constructs a GcsPath.
- *
- * @param fs the associated FileSystem, if any
- * @param bucket the associated bucket, or none ({@code null} or an empty
- * string) for a relative path component
- * @param object the object, which is a fully-qualified object name if bucket
- * was also provided, or none ({@code null} or an empty string)
- * for no object
- * @throws java.lang.IllegalArgumentException if the bucket or object names
- * are invalid.
- */
- public GcsPath(@Nullable FileSystem fs,
- @Nullable String bucket,
- @Nullable String object) {
- if (bucket == null) {
- bucket = "";
- }
- Preconditions.checkArgument(!bucket.contains("/"),
- "GCS bucket may not contain a slash");
- Preconditions
- .checkArgument(bucket.isEmpty()
- || bucket.matches("[a-z0-9][-_a-z0-9.]+[a-z0-9]"),
- "GCS bucket names must contain only lowercase letters, numbers, "
- + "dashes (-), underscores (_), and dots (.). Bucket names "
- + "must start and end with a number or letter. "
- + "See https://developers.google.com/storage/docs/bucketnaming "
- + "for more details. Bucket name: " + bucket);
-
- if (object == null) {
- object = "";
- }
- Preconditions.checkArgument(
- object.indexOf('\n') < 0 && object.indexOf('\r') < 0,
- "GCS object names must not contain Carriage Return or "
- + "Line Feed characters.");
-
- this.fs = fs;
- this.bucket = bucket;
- this.object = object;
- }
-
- /**
- * Returns the bucket name associated with this GCS path, or an empty string
- * if this is a relative path component.
- */
- public String getBucket() {
- return bucket;
- }
-
- /**
- * Returns the object name associated with this GCS path, or an empty string
- * if no object is specified.
- */
- public String getObject() {
- return object;
- }
-
- public void setFileSystem(FileSystem fs) {
- this.fs = fs;
- }
-
- @Override
- public FileSystem getFileSystem() {
- return fs;
- }
-
- // Absolute paths are those that have a bucket, plus the root path itself.
- @Override
- public boolean isAbsolute() {
- return !bucket.isEmpty() || object.isEmpty();
- }
-
- @Override
- public GcsPath getRoot() {
- return new GcsPath(fs, "", "");
- }
-
- @Override
- public GcsPath getFileName() {
- throw new UnsupportedOperationException();
- }
-
- /**
- * Returns the <em>parent path</em>, or {@code null} if this path does not
- * have a parent.
- *
- * <p>Returns a path that ends in '/', as the parent path always refers to
- * a directory.
- */
- @Override
- public GcsPath getParent() {
- if (bucket.isEmpty() && object.isEmpty()) {
- // The root path has no parent, by definition.
- return null;
- }
-
- if (object.isEmpty()) {
- // A GCS bucket. All buckets come from a common root.
- return getRoot();
- }
-
- // Skip last character, in case it is a trailing slash.
- int i = object.lastIndexOf('/', object.length() - 2);
- if (i <= 0) {
- if (bucket.isEmpty()) {
- // Relative paths are not attached to the root node.
- return null;
- }
- return new GcsPath(fs, bucket, "");
- }
-
- // Retain trailing slash.
- return new GcsPath(fs, bucket, object.substring(0, i + 1));
- }
-
- @Override
- public int getNameCount() {
- int count = bucket.isEmpty() ? 0 : 1;
- if (object.isEmpty()) {
- return count;
- }
-
- // Add another for each separator found.
- int index = -1;
- while ((index = object.indexOf('/', index + 1)) != -1) {
- count++;
- }
-
- return object.endsWith("/") ? count : count + 1;
- }
-
- @Override
- public GcsPath getName(int count) {
- Preconditions.checkArgument(count >= 0);
-
- Iterator<Path> iterator = iterator();
- for (int i = 0; i < count; ++i) {
- Preconditions.checkArgument(iterator.hasNext());
- iterator.next();
- }
-
- Preconditions.checkArgument(iterator.hasNext());
- return (GcsPath) iterator.next();
- }
-
- @Override
- public GcsPath subpath(int beginIndex, int endIndex) {
- Preconditions.checkArgument(beginIndex >= 0);
- Preconditions.checkArgument(endIndex > beginIndex);
-
- Iterator<Path> iterator = iterator();
- for (int i = 0; i < beginIndex; ++i) {
- Preconditions.checkArgument(iterator.hasNext());
- iterator.next();
- }
-
- GcsPath path = null;
- while (beginIndex < endIndex) {
- Preconditions.checkArgument(iterator.hasNext());
- if (path == null) {
- path = (GcsPath) iterator.next();
- } else {
- path = path.resolve(iterator.next());
- }
- ++beginIndex;
- }
-
- return path;
- }
-
- @Override
- public boolean startsWith(Path other) {
- if (other instanceof GcsPath) {
- GcsPath gcsPath = (GcsPath) other;
- return startsWith(gcsPath.bucketAndObject());
- } else {
- return startsWith(other.toString());
- }
- }
-
- @Override
- public boolean startsWith(String prefix) {
- return bucketAndObject().startsWith(prefix);
- }
-
- @Override
- public boolean endsWith(Path other) {
- if (other instanceof GcsPath) {
- GcsPath gcsPath = (GcsPath) other;
- return endsWith(gcsPath.bucketAndObject());
- } else {
- return endsWith(other.toString());
- }
- }
-
- @Override
- public boolean endsWith(String suffix) {
- return bucketAndObject().endsWith(suffix);
- }
-
- // TODO: support "." and ".." path components?
- @Override
- public GcsPath normalize() {
- return this;
- }
-
- @Override
- public GcsPath resolve(Path other) {
- if (other instanceof GcsPath) {
- GcsPath path = (GcsPath) other;
- if (path.isAbsolute()) {
- return path;
- } else {
- return resolve(path.getObject());
- }
- } else {
- return resolve(other.toString());
- }
- }
-
- @Override
- public GcsPath resolve(String other) {
- if (bucket.isEmpty() && object.isEmpty()) {
- // Resolve on a root path is equivalent to looking up a bucket and object.
- other = SCHEME + "://" + other;
- }
-
- if (other.startsWith(SCHEME + "://")) {
- GcsPath path = GcsPath.fromUri(other);
- path.setFileSystem(getFileSystem());
- return path;
- }
-
- if (other.isEmpty()) {
- // An empty component MUST refer to a directory.
- other = "/";
- }
-
- if (object.isEmpty()) {
- return new GcsPath(fs, bucket, other);
- } else if (object.endsWith("/")) {
- return new GcsPath(fs, bucket, object + other);
- } else {
- return new GcsPath(fs, bucket, object + "/" + other);
- }
- }
-
- @Override
- public Path resolveSibling(Path other) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public Path resolveSibling(String other) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public Path relativize(Path other) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public GcsPath toAbsolutePath() {
- return this;
- }
-
- @Override
- public GcsPath toRealPath(LinkOption... options) throws IOException {
- return this;
- }
-
- @Override
- public File toFile() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public WatchKey register(WatchService watcher, WatchEvent.Kind<?>[] events,
- WatchEvent.Modifier... modifiers) throws IOException {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public WatchKey register(WatchService watcher, WatchEvent.Kind<?>... events)
- throws IOException {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public Iterator<Path> iterator() {
- return new NameIterator(fs, !bucket.isEmpty(), bucketAndObject());
- }
-
- private static class NameIterator implements Iterator<Path> {
- private final FileSystem fs;
- private boolean fullPath;
- private String name;
-
- NameIterator(FileSystem fs, boolean fullPath, String name) {
- this.fs = fs;
- this.fullPath = fullPath;
- this.name = name;
- }
-
- @Override
- public boolean hasNext() {
- return !Strings.isNullOrEmpty(name);
- }
-
- @Override
- public GcsPath next() {
- int i = name.indexOf('/');
- String component;
- if (i >= 0) {
- component = name.substring(0, i);
- name = name.substring(i + 1);
- } else {
- component = name;
- name = null;
- }
- if (fullPath) {
- fullPath = false;
- return new GcsPath(fs, component, "");
- } else {
- // Relative paths have no bucket.
- return new GcsPath(fs, "", component);
- }
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
- }
- }
-
- @Override
- public int compareTo(Path other) {
- if (!(other instanceof GcsPath)) {
- throw new ClassCastException();
- }
-
- GcsPath path = (GcsPath) other;
- int b = bucket.compareTo(path.bucket);
- if (b != 0) {
- return b;
- }
-
- // Compare a component at a time, so that the separator char doesn't
- // get compared against component contents. Eg, "a/b" < "a-1/b".
- Iterator<Path> left = iterator();
- Iterator<Path> right = path.iterator();
-
- while (left.hasNext() && right.hasNext()) {
- String leftStr = left.next().toString();
- String rightStr = right.next().toString();
- int c = leftStr.compareTo(rightStr);
- if (c != 0) {
- return c;
- }
- }
-
- if (!left.hasNext() && !right.hasNext()) {
- return 0;
- } else {
- return left.hasNext() ? 1 : -1;
- }
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- GcsPath paths = (GcsPath) o;
- return bucket.equals(paths.bucket) && object.equals(paths.object);
- }
-
- @Override
- public int hashCode() {
- int result = bucket.hashCode();
- result = 31 * result + object.hashCode();
- return result;
- }
-
- @Override
- public String toString() {
- if (!isAbsolute()) {
- return object;
- }
- StringBuilder sb = new StringBuilder();
- sb.append(SCHEME)
- .append("://");
- if (!bucket.isEmpty()) {
- sb.append(bucket)
- .append('/');
- }
- sb.append(object);
- return sb.toString();
- }
-
- // TODO: Consider using resource names for all GCS paths used by the SDK.
- public String toResourceName() {
- StringBuilder sb = new StringBuilder();
- sb.append("storage.googleapis.com/");
- if (!bucket.isEmpty()) {
- sb.append(bucket).append('/');
- }
- sb.append(object);
- return sb.toString();
- }
-
- @Override
- public URI toUri() {
- try {
- return new URI(SCHEME, "//" + bucketAndObject(), null);
- } catch (URISyntaxException e) {
- throw new RuntimeException("Unable to create URI for GCS path " + this, e); // keep cause
- }
- }
-
- private String bucketAndObject() {
- if (bucket.isEmpty()) {
- return object;
- } else {
- return bucket + "/" + object;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/gcsfs/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/gcsfs/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/gcsfs/package-info.java
deleted file mode 100644
index 2f57938..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/gcsfs/package-info.java
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/** Defines utilities used to interact with Google Cloud Storage. **/
-package com.google.cloud.dataflow.sdk.util.gcsfs;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/package-info.java
deleted file mode 100644
index c92adab..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/package-info.java
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/** Defines utilities used by the Dataflow SDK. **/
-package com.google.cloud.dataflow.sdk.util;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/AccumulatorCombiningState.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/AccumulatorCombiningState.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/AccumulatorCombiningState.java
deleted file mode 100644
index 0d78b13..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/AccumulatorCombiningState.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-
-/**
- * State for a single value that is managed by a {@link CombineFn}. This is an internal extension
- * to {@link CombiningState} that includes the {@code AccumT} type.
- *
- * @param <InputT> the type of values added to the state
- * @param <AccumT> the type of accumulator
- * @param <OutputT> the type of value extracted from the state
- */
-public interface AccumulatorCombiningState<InputT, AccumT, OutputT>
- extends CombiningState<InputT, OutputT> {
-
- /**
- * Read the merged accumulator for this combining value. It is implied that reading the
- * state involves reading the accumulator, so {@link #readLater} is sufficient to prefetch for
- * this.
- */
- AccumT getAccum();
-
- /**
- * Add an accumulator to this combining value. Depending on implementation this may immediately
- * merge it with the previous accumulator, or may buffer this accumulator for a future merge.
- */
- void addAccum(AccumT accum);
-
- /**
- * Merge the given accumulators according to the underlying combiner.
- */
- AccumT mergeAccumulators(Iterable<AccumT> accumulators);
-
- @Override
- AccumulatorCombiningState<InputT, AccumT, OutputT> readLater();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/BagState.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/BagState.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/BagState.java
deleted file mode 100644
index 363e480..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/BagState.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-/**
- * State containing a bag of values. Items can be added to the bag and the contents read out.
- *
- * @param <T> The type of elements in the bag.
- */
-public interface BagState<T> extends CombiningState<T, Iterable<T>> {
- @Override
- BagState<T> readLater();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/CombiningState.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/CombiningState.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/CombiningState.java
deleted file mode 100644
index 673bebb..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/CombiningState.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-
-/**
- * State that combines multiple {@code InputT} values using a {@link CombineFn} to produce a single
- * {@code OutputT} value.
- *
- * @param <InputT> the type of values added to the state
- * @param <OutputT> the type of value extracted from the state
- */
-public interface CombiningState<InputT, OutputT> extends ReadableState<OutputT>, State {
- /**
- * Add a value to the buffer.
- */
- void add(InputT value);
-
- /**
- * Return true if this state is empty.
- */
- ReadableState<Boolean> isEmpty();
-
- @Override
- CombiningState<InputT, OutputT> readLater();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/CopyOnAccessInMemoryStateInternals.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/CopyOnAccessInMemoryStateInternals.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/CopyOnAccessInMemoryStateInternals.java
deleted file mode 100644
index 3683b74..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/CopyOnAccessInMemoryStateInternals.java
+++ /dev/null
@@ -1,454 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-import static com.google.common.base.Preconditions.checkState;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.transforms.Combine.KeyedCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.KeyedCombineFnWithContext;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.OutputTimeFn;
-import com.google.cloud.dataflow.sdk.util.CombineFnUtil;
-import com.google.cloud.dataflow.sdk.util.state.InMemoryStateInternals.InMemoryState;
-import com.google.cloud.dataflow.sdk.util.state.StateTag.StateBinder;
-import com.google.common.base.Optional;
-import com.google.common.collect.Iterables;
-
-import org.joda.time.Instant;
-
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.Map;
-
-import javax.annotation.Nullable;
-
-/**
- * {@link StateInternals} built on top of an underlying {@link StateTable} that contains instances
- * of {@link InMemoryState}. Whenever state that exists in the underlying {@link StateTable} is
- * accessed, an independent copy will be created within this table.
- */
-public class CopyOnAccessInMemoryStateInternals<K> implements StateInternals<K> {
- private final K key;
- private final CopyOnAccessInMemoryStateTable<K> table;
-
- /**
- * Creates a new {@link CopyOnAccessInMemoryStateInternals} with the underlying (possibly null)
- * StateInternals.
- */
- public static <K> CopyOnAccessInMemoryStateInternals<K> withUnderlying(
- K key, @Nullable CopyOnAccessInMemoryStateInternals<K> underlying) {
- return new CopyOnAccessInMemoryStateInternals<K>(key, underlying);
- }
-
- private CopyOnAccessInMemoryStateInternals(
- K key, CopyOnAccessInMemoryStateInternals<K> underlying) {
- this.key = key;
- table =
- new CopyOnAccessInMemoryStateTable<K>(key, underlying == null ? null : underlying.table);
- }
-
- /**
- * Ensures this {@link CopyOnAccessInMemoryStateInternals} is complete. Other copies of state for
- * the same Step and Key may be discarded after invoking this method.
- *
- * <p>For each {@link StateNamespace}, for each {@link StateTag address} in that namespace that
- * has not been bound in this {@link CopyOnAccessInMemoryStateInternals}, put a reference to that
- * state within this {@link StateInternals}.
- *
- * <p>Additionally, stores the {@link WatermarkHoldState} with the earliest time bound in the
- * state table after the commit is completed, enabling calls to
- * {@link #getEarliestWatermarkHold()}.
- *
- * @return this {@link CopyOnAccessInMemoryStateInternals}
- */
- public CopyOnAccessInMemoryStateInternals<K> commit() {
- table.commit();
- return this;
- }
-
- /**
- * Gets the earliest Watermark Hold present in this table.
- *
- * <p>Must be called after this state has been committed. Will throw an
- * {@link IllegalStateException} if the state has not been committed.
- */
- public Instant getEarliestWatermarkHold() {
- // After commit, the watermark hold is always present, but may be
- // BoundedWindow#TIMESTAMP_MAX_VALUE if there is no hold set.
- checkState(
- table.earliestWatermarkHold.isPresent(),
- "Can't get the earliest watermark hold in a %s before it is committed",
- getClass().getSimpleName());
- return table.earliestWatermarkHold.get();
- }
-
- @Override
- public <T extends State> T state(StateNamespace namespace, StateTag<? super K, T> address) {
- return state(namespace, address, StateContexts.nullContext());
- }
-
- @Override
- public <T extends State> T state(
- StateNamespace namespace, StateTag<? super K, T> address, StateContext<?> c) {
- return table.get(namespace, address, c);
- }
-
- @Override
- public K getKey() {
- return key;
- }
-
- public boolean isEmpty() {
- return Iterables.isEmpty(table.values());
- }
-
- /**
- * A {@link StateTable} that, when a value is retrieved with
- * {@link StateTable#get(StateNamespace, StateTag)}, first attempts to obtain a copy of existing
- * {@link State} from an underlying {@link StateTable}.
- */
- private static class CopyOnAccessInMemoryStateTable<K> extends StateTable<K> {
- private final K key;
- private Optional<StateTable<K>> underlying;
-
- /**
- * The StateBinderFactory currently in use by this {@link CopyOnAccessInMemoryStateTable}.
- *
- * <p>There are three {@link StateBinderFactory} implementations used by the {@link
- * CopyOnAccessInMemoryStateTable}.
- * <ul>
- * <li>The default {@link StateBinderFactory} is a {@link CopyOnBindBinderFactory}, allowing
- * the table to copy any existing {@link State} values to this {@link StateTable} from the
- * underlying table when accessed, at which point mutations will not be visible to the
- * underlying table - effectively a "Copy by Value" binder.</li>
- * <li>During the execution of the {@link #commit()} method, this is a
- * {@link ReadThroughBinderFactory}, which copies the references to the existing
- * {@link State} objects to this {@link StateTable}.</li>
- * <li>After the execution of the {@link #commit()} method, this is an
- * instance of {@link InMemoryStateBinderFactory}, which constructs new instances of state
- * when a {@link StateTag} is bound.</li>
- * </ul>
- */
- private StateBinderFactory<K> binderFactory;
-
- /**
- * The earliest watermark hold in this table.
- */
- private Optional<Instant> earliestWatermarkHold;
-
- public CopyOnAccessInMemoryStateTable(K key, StateTable<K> underlying) {
- this.key = key;
- this.underlying = Optional.fromNullable(underlying);
- binderFactory = new CopyOnBindBinderFactory<>(key, this.underlying);
- earliestWatermarkHold = Optional.absent();
- }
-
- /**
- * Copies all values in the underlying table to this table, then discards the underlying table.
- *
- * <p>If there is an underlying table, this replaces the existing
- * {@link CopyOnBindBinderFactory} with a {@link ReadThroughBinderFactory}, then reads all of
- * the values in the existing table, binding the state values to this table. The old StateTable
- * should be discarded after the call to {@link #commit()}.
- *
- * <p>After copying all of the existing values, replace the binder factory with an instance of
- * {@link InMemoryStateBinderFactory} to construct new values, since all existing values
- * are bound in this {@link StateTable table} and this table represents the canonical state.
- */
- private void commit() {
- Instant earliestHold = getEarliestWatermarkHold();
- if (underlying.isPresent()) {
- ReadThroughBinderFactory<K> readThroughBinder =
- new ReadThroughBinderFactory<>(underlying.get());
- binderFactory = readThroughBinder;
- Instant earliestUnderlyingHold = readThroughBinder.readThroughAndGetEarliestHold(this);
- if (earliestUnderlyingHold.isBefore(earliestHold)) {
- earliestHold = earliestUnderlyingHold;
- }
- }
- earliestWatermarkHold = Optional.of(earliestHold);
- clearEmpty();
- binderFactory = new InMemoryStateBinderFactory<>(key);
- underlying = Optional.absent();
- }
-
- /**
- * Get the earliest watermark hold in this table. Ignores the contents of any underlying table.
- */
- private Instant getEarliestWatermarkHold() {
- Instant earliest = BoundedWindow.TIMESTAMP_MAX_VALUE;
- for (State existingState : this.values()) {
- if (existingState instanceof WatermarkHoldState) {
- Instant hold = ((WatermarkHoldState<?>) existingState).read();
- if (hold != null && hold.isBefore(earliest)) {
- earliest = hold;
- }
- }
- }
- return earliest;
- }
-
- /**
- * Clear all empty {@link StateNamespace StateNamespaces} from this table. If all states are
- * empty, clear the entire table.
- *
- * <p>Because {@link InMemoryState} is not removed from the {@link StateTable} after it is
- * cleared, in case contents are modified after being cleared, the table must be explicitly
- * checked to ensure that it contains state and removed if not (otherwise we may never use
- * the table again).
- */
- private void clearEmpty() {
- Collection<StateNamespace> emptyNamespaces = new HashSet<>(this.getNamespacesInUse());
- for (StateNamespace namespace : this.getNamespacesInUse()) {
- for (State existingState : this.getTagsInUse(namespace).values()) {
- if (!((InMemoryState<?>) existingState).isCleared()) {
- emptyNamespaces.remove(namespace);
- break;
- }
- }
- }
- for (StateNamespace empty : emptyNamespaces) {
- this.clearNamespace(empty);
- }
- }
-
- @Override
- protected StateBinder<K> binderForNamespace(final StateNamespace namespace, StateContext<?> c) {
- return binderFactory.forNamespace(namespace, c);
- }
-
- private static interface StateBinderFactory<K> {
- StateBinder<K> forNamespace(StateNamespace namespace, StateContext<?> c);
- }
-
- /**
- * {@link StateBinderFactory} that creates a copy of any existing state when the state is bound.
- */
- private static class CopyOnBindBinderFactory<K> implements StateBinderFactory<K> {
- private final K key;
- private final Optional<StateTable<K>> underlying;
-
- public CopyOnBindBinderFactory(K key, Optional<StateTable<K>> underlying) {
- this.key = key;
- this.underlying = underlying;
- }
-
- private boolean containedInUnderlying(StateNamespace namespace, StateTag<? super K, ?> tag) {
- return underlying.isPresent() && underlying.get().isNamespaceInUse(namespace)
- && underlying.get().getTagsInUse(namespace).containsKey(tag);
- }
-
- @Override
- public StateBinder<K> forNamespace(final StateNamespace namespace, final StateContext<?> c) {
- return new StateBinder<K>() {
- @Override
- public <W extends BoundedWindow> WatermarkHoldState<W> bindWatermark(
- StateTag<? super K, WatermarkHoldState<W>> address,
- OutputTimeFn<? super W> outputTimeFn) {
- if (containedInUnderlying(namespace, address)) {
- @SuppressWarnings("unchecked")
- InMemoryState<? extends WatermarkHoldState<W>> existingState =
- (InMemoryStateInternals.InMemoryState<? extends WatermarkHoldState<W>>)
- underlying.get().get(namespace, address, c);
- return existingState.copy();
- } else {
- return new InMemoryStateInternals.InMemoryWatermarkHold<>(
- outputTimeFn);
- }
- }
-
- @Override
- public <T> ValueState<T> bindValue(
- StateTag<? super K, ValueState<T>> address, Coder<T> coder) {
- if (containedInUnderlying(namespace, address)) {
- @SuppressWarnings("unchecked")
- InMemoryState<? extends ValueState<T>> existingState =
- (InMemoryStateInternals.InMemoryState<? extends ValueState<T>>)
- underlying.get().get(namespace, address, c);
- return existingState.copy();
- } else {
- return new InMemoryStateInternals.InMemoryValue<>();
- }
- }
-
- @Override
- public <InputT, AccumT, OutputT> AccumulatorCombiningState<InputT, AccumT, OutputT>
- bindCombiningValue(
- StateTag<? super K, AccumulatorCombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder, CombineFn<InputT, AccumT, OutputT> combineFn) {
- if (containedInUnderlying(namespace, address)) {
- @SuppressWarnings("unchecked")
- InMemoryState<? extends AccumulatorCombiningState<InputT, AccumT, OutputT>>
- existingState = (
- InMemoryStateInternals
- .InMemoryState<? extends AccumulatorCombiningState<InputT, AccumT,
- OutputT>>) underlying.get().get(namespace, address, c);
- return existingState.copy();
- } else {
- return new InMemoryStateInternals.InMemoryCombiningValue<>(
- key, combineFn.asKeyedFn());
- }
- }
-
- @Override
- public <T> BagState<T> bindBag(
- StateTag<? super K, BagState<T>> address, Coder<T> elemCoder) {
- if (containedInUnderlying(namespace, address)) {
- @SuppressWarnings("unchecked")
- InMemoryState<? extends BagState<T>> existingState =
- (InMemoryStateInternals.InMemoryState<? extends BagState<T>>)
- underlying.get().get(namespace, address, c);
- return existingState.copy();
- } else {
- return new InMemoryStateInternals.InMemoryBag<>();
- }
- }
-
- @Override
- public <InputT, AccumT, OutputT> AccumulatorCombiningState<InputT, AccumT, OutputT>
- bindKeyedCombiningValue(
- StateTag<? super K, AccumulatorCombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn) {
- if (containedInUnderlying(namespace, address)) {
- @SuppressWarnings("unchecked")
- InMemoryState<? extends AccumulatorCombiningState<InputT, AccumT, OutputT>>
- existingState = (
- InMemoryStateInternals
- .InMemoryState<? extends AccumulatorCombiningState<InputT, AccumT,
- OutputT>>) underlying.get().get(namespace, address, c);
- return existingState.copy();
- } else {
- return new InMemoryStateInternals.InMemoryCombiningValue<>(key, combineFn);
- }
- }
-
- @Override
- public <InputT, AccumT, OutputT> AccumulatorCombiningState<InputT, AccumT, OutputT>
- bindKeyedCombiningValueWithContext(
- StateTag<? super K, AccumulatorCombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- KeyedCombineFnWithContext<? super K, InputT, AccumT, OutputT> combineFn) {
- return bindKeyedCombiningValue(
- address, accumCoder, CombineFnUtil.bindContext(combineFn, c));
- }
- };
- }
- }
-
- /**
- * {@link StateBinderFactory} that reads directly from the underlying table. Used during calls
- * to {@link CopyOnAccessInMemoryStateTable#commit()} to read all values from
- * the underlying table.
- */
- private static class ReadThroughBinderFactory<K> implements StateBinderFactory<K> {
- private final StateTable<K> underlying;
-
- public ReadThroughBinderFactory(StateTable<K> underlying) {
- this.underlying = underlying;
- }
-
- public Instant readThroughAndGetEarliestHold(StateTable<K> readTo) {
- Instant earliestHold = BoundedWindow.TIMESTAMP_MAX_VALUE;
- for (StateNamespace namespace : underlying.getNamespacesInUse()) {
- for (Map.Entry<StateTag<? super K, ?>, ? extends State> existingState :
- underlying.getTagsInUse(namespace).entrySet()) {
- if (!((InMemoryState<?>) existingState.getValue()).isCleared()) {
- // Only read through non-cleared values to ensure that completed windows are
- // eventually discarded, and remember the earliest watermark hold from among those
- // values.
- State state =
- readTo.get(namespace, existingState.getKey(), StateContexts.nullContext());
- if (state instanceof WatermarkHoldState) {
- Instant hold = ((WatermarkHoldState<?>) state).read();
- if (hold != null && hold.isBefore(earliestHold)) {
- earliestHold = hold;
- }
- }
- }
- }
- }
- return earliestHold;
- }
-
- @Override
- public StateBinder<K> forNamespace(final StateNamespace namespace, final StateContext<?> c) {
- return new StateBinder<K>() {
- @Override
- public <W extends BoundedWindow> WatermarkHoldState<W> bindWatermark(
- StateTag<? super K, WatermarkHoldState<W>> address,
- OutputTimeFn<? super W> outputTimeFn) {
- return underlying.get(namespace, address, c);
- }
-
- @Override
- public <T> ValueState<T> bindValue(
- StateTag<? super K, ValueState<T>> address, Coder<T> coder) {
- return underlying.get(namespace, address, c);
- }
-
- @Override
- public <InputT, AccumT, OutputT> AccumulatorCombiningState<InputT, AccumT, OutputT>
- bindCombiningValue(
- StateTag<? super K, AccumulatorCombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder, CombineFn<InputT, AccumT, OutputT> combineFn) {
- return underlying.get(namespace, address, c);
- }
-
- @Override
- public <T> BagState<T> bindBag(
- StateTag<? super K, BagState<T>> address, Coder<T> elemCoder) {
- return underlying.get(namespace, address, c);
- }
-
- @Override
- public <InputT, AccumT, OutputT> AccumulatorCombiningState<InputT, AccumT, OutputT>
- bindKeyedCombiningValue(
- StateTag<? super K, AccumulatorCombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn) {
- return underlying.get(namespace, address, c);
- }
-
- @Override
- public <InputT, AccumT, OutputT> AccumulatorCombiningState<InputT, AccumT, OutputT>
- bindKeyedCombiningValueWithContext(
- StateTag<? super K, AccumulatorCombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- KeyedCombineFnWithContext<? super K, InputT, AccumT, OutputT> combineFn) {
- return bindKeyedCombiningValue(
- address, accumCoder, CombineFnUtil.bindContext(combineFn, c));
- }
- };
- }
- }
-
- private static class InMemoryStateBinderFactory<K> implements StateBinderFactory<K> {
- private final K key;
-
- public InMemoryStateBinderFactory(K key) {
- this.key = key;
- }
-
- @Override
- public StateBinder<K> forNamespace(StateNamespace namespace, StateContext<?> c) {
- return new InMemoryStateInternals.InMemoryStateBinder<>(key, c);
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/InMemoryStateInternals.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/InMemoryStateInternals.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/InMemoryStateInternals.java
deleted file mode 100644
index 8404801..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/InMemoryStateInternals.java
+++ /dev/null
@@ -1,414 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.transforms.Combine.KeyedCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.KeyedCombineFnWithContext;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.OutputTimeFn;
-import com.google.cloud.dataflow.sdk.util.CombineFnUtil;
-import com.google.cloud.dataflow.sdk.util.state.StateTag.StateBinder;
-
-import org.joda.time.Instant;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Objects;
-
-import javax.annotation.Nullable;
-
-/**
- * In-memory implementation of {@link StateInternals}. Used in {@code BatchModeExecutionContext}
- * and for running tests that need state.
- */
-@Experimental(Kind.STATE)
-public class InMemoryStateInternals<K> implements StateInternals<K> {
-
- public static <K> InMemoryStateInternals<K> forKey(K key) {
- return new InMemoryStateInternals<>(key);
- }
-
- private final K key;
-
- protected InMemoryStateInternals(K key) {
- this.key = key;
- }
-
- @Override
- public K getKey() {
- return key;
- }
-
- interface InMemoryState<T extends InMemoryState<T>> {
- boolean isCleared();
- T copy();
- }
-
- protected final StateTable<K> inMemoryState = new StateTable<K>() {
- @Override
- protected StateBinder<K> binderForNamespace(StateNamespace namespace, StateContext<?> c) {
- return new InMemoryStateBinder<K>(key, c);
- }
- };
-
- public void clear() {
- inMemoryState.clear();
- }
-
- /**
- * Return true if the given state is empty. This is used by the test framework to make sure
- * that the state has been properly cleaned up.
- */
- protected boolean isEmptyForTesting(State state) {
- return ((InMemoryState<?>) state).isCleared();
- }
-
- @Override
- public <T extends State> T state(StateNamespace namespace, StateTag<? super K, T> address) {
- return inMemoryState.get(namespace, address, StateContexts.nullContext());
- }
-
- @Override
- public <T extends State> T state(
- StateNamespace namespace, StateTag<? super K, T> address, final StateContext<?> c) {
- return inMemoryState.get(namespace, address, c);
- }
-
- /**
- * A {@link StateBinder} that returns In Memory {@link State} objects.
- */
- static class InMemoryStateBinder<K> implements StateBinder<K> {
- private final K key;
- private final StateContext<?> c;
-
- InMemoryStateBinder(K key, StateContext<?> c) {
- this.key = key;
- this.c = c;
- }
-
- @Override
- public <T> ValueState<T> bindValue(
- StateTag<? super K, ValueState<T>> address, Coder<T> coder) {
- return new InMemoryValue<T>();
- }
-
- @Override
- public <T> BagState<T> bindBag(
- final StateTag<? super K, BagState<T>> address, Coder<T> elemCoder) {
- return new InMemoryBag<T>();
- }
-
- @Override
- public <InputT, AccumT, OutputT> AccumulatorCombiningState<InputT, AccumT, OutputT>
- bindCombiningValue(
- StateTag<? super K, AccumulatorCombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- final CombineFn<InputT, AccumT, OutputT> combineFn) {
- return new InMemoryCombiningValue<K, InputT, AccumT, OutputT>(key, combineFn.<K>asKeyedFn());
- }
-
- @Override
- public <W extends BoundedWindow> WatermarkHoldState<W> bindWatermark(
- StateTag<? super K, WatermarkHoldState<W>> address,
- OutputTimeFn<? super W> outputTimeFn) {
- return new InMemoryWatermarkHold<W>(outputTimeFn);
- }
-
- @Override
- public <InputT, AccumT, OutputT> AccumulatorCombiningState<InputT, AccumT, OutputT>
- bindKeyedCombiningValue(
- StateTag<? super K, AccumulatorCombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn) {
- return new InMemoryCombiningValue<K, InputT, AccumT, OutputT>(key, combineFn);
- }
-
- @Override
- public <InputT, AccumT, OutputT> AccumulatorCombiningState<InputT, AccumT, OutputT>
- bindKeyedCombiningValueWithContext(
- StateTag<? super K, AccumulatorCombiningState<InputT, AccumT, OutputT>> address,
- Coder<AccumT> accumCoder,
- KeyedCombineFnWithContext<? super K, InputT, AccumT, OutputT> combineFn) {
- return bindKeyedCombiningValue(address, accumCoder, CombineFnUtil.bindContext(combineFn, c));
- }
- }
-
- static final class InMemoryValue<T> implements ValueState<T>, InMemoryState<InMemoryValue<T>> {
- private boolean isCleared = true;
- private T value = null;
-
- @Override
- public void clear() {
- // Even though we're clearing we can't remove this from the in-memory state map, since
- // other users may already have a handle on this Value.
- value = null;
- isCleared = true;
- }
-
- @Override
- public InMemoryValue<T> readLater() {
- return this;
- }
-
- @Override
- public T read() {
- return value;
- }
-
- @Override
- public void write(T input) {
- isCleared = false;
- this.value = input;
- }
-
- @Override
- public InMemoryValue<T> copy() {
- InMemoryValue<T> that = new InMemoryValue<>();
- if (!this.isCleared) {
- that.isCleared = this.isCleared;
- that.value = this.value;
- }
- return that;
- }
-
- @Override
- public boolean isCleared() {
- return isCleared;
- }
- }
-
- static final class InMemoryWatermarkHold<W extends BoundedWindow>
- implements WatermarkHoldState<W>, InMemoryState<InMemoryWatermarkHold<W>> {
-
- private final OutputTimeFn<? super W> outputTimeFn;
-
- @Nullable
- private Instant combinedHold = null;
-
- public InMemoryWatermarkHold(OutputTimeFn<? super W> outputTimeFn) {
- this.outputTimeFn = outputTimeFn;
- }
-
- @Override
- public InMemoryWatermarkHold<W> readLater() {
- return this;
- }
-
- @Override
- public void clear() {
- // Even though we're clearing we can't remove this from the in-memory state map, since
- // other users may already have a handle on this InMemoryWatermarkHold.
- combinedHold = null;
- }
-
- @Override
- public Instant read() {
- return combinedHold;
- }
-
- @Override
- public void add(Instant outputTime) {
- combinedHold = combinedHold == null ? outputTime
- : outputTimeFn.combine(combinedHold, outputTime);
- }
-
- @Override
- public boolean isCleared() {
- return combinedHold == null;
- }
-
- @Override
- public ReadableState<Boolean> isEmpty() {
- return new ReadableState<Boolean>() {
- @Override
- public ReadableState<Boolean> readLater() {
- return this;
- }
- @Override
- public Boolean read() {
- return combinedHold == null;
- }
- };
- }
-
- @Override
- public OutputTimeFn<? super W> getOutputTimeFn() {
- return outputTimeFn;
- }
-
- @Override
- public String toString() {
- return Objects.toString(combinedHold);
- }
-
- @Override
- public InMemoryWatermarkHold<W> copy() {
- InMemoryWatermarkHold<W> that =
- new InMemoryWatermarkHold<>(outputTimeFn);
- that.combinedHold = this.combinedHold;
- return that;
- }
- }
-
- static final class InMemoryCombiningValue<K, InputT, AccumT, OutputT>
- implements AccumulatorCombiningState<InputT, AccumT, OutputT>,
- InMemoryState<InMemoryCombiningValue<K, InputT, AccumT, OutputT>> {
- private final K key;
- private boolean isCleared = true;
- private final KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn;
- private AccumT accum;
-
- InMemoryCombiningValue(
- K key, KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn) {
- this.key = key;
- this.combineFn = combineFn;
- accum = combineFn.createAccumulator(key);
- }
-
- @Override
- public InMemoryCombiningValue<K, InputT, AccumT, OutputT> readLater() {
- return this;
- }
-
- @Override
- public void clear() {
- // Even though we're clearing we can't remove this from the in-memory state map, since
- // other users may already have a handle on this CombiningValue.
- accum = combineFn.createAccumulator(key);
- isCleared = true;
- }
-
- @Override
- public OutputT read() {
- return combineFn.extractOutput(key, accum);
- }
-
- @Override
- public void add(InputT input) {
- isCleared = false;
- accum = combineFn.addInput(key, accum, input);
- }
-
- @Override
- public AccumT getAccum() {
- return accum;
- }
-
- @Override
- public ReadableState<Boolean> isEmpty() {
- return new ReadableState<Boolean>() {
- @Override
- public ReadableState<Boolean> readLater() {
- return this;
- }
- @Override
- public Boolean read() {
- return isCleared;
- }
- };
- }
-
- @Override
- public void addAccum(AccumT accum) {
- isCleared = false;
- this.accum = combineFn.mergeAccumulators(key, Arrays.asList(this.accum, accum));
- }
-
- @Override
- public AccumT mergeAccumulators(Iterable<AccumT> accumulators) {
- return combineFn.mergeAccumulators(key, accumulators);
- }
-
- @Override
- public boolean isCleared() {
- return isCleared;
- }
-
- @Override
- public InMemoryCombiningValue<K, InputT, AccumT, OutputT> copy() {
- InMemoryCombiningValue<K, InputT, AccumT, OutputT> that =
- new InMemoryCombiningValue<>(key, combineFn);
- if (!this.isCleared) {
- that.isCleared = this.isCleared;
- that.addAccum(accum);
- }
- return that;
- }
- }
-
- static final class InMemoryBag<T> implements BagState<T>, InMemoryState<InMemoryBag<T>> {
- private List<T> contents = new ArrayList<>();
-
- @Override
- public void clear() {
- // Even though we're clearing we can't remove this from the in-memory state map, since
- // other users may already have a handle on this Bag.
- // The result of get/read below must be stable for the lifetime of the bundle within which it
- // was generated. In batch and direct runners the bundle lifetime can be
- // greater than the window lifetime, in which case this method can be called while
- // the result is still in use. We protect against this by hot-swapping instead of
- // clearing the contents.
- contents = new ArrayList<>();
- }
-
- @Override
- public InMemoryBag<T> readLater() {
- return this;
- }
-
- @Override
- public Iterable<T> read() {
- return contents;
- }
-
- @Override
- public void add(T input) {
- contents.add(input);
- }
-
- @Override
- public boolean isCleared() {
- return contents.isEmpty();
- }
-
- @Override
- public ReadableState<Boolean> isEmpty() {
- return new ReadableState<Boolean>() {
- @Override
- public ReadableState<Boolean> readLater() {
- return this;
- }
-
- @Override
- public Boolean read() {
- return contents.isEmpty();
- }
- };
- }
-
- @Override
- public InMemoryBag<T> copy() {
- InMemoryBag<T> that = new InMemoryBag<>();
- that.contents.addAll(this.contents);
- return that;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/MergingStateAccessor.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/MergingStateAccessor.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/MergingStateAccessor.java
deleted file mode 100644
index 40211d7..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/MergingStateAccessor.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util.state;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-
-import java.util.Map;
-
-/**
- * Interface for accessing persistent state while windows are merging.
- *
- * <p>For internal use only.
- */
-@Experimental(Kind.STATE)
-public interface MergingStateAccessor<K, W extends BoundedWindow>
- extends StateAccessor<K> {
- /**
- * Analogous to {@link #access}, but returned as a map from each window which is
- * about to be merged to the corresponding state. Only includes windows which
- * are known to have state.
- */
- <StateT extends State> Map<W, StateT> accessInEachMergingWindow(
- StateTag<? super K, StateT> address);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/ReadableState.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/ReadableState.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/ReadableState.java
deleted file mode 100644
index 8f690a3..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/ReadableState.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
-
-/**
- * A {@code ReadableState} is produced by the read methods on all {@link State} objects.
- * Calling {@link #read} returns the associated value.
- *
- * <p>This class is similar to {@link java.util.concurrent.Future}, but each invocation of
- * {@link #read} need not return the same value.
- *
- * <p>Getting the {@code ReadableState} from a read method indicates the desire to eventually
- * read a value. Depending on the runner this may or may not immediately start the read.
- *
- * @param <T> The type of value returned by {@link #read}.
- */
-@Experimental(Kind.STATE)
-public interface ReadableState<T> {
- /**
- * Read the current value, blocking until it is available.
- *
- * <p>If there will be many calls to {@link #read} for different state in short succession,
- * you should first call {@link #readLater} for all of them so the reads can potentially be
- * batched (depending on the underlying {@link StateInternals} implementation).
- */
- T read();
-
- /**
- * Indicate that the value will be read later.
- *
- * <p>This allows a {@link StateInternals} implementation to start an asynchronous prefetch or
- * to include this state in the next batch of reads.
- *
- * @return this for convenient chaining
- */
- ReadableState<T> readLater();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/State.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/State.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/State.java
deleted file mode 100644
index 0cef786..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/State.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-/**
- * Base interface for all state locations.
- *
- * <p>Specific types of state add appropriate accessors for reading and writing values, see
- * {@link ValueState}, {@link BagState}, and {@link CombiningState}.
- */
-public interface State {
-
- /**
- * Clear out the state location.
- */
- void clear();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateAccessor.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateAccessor.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateAccessor.java
deleted file mode 100644
index 6cfbecf..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateAccessor.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util.state;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
-
-/**
- * Interface for accessing a {@link StateTag} in the current context.
- *
- * <p>For internal use only.
- */
-@Experimental(Kind.STATE)
-public interface StateAccessor<K> {
- /**
- * Access the storage for the given {@code address} in the current window.
- *
- * <p>Never accounts for merged windows. When windows are merged, any state accessed via
- * this method must be eagerly combined and written into the result window.
- */
- <StateT extends State> StateT access(StateTag<? super K, StateT> address);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateContext.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateContext.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateContext.java
deleted file mode 100644
index 96387d8..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/state/StateContext.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (C) 2016 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util.state;
-
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-
-/**
- * Information accessible to the state API.
- */
-public interface StateContext<W extends BoundedWindow> {
- /**
- * Returns the {@code PipelineOptions} specified with the
- * {@link com.google.cloud.dataflow.sdk.runners.PipelineRunner}.
- */
- public abstract PipelineOptions getPipelineOptions();
-
- /**
- * Returns the value of the side input for the corresponding state window.
- */
- public abstract <T> T sideInput(PCollectionView<T> view);
-
- /**
- * Returns the window corresponding to the state.
- */
- public abstract W window();
-}
[40/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/Write.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/Write.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/Write.java
deleted file mode 100644
index 0b78b83..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/Write.java
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.SerializableCoder;
-import com.google.cloud.dataflow.sdk.io.Sink.WriteOperation;
-import com.google.cloud.dataflow.sdk.io.Sink.Writer;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.transforms.View;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.PDone;
-
-import org.joda.time.Instant;
-
-import java.util.UUID;
-
-/**
- * A {@link PTransform} that writes to a {@link Sink}. A write begins with a sequential global
- * initialization of a sink, followed by a parallel write, and ends with a sequential finalization
- * of the write. The output of a write is {@link PDone}. In the case of an empty PCollection, only
- * the global initialization and finalization will be performed.
- *
- * <p>Currently, only batch workflows can contain Write transforms.
- *
- * <p>Example usage:
- *
- * <p>{@code p.apply(Write.to(new MySink(...)));}
- */
-@Experimental(Experimental.Kind.SOURCE_SINK)
-public class Write {
- /**
- * Creates a Write transform that writes to the given Sink.
- */
- public static <T> Bound<T> to(Sink<T> sink) {
- return new Bound<>(sink);
- }
-
- /**
- * A {@link PTransform} that writes to a {@link Sink}. See {@link Write} and {@link Sink} for
- * documentation about writing to Sinks.
- */
- public static class Bound<T> extends PTransform<PCollection<T>, PDone> {
- private final Sink<T> sink;
-
- private Bound(Sink<T> sink) {
- this.sink = sink;
- }
-
- @Override
- public PDone apply(PCollection<T> input) {
- PipelineOptions options = input.getPipeline().getOptions();
- sink.validate(options);
- return createWrite(input, sink.createWriteOperation(options));
- }
-
- /**
- * Returns the {@link Sink} associated with this PTransform.
- */
- public Sink<T> getSink() {
- return sink;
- }
-
- /**
- * A write is performed as a sequence of three {@link ParDo}s.
- *
- * <p>In the first, a do-once ParDo is applied to a singleton PCollection containing the Sink's
- * {@link WriteOperation}. In this initialization ParDo, {@link WriteOperation#initialize} is
- * called. The output of this ParDo is a singleton PCollection
- * containing the WriteOperation.
- *
- * <p>This singleton collection containing the WriteOperation is then used as a side input to a
- * ParDo over the PCollection of elements to write. In this bundle-writing phase,
- * {@link WriteOperation#createWriter} is called to obtain a {@link Writer}.
- * {@link Writer#open} is called lazily when the first element of a bundle is processed,
- * {@link Writer#close} is called in {@link DoFn#finishBundle}, and {@link Writer#write} for every
- * element in the bundle. The output of this ParDo is a PCollection of <i>writer result</i>
- * objects (see {@link Sink} for a description of writer results), one for each bundle.
- *
- * <p>The final do-once ParDo uses the singleton collection of the WriteOperation as input and
- * the collection of writer results as a side-input. In this ParDo,
- * {@link WriteOperation#finalize} is called to finalize the write.
- *
- * <p>If the write of any element in the PCollection fails, {@link Writer#close} will be called
- * before the exception that caused the write to fail is propagated and the write result will be
- * discarded.
- *
- * <p>Since the {@link WriteOperation} is serialized after the initialization ParDo and
- * deserialized in the bundle-writing and finalization phases, any state change to the
- * WriteOperation object that occurs during initialization is visible in the latter phases.
- * However, the WriteOperation is not serialized after the bundle-writing phase. This is why
- * implementations should guarantee that {@link WriteOperation#createWriter} does not mutate
- * the WriteOperation.
- */
- private <WriteT> PDone createWrite(
- PCollection<T> input, WriteOperation<T, WriteT> writeOperation) {
- Pipeline p = input.getPipeline();
-
- // A coder to use for the WriteOperation.
- @SuppressWarnings("unchecked")
- Coder<WriteOperation<T, WriteT>> operationCoder =
- (Coder<WriteOperation<T, WriteT>>) SerializableCoder.of(writeOperation.getClass());
-
- // A singleton collection of the WriteOperation, to be used as input to a ParDo to initialize
- // the sink.
- PCollection<WriteOperation<T, WriteT>> operationCollection =
- p.apply(Create.<WriteOperation<T, WriteT>>of(writeOperation).withCoder(operationCoder));
-
- // Initialize the resource in a do-once ParDo on the WriteOperation.
- operationCollection = operationCollection
- .apply("Initialize", ParDo.of(
- new DoFn<WriteOperation<T, WriteT>, WriteOperation<T, WriteT>>() {
- @Override
- public void processElement(ProcessContext c) throws Exception {
- WriteOperation<T, WriteT> writeOperation = c.element();
- writeOperation.initialize(c.getPipelineOptions());
- // The WriteOperation is also the output of this ParDo, so it can have mutable
- // state.
- c.output(writeOperation);
- }
- }))
- .setCoder(operationCoder);
-
- // Create a view of the WriteOperation to be used as a sideInput to the parallel write phase.
- final PCollectionView<WriteOperation<T, WriteT>> writeOperationView =
- operationCollection.apply(View.<WriteOperation<T, WriteT>>asSingleton());
-
- // Perform the per-bundle writes as a ParDo on the input PCollection (with the WriteOperation
- // as a side input) and collect the results of the writes in a PCollection.
- // There is a dependency between this ParDo and the first (the WriteOperation PCollection
- // as a side input), so this will happen after the initial ParDo.
- PCollection<WriteT> results = input
- .apply("WriteBundles", ParDo.of(new DoFn<T, WriteT>() {
- // Writer that will write the records in this bundle. Lazily
- // initialized in processElement.
- private Writer<T, WriteT> writer = null;
-
- @Override
- public void processElement(ProcessContext c) throws Exception {
- // Lazily initialize the Writer
- if (writer == null) {
- WriteOperation<T, WriteT> writeOperation = c.sideInput(writeOperationView);
- writer = writeOperation.createWriter(c.getPipelineOptions());
- writer.open(UUID.randomUUID().toString());
- }
- try {
- writer.write(c.element());
- } catch (Exception e) {
- // Discard write result and close the writer.
- try {
- writer.close();
- } catch (Exception closeException) {
- // Do not mask the exception that caused the write to fail.
- }
- throw e;
- }
- }
-
- @Override
- public void finishBundle(Context c) throws Exception {
- if (writer != null) {
- WriteT result = writer.close();
- // Output the result of the write.
- c.outputWithTimestamp(result, Instant.now());
- }
- }
- }).withSideInputs(writeOperationView))
- .setCoder(writeOperation.getWriterResultCoder())
- .apply(Window.<WriteT>into(new GlobalWindows()));
-
- final PCollectionView<Iterable<WriteT>> resultsView =
- results.apply(View.<WriteT>asIterable());
-
- // Finalize the write in another do-once ParDo on the singleton collection containing the
- // WriteOperation. The results from the per-bundle writes are given as an Iterable side input.
- // The WriteOperation's state is the same as after its initialization in the first do-once
- // ParDo. There is a dependency between this ParDo and the parallel write (the writer results
- // collection as a side input), so it will happen after the parallel write.
- @SuppressWarnings("unused")
- final PCollection<Integer> done = operationCollection
- .apply("Finalize", ParDo.of(new DoFn<WriteOperation<T, WriteT>, Integer>() {
- @Override
- public void processElement(ProcessContext c) throws Exception {
- Iterable<WriteT> results = c.sideInput(resultsView);
- WriteOperation<T, WriteT> writeOperation = c.element();
- writeOperation.finalize(results, c.getPipelineOptions());
- }
- }).withSideInputs(resultsView));
- return PDone.in(input.getPipeline());
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/XmlSink.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/XmlSink.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/XmlSink.java
deleted file mode 100644
index b728c0a..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/XmlSink.java
+++ /dev/null
@@ -1,310 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
-import com.google.cloud.dataflow.sdk.io.FileBasedSink.FileBasedWriteOperation;
-import com.google.cloud.dataflow.sdk.io.FileBasedSink.FileBasedWriter;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.util.CoderUtils;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.common.base.Preconditions;
-
-import java.io.OutputStream;
-import java.nio.channels.Channels;
-import java.nio.channels.WritableByteChannel;
-
-import javax.xml.bind.JAXBContext;
-import javax.xml.bind.JAXBException;
-import javax.xml.bind.Marshaller;
-
-// CHECKSTYLE.OFF: JavadocStyle
-/**
- * A {@link Sink} that outputs records as XML-formatted elements. Writes a {@link PCollection} of
- * records from JAXB-annotated classes to a single file location.
- *
- * <p>Given a PCollection containing records of type T that can be marshalled to XML elements, this
- * Sink will produce a single file consisting of a single root element that contains all of the
- * elements in the PCollection.
- *
- * <p>XML Sinks are created with a base filename to write to, a root element name that will be used
- * for the root element of the output files, and a class to bind to an XML element. This class
- * will be used in the marshalling of records in an input PCollection to their XML representation
- * and must be able to be bound using JAXB annotations (checked at pipeline construction time).
- *
- * <p>XML Sinks can be written to using the {@link Write} transform:
- *
- * <pre>
- * p.apply(Write.to(
- * XmlSink.write().ofRecordClass(Type.class)
- * .withRootElement(root_element)
- * .toFilenamePrefix(output_filename)));
- * </pre>
- *
- * <p>For example, consider the following class with JAXB annotations:
- *
- * <pre>
- * {@literal @}XmlRootElement(name = "word_count_result")
- * {@literal @}XmlType(propOrder = {"word", "frequency"})
- * public class WordFrequency {
- * private String word;
- * private long frequency;
- *
- * public WordFrequency() { }
- *
- * public WordFrequency(String word, long frequency) {
- * this.word = word;
- * this.frequency = frequency;
- * }
- *
- * public void setWord(String word) {
- * this.word = word;
- * }
- *
- * public void setFrequency(long frequency) {
- * this.frequency = frequency;
- * }
- *
- * public long getFrequency() {
- * return frequency;
- * }
- *
- * public String getWord() {
- * return word;
- * }
- * }
- * </pre>
- *
- * <p>The following will produce XML output with a root element named "words" from a PCollection of
- * WordFrequency objects:
- * <pre>
- * p.apply(Write.to(
- * XmlSink.write().ofRecordClass(WordFrequency.class)
- * .withRootElement("words")
- * .toFilenamePrefix(output_file)));
- * </pre>
- *
- * <p>The output of which will look like:
- * <pre>
- * {@code
- * <words>
- *
- * <word_count_result>
- * <word>decreased</word>
- * <frequency>1</frequency>
- * </word_count_result>
- *
- * <word_count_result>
- * <word>War</word>
- * <frequency>4</frequency>
- * </word_count_result>
- *
- * <word_count_result>
- * <word>empress'</word>
- * <frequency>14</frequency>
- * </word_count_result>
- *
- * <word_count_result>
- * <word>stoops</word>
- * <frequency>6</frequency>
- * </word_count_result>
- *
- * ...
- * </words>
- * }</pre>
- */
-// CHECKSTYLE.ON: JavadocStyle
-@SuppressWarnings("checkstyle:javadocstyle")
-public class XmlSink {
- protected static final String XML_EXTENSION = "xml";
-
- /**
- * Returns a builder for an XmlSink. You'll need to configure the class to bind, the root
- * element name, and the output file prefix with {@link Bound#ofRecordClass}, {@link
- * Bound#withRootElement}, and {@link Bound#toFilenamePrefix}, respectively.
- */
- public static Bound<?> write() {
- return new Bound<>(null, null, null);
- }
-
- /**
- * Returns an XmlSink that writes objects as XML entities.
- *
- * <p>Output files will have the name {@literal {baseOutputFilename}-0000i-of-0000n.xml} where n
- * is the number of output bundles that the Dataflow service divides the output into.
- *
- * @param klass the class of the elements to write.
- * @param rootElementName the enclosing root element.
- * @param baseOutputFilename the output filename prefix.
- */
- public static <T> Bound<T> writeOf(
- Class<T> klass, String rootElementName, String baseOutputFilename) {
- return new Bound<>(klass, rootElementName, baseOutputFilename);
- }
-
- /**
- * A {@link FileBasedSink} that writes objects as XML elements.
- */
- public static class Bound<T> extends FileBasedSink<T> {
- final Class<T> classToBind;
- final String rootElementName;
-
- private Bound(Class<T> classToBind, String rootElementName, String baseOutputFilename) {
- super(baseOutputFilename, XML_EXTENSION);
- this.classToBind = classToBind;
- this.rootElementName = rootElementName;
- }
-
- /**
- * Returns an XmlSink that writes objects of the class specified as XML elements.
- *
- * <p>The specified class must be able to be used to create a JAXB context.
- */
- public <T> Bound<T> ofRecordClass(Class<T> classToBind) {
- return new Bound<>(classToBind, rootElementName, baseOutputFilename);
- }
-
- /**
- * Returns an XmlSink that writes to files with the given prefix.
- *
- * <p>Output files will have the name {@literal {filenamePrefix}-0000i-of-0000n.xml} where n is
- * the number of output bundles that the Dataflow service divides the output into.
- */
- public Bound<T> toFilenamePrefix(String baseOutputFilename) {
- return new Bound<>(classToBind, rootElementName, baseOutputFilename);
- }
-
- /**
- * Returns an XmlSink that writes XML files with an enclosing root element of the
- * supplied name.
- */
- public Bound<T> withRootElement(String rootElementName) {
- return new Bound<>(classToBind, rootElementName, baseOutputFilename);
- }
-
- /**
- * Validates that the root element, class to bind to a JAXB context, and filenamePrefix have
- * been set and that the class can be bound in a JAXB context.
- */
- @Override
- public void validate(PipelineOptions options) {
- Preconditions.checkNotNull(classToBind, "Missing a class to bind to a JAXB context.");
- Preconditions.checkNotNull(rootElementName, "Missing a root element name.");
- Preconditions.checkNotNull(baseOutputFilename, "Missing a filename to write to.");
- try {
- JAXBContext.newInstance(classToBind);
- } catch (JAXBException e) {
- throw new RuntimeException("Error binding classes to a JAXB Context.", e);
- }
- }
-
- /**
- * Creates an {@link XmlWriteOperation}.
- */
- @Override
- public XmlWriteOperation<T> createWriteOperation(PipelineOptions options) {
- return new XmlWriteOperation<>(this);
- }
- }
-
- /**
- * {@link Sink.WriteOperation} for XML {@link Sink}s.
- */
- protected static final class XmlWriteOperation<T> extends FileBasedWriteOperation<T> {
- public XmlWriteOperation(XmlSink.Bound<T> sink) {
- super(sink);
- }
-
- /**
- * Creates a {@link XmlWriter} with a marshaller for the type it will write.
- */
- @Override
- public XmlWriter<T> createWriter(PipelineOptions options) throws Exception {
- JAXBContext context;
- Marshaller marshaller;
- context = JAXBContext.newInstance(getSink().classToBind);
- marshaller = context.createMarshaller();
- marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, Boolean.TRUE);
- marshaller.setProperty(Marshaller.JAXB_FRAGMENT, Boolean.TRUE);
- marshaller.setProperty(Marshaller.JAXB_ENCODING, "UTF-8");
- return new XmlWriter<>(this, marshaller);
- }
-
- /**
- * Return the XmlSink.Bound for this write operation.
- */
- @Override
- public XmlSink.Bound<T> getSink() {
- return (XmlSink.Bound<T>) super.getSink();
- }
- }
-
- /**
- * A {@link Sink.Writer} that can write objects as XML elements.
- */
- protected static final class XmlWriter<T> extends FileBasedWriter<T> {
- final Marshaller marshaller;
- private OutputStream os = null;
-
- public XmlWriter(XmlWriteOperation<T> writeOperation, Marshaller marshaller) {
- super(writeOperation);
- this.marshaller = marshaller;
- }
-
- /**
- * Creates the output stream that elements will be written to.
- */
- @Override
- protected void prepareWrite(WritableByteChannel channel) throws Exception {
- os = Channels.newOutputStream(channel);
- }
-
- /**
- * Writes the root element opening tag.
- */
- @Override
- protected void writeHeader() throws Exception {
- String rootElementName = getWriteOperation().getSink().rootElementName;
- os.write(CoderUtils.encodeToByteArray(StringUtf8Coder.of(), "<" + rootElementName + ">\n"));
- }
-
- /**
- * Writes the root element closing tag.
- */
- @Override
- protected void writeFooter() throws Exception {
- String rootElementName = getWriteOperation().getSink().rootElementName;
- os.write(CoderUtils.encodeToByteArray(StringUtf8Coder.of(), "\n</" + rootElementName + ">"));
- }
-
- /**
- * Writes a value to the stream.
- */
- @Override
- public void write(T value) throws Exception {
- marshaller.marshal(value, os);
- }
-
- /**
- * Return the XmlWriteOperation this writer belongs to.
- */
- @Override
- public XmlWriteOperation<T> getWriteOperation() {
- return (XmlWriteOperation<T>) super.getWriteOperation();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/XmlSource.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/XmlSource.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/XmlSource.java
deleted file mode 100644
index 1ead391..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/XmlSource.java
+++ /dev/null
@@ -1,541 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.JAXBCoder;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
-import com.google.common.base.Preconditions;
-
-import org.codehaus.stax2.XMLInputFactory2;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.SequenceInputStream;
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.channels.Channels;
-import java.nio.channels.ReadableByteChannel;
-import java.nio.charset.StandardCharsets;
-import java.util.NoSuchElementException;
-
-import javax.xml.bind.JAXBContext;
-import javax.xml.bind.JAXBElement;
-import javax.xml.bind.JAXBException;
-import javax.xml.bind.Unmarshaller;
-import javax.xml.bind.ValidationEvent;
-import javax.xml.bind.ValidationEventHandler;
-import javax.xml.stream.FactoryConfigurationError;
-import javax.xml.stream.XMLInputFactory;
-import javax.xml.stream.XMLStreamConstants;
-import javax.xml.stream.XMLStreamException;
-import javax.xml.stream.XMLStreamReader;
-
-// CHECKSTYLE.OFF: JavadocStyle
-/**
- * A source that can be used to read XML files. This source reads one or more
- * XML files and creates a {@code PCollection} of a given type. An Dataflow read transform can be
- * created by passing an {@code XmlSource} object to {@code Read.from()}. Please note the
- * example given below.
- *
- * <p>The XML file must be of the following form, where {@code root} and {@code record} are XML
- * element names that are defined by the user:
- *
- * <pre>
- * {@code
- * <root>
- * <record> ... </record>
- * <record> ... </record>
- * <record> ... </record>
- * ...
- * <record> ... </record>
- * </root>
- * }
- * </pre>
- *
- * <p>Basically, the XML document should contain a single root element with an inner list consisting
- * entirely of record elements. The records may contain arbitrary XML content; however, that content
- * <b>must not</b> contain the start {@code <record>} or end {@code </record>} tags. This
- * restriction enables reading from large XML files in parallel from different offsets in the file.
- *
- * <p>Root and/or record elements may additionally contain an arbitrary number of XML attributes.
- * Additionally users must provide a class of a JAXB annotated Java type that can be used convert
- * records into Java objects and vice versa using JAXB marshalling/unmarshalling mechanisms. Reading
- * the source will generate a {@code PCollection} of the given JAXB annotated Java type.
- * Optionally users may provide a minimum size of a bundle that should be created for the source.
- *
- * <p>The following example shows how to read from {@link XmlSource} in a Dataflow pipeline:
- *
- * <pre>
- * {@code
- * XmlSource<String> source = XmlSource.<String>from(file.toPath().toString())
- * .withRootElement("root")
- * .withRecordElement("record")
- * .withRecordClass(Record.class);
- * PCollection<String> output = p.apply(Read.from(source));
- * }
- * </pre>
- *
- * <p>Currently, only XML files that use single-byte characters are supported. Using a file that
- * contains multi-byte characters may result in data loss or duplication.
- *
- * <p>To use {@link XmlSource}:
- * <ol>
- * <li>Explicitly declare a dependency on org.codehaus.woodstox:stax2-api</li>
- * <li>Include a compatible implementation on the classpath at run-time,
- * such as org.codehaus.woodstox:woodstox-core-asl</li>
- * </ol>
- *
- * <p>These dependencies have been declared as optional in Maven sdk/pom.xml file of
- * Google Cloud Dataflow.
- *
- * <p><h3>Permissions</h3>
- * Permission requirements depend on the
- * {@link com.google.cloud.dataflow.sdk.runners.PipelineRunner PipelineRunner} that is
- * used to execute the Dataflow job. Please refer to the documentation of corresponding
- * {@link PipelineRunner PipelineRunners} for more details.
- *
- * @param <T> Type of the objects that represent the records of the XML file. The
- * {@code PCollection} generated by this source will be of this type.
- */
-// CHECKSTYLE.ON: JavadocStyle
-public class XmlSource<T> extends FileBasedSource<T> {
-
- private static final String XML_VERSION = "1.1";
- private static final int DEFAULT_MIN_BUNDLE_SIZE = 8 * 1024;
- private final String rootElement;
- private final String recordElement;
- private final Class<T> recordClass;
-
- /**
- * Creates an XmlSource for a single XML file or a set of XML files defined by a Java "glob" file
- * pattern. Each XML file should be of the form defined in {@link XmlSource}.
- */
- public static <T> XmlSource<T> from(String fileOrPatternSpec) {
- return new XmlSource<>(fileOrPatternSpec, DEFAULT_MIN_BUNDLE_SIZE, null, null, null);
- }
-
- /**
- * Sets name of the root element of the XML document. This will be used to create a valid starting
- * root element when initiating a bundle of records created from an XML document. This is a
- * required parameter.
- */
- public XmlSource<T> withRootElement(String rootElement) {
- return new XmlSource<>(
- getFileOrPatternSpec(), getMinBundleSize(), rootElement, recordElement, recordClass);
- }
-
- /**
- * Sets name of the record element of the XML document. This will be used to determine offset of
- * the first record of a bundle created from the XML document. This is a required parameter.
- */
- public XmlSource<T> withRecordElement(String recordElement) {
- return new XmlSource<>(
- getFileOrPatternSpec(), getMinBundleSize(), rootElement, recordElement, recordClass);
- }
-
- /**
- * Sets a JAXB annotated class that can be populated using a record of the provided XML file. This
- * will be used when unmarshalling record objects from the XML file. This is a required
- * parameter.
- */
- public XmlSource<T> withRecordClass(Class<T> recordClass) {
- return new XmlSource<>(
- getFileOrPatternSpec(), getMinBundleSize(), rootElement, recordElement, recordClass);
- }
-
- /**
- * Sets a parameter {@code minBundleSize} for the minimum bundle size of the source. Please refer
- * to {@link OffsetBasedSource} for the definition of minBundleSize. This is an optional
- * parameter.
- */
- public XmlSource<T> withMinBundleSize(long minBundleSize) {
- return new XmlSource<>(
- getFileOrPatternSpec(), minBundleSize, rootElement, recordElement, recordClass);
- }
-
- private XmlSource(String fileOrPattern, long minBundleSize, String rootElement,
- String recordElement, Class<T> recordClass) {
- super(fileOrPattern, minBundleSize);
- this.rootElement = rootElement;
- this.recordElement = recordElement;
- this.recordClass = recordClass;
- }
-
- private XmlSource(String fileOrPattern, long minBundleSize, long startOffset, long endOffset,
- String rootElement, String recordElement, Class<T> recordClass) {
- super(fileOrPattern, minBundleSize, startOffset, endOffset);
- this.rootElement = rootElement;
- this.recordElement = recordElement;
- this.recordClass = recordClass;
- }
-
- @Override
- protected FileBasedSource<T> createForSubrangeOfFile(String fileName, long start, long end) {
- return new XmlSource<T>(
- fileName, getMinBundleSize(), start, end, rootElement, recordElement, recordClass);
- }
-
- @Override
- protected FileBasedReader<T> createSingleFileReader(PipelineOptions options) {
- return new XMLReader<T>(this);
- }
-
- @Override
- public boolean producesSortedKeys(PipelineOptions options) throws Exception {
- return false;
- }
-
- @Override
- public void validate() {
- super.validate();
- Preconditions.checkNotNull(
- rootElement, "rootElement is null. Use builder method withRootElement() to set this.");
- Preconditions.checkNotNull(
- recordElement,
- "recordElement is null. Use builder method withRecordElement() to set this.");
- Preconditions.checkNotNull(
- recordClass, "recordClass is null. Use builder method withRecordClass() to set this.");
- }
-
- @Override
- public Coder<T> getDefaultOutputCoder() {
- return JAXBCoder.of(recordClass);
- }
-
- public String getRootElement() {
- return rootElement;
- }
-
- public String getRecordElement() {
- return recordElement;
- }
-
- public Class<T> getRecordClass() {
- return recordClass;
- }
-
- /**
- * A {@link Source.Reader} for reading JAXB annotated Java objects from an XML file. The XML
- * file should be of the form defined at {@link XmlSource}.
- *
- * <p>Timestamped values are currently unsupported - all values implicitly have the timestamp
- * of {@code BoundedWindow.TIMESTAMP_MIN_VALUE}.
- *
- * @param <T> Type of objects that will be read by the reader.
- */
- private static class XMLReader<T> extends FileBasedReader<T> {
- // The amount of bytes read from the channel to memory when determining the starting offset of
- // the first record in a bundle. After matching to starting offset of the first record the
- // remaining bytes read to this buffer and the bytes still not read from the channel are used to
- // create the XML parser.
- private static final int BUF_SIZE = 1024;
-
- // This should be the maximum number of bytes a character will encode to, for any encoding
- // supported by XmlSource. Currently this is set to 4 since UTF-8 characters may be
- // four bytes.
- private static final int MAX_CHAR_BYTES = 4;
-
- // In order to support reading starting in the middle of an XML file, we construct an imaginary
- // well-formed document (a header and root tag followed by the contents of the input starting at
- // the record boundary) and feed it to the parser. Because of this, the offset reported by the
- // XML parser is not the same as offset in the original file. They differ by a constant amount:
- // offsetInOriginalFile = parser.getLocation().getCharacterOffset() + parserBaseOffset;
- // Note that this is true only for files with single-byte characters.
- // It appears that, as of writing, there does not exist a Java XML parser capable of correctly
- // reporting byte offsets of elements in the presence of multi-byte characters.
- private long parserBaseOffset = 0;
- private boolean readingStarted = false;
-
- // If true, the current bundle does not contain any records.
- private boolean emptyBundle = false;
-
- private Unmarshaller jaxbUnmarshaller = null;
- private XMLStreamReader parser = null;
-
- private T currentRecord = null;
-
- // Byte offset of the current record in the XML file provided when creating the source.
- private long currentByteOffset = 0;
-
- public XMLReader(XmlSource<T> source) {
- super(source);
-
- // Set up a JAXB Unmarshaller that can be used to unmarshall record objects.
- try {
- JAXBContext jaxbContext = JAXBContext.newInstance(getCurrentSource().recordClass);
- jaxbUnmarshaller = jaxbContext.createUnmarshaller();
-
- // Throw errors if validation fails. JAXB by default ignores validation errors.
- jaxbUnmarshaller.setEventHandler(new ValidationEventHandler() {
- @Override
- public boolean handleEvent(ValidationEvent event) {
- throw new RuntimeException(event.getMessage(), event.getLinkedException());
- }
- });
- } catch (JAXBException e) {
- throw new RuntimeException(e);
- }
- }
-
- @Override
- public synchronized XmlSource<T> getCurrentSource() {
- return (XmlSource<T>) super.getCurrentSource();
- }
-
- @Override
- protected void startReading(ReadableByteChannel channel) throws IOException {
- // This method determines the correct starting offset of the first record by reading bytes
- // from the ReadableByteChannel. This implementation does not need the channel to be a
- // SeekableByteChannel.
- // The method tries to determine the first record element in the byte channel. The first
- // record must start with the characters "<recordElement" where "recordElement" is the
- // record element of the XML document described above. For the match to be complete this
- // has to be followed by one of following.
- // * any whitespace character
- // * '>' character
- // * '/' character (to support empty records).
- //
- // After this match this method creates the XML parser for parsing the XML document,
- // feeding it a fake document consisting of an XML header and the <rootElement> tag followed
- // by the contents of channel starting from <recordElement. The <rootElement> tag may be never
- // closed.
-
- // This stores any bytes that should be used prior to the remaining bytes of the channel when
- // creating an XML parser object.
- ByteArrayOutputStream preambleByteBuffer = new ByteArrayOutputStream();
- // A dummy declaration and root for the document with proper XML version and encoding. Without
- // this XML parsing may fail or may produce incorrect results.
-
- byte[] dummyStartDocumentBytes =
- ("<?xml version=\"" + XML_VERSION + "\" encoding=\"UTF-8\" ?>"
- + "<" + getCurrentSource().rootElement + ">").getBytes(StandardCharsets.UTF_8);
- preambleByteBuffer.write(dummyStartDocumentBytes);
- // Gets the byte offset (in the input file) of the first record in ReadableByteChannel. This
- // method returns the offset and stores any bytes that should be used when creating the XML
- // parser in preambleByteBuffer.
- long offsetInFileOfRecordElement =
- getFirstOccurenceOfRecordElement(channel, preambleByteBuffer);
- if (offsetInFileOfRecordElement < 0) {
- // Bundle has no records. So marking this bundle as an empty bundle.
- emptyBundle = true;
- return;
- } else {
- byte[] preambleBytes = preambleByteBuffer.toByteArray();
- currentByteOffset = offsetInFileOfRecordElement;
- setUpXMLParser(channel, preambleBytes);
- parserBaseOffset = offsetInFileOfRecordElement - dummyStartDocumentBytes.length;
- }
- readingStarted = true;
- }
-
- // Gets the first occurrence of the next record within the given ReadableByteChannel. Puts
- // any bytes read past the starting offset of the next record back to the preambleByteBuffer.
- // If a record is found, returns the starting offset of the record, otherwise
- // returns -1.
- private long getFirstOccurenceOfRecordElement(
- ReadableByteChannel channel, ByteArrayOutputStream preambleByteBuffer) throws IOException {
- int byteIndexInRecordElementToMatch = 0;
- // Index of the byte in the string "<recordElement" to be matched
- // against the current byte from the stream.
- boolean recordStartBytesMatched = false; // "<recordElement" matched. Still have to match the
- // next character to confirm if this is a positive match.
- boolean fullyMatched = false; // If true, record element was fully matched.
-
- // This gives the offset of the byte currently being read. We do a '-1' here since we
- // increment this value at the beginning of the while loop below.
- long offsetInFileOfCurrentByte = getCurrentSource().getStartOffset() - 1;
- long startingOffsetInFileOfCurrentMatch = -1;
- // If this is non-negative, currently there is a match in progress and this value gives the
- // starting offset of the match currently being conducted.
- boolean matchStarted = false; // If true, a match is currently in progress.
-
- // These two values are used to determine the character immediately following a match for
- // "<recordElement". Please see the comment for 'MAX_CHAR_BYTES' above.
- byte[] charBytes = new byte[MAX_CHAR_BYTES];
- int charBytesFound = 0;
-
- ByteBuffer buf = ByteBuffer.allocate(BUF_SIZE);
- byte[] recordStartBytes =
- ("<" + getCurrentSource().recordElement).getBytes(StandardCharsets.UTF_8);
-
- outer: while (channel.read(buf) > 0) {
- buf.flip();
- while (buf.hasRemaining()) {
- offsetInFileOfCurrentByte++;
- byte b = buf.get();
- boolean reset = false;
- if (recordStartBytesMatched) {
- // We already matched "<recordElement" reading the next character to determine if this
- // is a positive match for a new record.
- charBytes[charBytesFound] = b;
- charBytesFound++;
- Character c = null;
- if (charBytesFound == charBytes.length) {
- CharBuffer charBuf = CharBuffer.allocate(1);
- InputStream charBufStream = new ByteArrayInputStream(charBytes);
- java.io.Reader reader =
- new InputStreamReader(charBufStream, StandardCharsets.UTF_8);
- int read = reader.read();
- if (read <= 0) {
- return -1;
- }
- charBuf.flip();
- c = (char) read;
- } else {
- continue;
- }
-
- // Record start may be of following forms
- // * "<recordElement<whitespace>..."
- // * "<recordElement>..."
- // * "<recordElement/..."
- if (Character.isWhitespace(c) || c == '>' || c == '/') {
- fullyMatched = true;
- // Add the recordStartBytes and charBytes to preambleByteBuffer since these were
- // already read from the channel.
- preambleByteBuffer.write(recordStartBytes);
- preambleByteBuffer.write(charBytes);
- // Also add the rest of the current buffer to preambleByteBuffer.
- while (buf.hasRemaining()) {
- preambleByteBuffer.write(buf.get());
- }
- break outer;
- } else {
- // Matching was unsuccessful. Reset the buffer to include bytes read for the char.
- ByteBuffer newbuf = ByteBuffer.allocate(BUF_SIZE);
- newbuf.put(charBytes);
- offsetInFileOfCurrentByte -= charBytes.length;
- while (buf.hasRemaining()) {
- newbuf.put(buf.get());
- }
- newbuf.flip();
- buf = newbuf;
-
- // Ignore everything and try again starting from the current buffer.
- reset = true;
- }
- } else if (b == recordStartBytes[byteIndexInRecordElementToMatch]) {
- // Next byte matched.
- if (!matchStarted) {
- // Match was for the first byte, record the starting offset.
- matchStarted = true;
- startingOffsetInFileOfCurrentMatch = offsetInFileOfCurrentByte;
- }
- byteIndexInRecordElementToMatch++;
- } else {
- // Not a match. Ignore everything and try again starting at current point.
- reset = true;
- }
- if (reset) {
- // Clear variables and try to match starting from the next byte.
- byteIndexInRecordElementToMatch = 0;
- startingOffsetInFileOfCurrentMatch = -1;
- matchStarted = false;
- recordStartBytesMatched = false;
- charBytes = new byte[MAX_CHAR_BYTES];
- charBytesFound = 0;
- }
- if (byteIndexInRecordElementToMatch == recordStartBytes.length) {
- // "<recordElement" matched. Need to still check next byte since this might be an
- // element that has "recordElement" as a prefix.
- recordStartBytesMatched = true;
- }
- }
- buf.clear();
- }
-
- if (!fullyMatched) {
- return -1;
- } else {
- return startingOffsetInFileOfCurrentMatch;
- }
- }
-
- private void setUpXMLParser(ReadableByteChannel channel, byte[] lookAhead) throws IOException {
- try {
- // We use Woodstox because the StAX implementation provided by OpenJDK reports
- // character locations incorrectly. Note that Woodstox still currently reports *byte*
- // locations incorrectly when parsing documents that contain multi-byte characters.
- XMLInputFactory2 xmlInputFactory = (XMLInputFactory2) XMLInputFactory.newInstance();
- this.parser = xmlInputFactory.createXMLStreamReader(
- new SequenceInputStream(
- new ByteArrayInputStream(lookAhead), Channels.newInputStream(channel)),
- "UTF-8");
-
- // Current offset should be the offset before reading the record element.
- while (true) {
- int event = parser.next();
- if (event == XMLStreamConstants.START_ELEMENT) {
- String localName = parser.getLocalName();
- if (localName.equals(getCurrentSource().recordElement)) {
- break;
- }
- }
- }
- } catch (FactoryConfigurationError | XMLStreamException e) {
- throw new IOException(e);
- }
- }
-
- @Override
- protected boolean readNextRecord() throws IOException {
- if (emptyBundle) {
- currentByteOffset = Long.MAX_VALUE;
- return false;
- }
- try {
- // Update current offset and check if the next value is the record element.
- currentByteOffset = parserBaseOffset + parser.getLocation().getCharacterOffset();
- while (parser.getEventType() != XMLStreamConstants.START_ELEMENT) {
- parser.next();
- currentByteOffset = parserBaseOffset + parser.getLocation().getCharacterOffset();
- if (parser.getEventType() == XMLStreamConstants.END_DOCUMENT) {
- currentByteOffset = Long.MAX_VALUE;
- return false;
- }
- }
- JAXBElement<T> jb = jaxbUnmarshaller.unmarshal(parser, getCurrentSource().recordClass);
- currentRecord = jb.getValue();
- return true;
- } catch (JAXBException | XMLStreamException e) {
- throw new IOException(e);
- }
- }
-
- @Override
- public T getCurrent() throws NoSuchElementException {
- if (!readingStarted) {
- throw new NoSuchElementException();
- }
- return currentRecord;
- }
-
- @Override
- protected boolean isAtSplitPoint() {
- // Every record is at a split point.
- return true;
- }
-
- @Override
- protected long getCurrentOffset() {
- return currentByteOffset;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/BigtableIO.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/BigtableIO.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/BigtableIO.java
deleted file mode 100644
index 7d59b09..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/BigtableIO.java
+++ /dev/null
@@ -1,987 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.io.bigtable;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkNotNull;
-import static com.google.common.base.Preconditions.checkState;
-
-import com.google.bigtable.v1.Mutation;
-import com.google.bigtable.v1.Row;
-import com.google.bigtable.v1.RowFilter;
-import com.google.bigtable.v1.SampleRowKeysResponse;
-import com.google.cloud.bigtable.config.BigtableOptions;
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.Proto2Coder;
-import com.google.cloud.dataflow.sdk.coders.VarLongCoder;
-import com.google.cloud.dataflow.sdk.io.BoundedSource;
-import com.google.cloud.dataflow.sdk.io.BoundedSource.BoundedReader;
-import com.google.cloud.dataflow.sdk.io.Sink.WriteOperation;
-import com.google.cloud.dataflow.sdk.io.Sink.Writer;
-import com.google.cloud.dataflow.sdk.io.range.ByteKey;
-import com.google.cloud.dataflow.sdk.io.range.ByteKeyRange;
-import com.google.cloud.dataflow.sdk.io.range.ByteKeyRangeTracker;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.util.DataflowReleaseInfo;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PBegin;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PDone;
-import com.google.common.base.MoreObjects;
-import com.google.common.collect.ImmutableList;
-import com.google.common.util.concurrent.FutureCallback;
-import com.google.common.util.concurrent.Futures;
-import com.google.protobuf.ByteString;
-import com.google.protobuf.Empty;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-import java.util.NoSuchElementException;
-import java.util.concurrent.ConcurrentLinkedQueue;
-
-import javax.annotation.Nullable;
-
-/**
- * A bounded source and sink for Google Cloud Bigtable.
- *
- * <p>For more information, see the online documentation at
- * <a href="https://cloud.google.com/bigtable/">Google Cloud Bigtable</a>.
- *
- * <h3>Reading from Cloud Bigtable</h3>
- *
- * <p>The Bigtable source returns a set of rows from a single table, returning a
- * {@code PCollection<Row>}.
- *
- * <p>To configure a Cloud Bigtable source, you must supply a table id and a {@link BigtableOptions}
- * or builder configured with the project and other information necessary to identify the
- * Bigtable cluster. A {@link RowFilter} may also optionally be specified using
- * {@link BigtableIO.Read#withRowFilter}. For example:
- *
- * <pre>{@code
- * BigtableOptions.Builder optionsBuilder =
- * new BigtableOptions.Builder()
- * .setProjectId("project")
- * .setClusterId("cluster")
- * .setZoneId("zone");
- *
- * Pipeline p = ...;
- *
- * // Scan the entire table.
- * p.apply("read",
- * BigtableIO.read()
- * .withBigtableOptions(optionsBuilder)
- * .withTableId("table"));
- *
- * // Scan a subset of rows that match the specified row filter.
- * p.apply("filtered read",
- * BigtableIO.read()
- * .withBigtableOptions(optionsBuilder)
- * .withTableId("table")
- * .withRowFilter(filter));
- * }</pre>
- *
- * <h3>Writing to Cloud Bigtable</h3>
- *
- * <p>The Bigtable sink executes a set of row mutations on a single table. It takes as input a
- * {@link PCollection PCollection<KV<ByteString, Iterable<Mutation>>>}, where the
- * {@link ByteString} is the key of the row being mutated, and each {@link Mutation} represents an
- * idempotent transformation to that row.
- *
- * <p>To configure a Cloud Bigtable sink, you must supply a table id and a {@link BigtableOptions}
- * or builder configured with the project and other information necessary to identify the
- * Bigtable cluster, for example:
- *
- * <pre>{@code
- * BigtableOptions.Builder optionsBuilder =
- * new BigtableOptions.Builder()
- * .setProjectId("project")
- * .setClusterId("cluster")
- * .setZoneId("zone");
- *
- * PCollection<KV<ByteString, Iterable<Mutation>>> data = ...;
- *
- * data.apply("write",
- * BigtableIO.write()
- * .withBigtableOptions(optionsBuilder)
- * .withTableId("table"));
- * }</pre>
- *
- * <h3>Experimental</h3>
- *
- * <p>This connector for Cloud Bigtable is considered experimental and may break or receive
- * backwards-incompatible changes in future versions of the Cloud Dataflow SDK. Cloud Bigtable is
- * in Beta, and thus it may introduce breaking changes in future revisions of its service or APIs.
- *
- * <h3>Permissions</h3>
- *
- * <p>Permission requirements depend on the {@link PipelineRunner} that is used to execute the
- * Dataflow job. Please refer to the documentation of corresponding
- * {@link PipelineRunner PipelineRunners} for more details.
- */
-@Experimental
-public class BigtableIO {
- private static final Logger logger = LoggerFactory.getLogger(BigtableIO.class);
-
- /**
- * Creates an uninitialized {@link BigtableIO.Read}. Before use, the {@code Read} must be
- * initialized with a
- * {@link BigtableIO.Read#withBigtableOptions(BigtableOptions) BigtableOptions} that specifies
- * the source Cloud Bigtable cluster, and a {@link BigtableIO.Read#withTableId tableId} that
- * specifies which table to read. A {@link RowFilter} may also optionally be specified using
- * {@link BigtableIO.Read#withRowFilter}.
- */
- @Experimental
- public static Read read() {
- return new Read(null, "", null, null);
- }
-
- /**
- * Creates an uninitialized {@link BigtableIO.Write}. Before use, the {@code Write} must be
- * initialized with a
- * {@link BigtableIO.Write#withBigtableOptions(BigtableOptions) BigtableOptions} that specifies
- * the destination Cloud Bigtable cluster, and a {@link BigtableIO.Write#withTableId tableId} that
- * specifies which table to write.
- */
- @Experimental
- public static Write write() {
- return new Write(null, "", null);
- }
-
- /**
- * A {@link PTransform} that reads from Google Cloud Bigtable. See the class-level Javadoc on
- * {@link BigtableIO} for more information.
- *
- * @see BigtableIO
- */
- @Experimental
- public static class Read extends PTransform<PBegin, PCollection<Row>> {
- /**
- * Returns a new {@link BigtableIO.Read} that will read from the Cloud Bigtable cluster
- * indicated by the given options, and using any other specified customizations.
- *
- * <p>Does not modify this object.
- */
- public Read withBigtableOptions(BigtableOptions options) {
- checkNotNull(options, "options");
- return withBigtableOptions(options.toBuilder());
- }
-
- /**
- * Returns a new {@link BigtableIO.Read} that will read from the Cloud Bigtable cluster
- * indicated by the given options, and using any other specified customizations.
- *
- * <p>Clones the given {@link BigtableOptions} builder so that any further changes
- * will have no effect on the returned {@link BigtableIO.Read}.
- *
- * <p>Does not modify this object.
- */
- public Read withBigtableOptions(BigtableOptions.Builder optionsBuilder) {
- checkNotNull(optionsBuilder, "optionsBuilder");
- // TODO: is there a better way to clone a Builder? Want it to be immune from user changes.
- BigtableOptions.Builder clonedBuilder = optionsBuilder.build().toBuilder();
- BigtableOptions optionsWithAgent = clonedBuilder.setUserAgent(getUserAgent()).build();
- return new Read(optionsWithAgent, tableId, filter, bigtableService);
- }
-
- /**
- * Returns a new {@link BigtableIO.Read} that will filter the rows read from Cloud Bigtable
- * using the given row filter.
- *
- * <p>Does not modify this object.
- */
- public Read withRowFilter(RowFilter filter) {
- checkNotNull(filter, "filter");
- return new Read(options, tableId, filter, bigtableService);
- }
-
- /**
- * Returns a new {@link BigtableIO.Read} that will read from the specified table.
- *
- * <p>Does not modify this object.
- */
- public Read withTableId(String tableId) {
- checkNotNull(tableId, "tableId");
- return new Read(options, tableId, filter, bigtableService);
- }
-
- /**
- * Returns the Google Cloud Bigtable cluster being read from, and other parameters.
- */
- public BigtableOptions getBigtableOptions() {
- return options;
- }
-
- /**
- * Returns the table being read from.
- */
- public String getTableId() {
- return tableId;
- }
-
- @Override
- public PCollection<Row> apply(PBegin input) {
- BigtableSource source =
- new BigtableSource(getBigtableService(), tableId, filter, ByteKeyRange.ALL_KEYS, null);
- return input.getPipeline().apply(com.google.cloud.dataflow.sdk.io.Read.from(source));
- }
-
- @Override
- public void validate(PBegin input) {
- checkArgument(options != null, "BigtableOptions not specified");
- checkArgument(!tableId.isEmpty(), "Table ID not specified");
- try {
- checkArgument(
- getBigtableService().tableExists(tableId), "Table %s does not exist", tableId);
- } catch (IOException e) {
- logger.warn("Error checking whether table {} exists; proceeding.", tableId, e);
- }
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(Read.class)
- .add("options", options)
- .add("tableId", tableId)
- .add("filter", filter)
- .toString();
- }
-
- /////////////////////////////////////////////////////////////////////////////////////////
- /**
- * Used to define the Cloud Bigtable cluster and any options for the networking layer.
- * Cannot actually be {@code null} at validation time, but may start out {@code null} while
- * source is being built.
- */
- @Nullable private final BigtableOptions options;
- private final String tableId;
- @Nullable private final RowFilter filter;
- @Nullable private final BigtableService bigtableService;
-
- private Read(
- @Nullable BigtableOptions options,
- String tableId,
- @Nullable RowFilter filter,
- @Nullable BigtableService bigtableService) {
- this.options = options;
- this.tableId = checkNotNull(tableId, "tableId");
- this.filter = filter;
- this.bigtableService = bigtableService;
- }
-
- /**
- * Returns a new {@link BigtableIO.Read} that will read using the given Cloud Bigtable
- * service implementation.
- *
- * <p>This is used for testing.
- *
- * <p>Does not modify this object.
- */
- Read withBigtableService(BigtableService bigtableService) {
- checkNotNull(bigtableService, "bigtableService");
- return new Read(options, tableId, filter, bigtableService);
- }
-
- /**
- * Helper function that either returns the mock Bigtable service supplied by
- * {@link #withBigtableService} or creates and returns an implementation that talks to
- * {@code Cloud Bigtable}.
- */
- private BigtableService getBigtableService() {
- if (bigtableService != null) {
- return bigtableService;
- }
- return new BigtableServiceImpl(options);
- }
- }
-
- /**
- * A {@link PTransform} that writes to Google Cloud Bigtable. See the class-level Javadoc on
- * {@link BigtableIO} for more information.
- *
- * @see BigtableIO
- */
- @Experimental
- public static class Write
- extends PTransform<PCollection<KV<ByteString, Iterable<Mutation>>>, PDone> {
- /**
- * Used to define the Cloud Bigtable cluster and any options for the networking layer.
- * Cannot actually be {@code null} at validation time, but may start out {@code null} while
- * source is being built.
- */
- @Nullable private final BigtableOptions options;
- private final String tableId;
- @Nullable private final BigtableService bigtableService;
-
- private Write(
- @Nullable BigtableOptions options,
- String tableId,
- @Nullable BigtableService bigtableService) {
- this.options = options;
- this.tableId = checkNotNull(tableId, "tableId");
- this.bigtableService = bigtableService;
- }
-
- /**
- * Returns a new {@link BigtableIO.Write} that will write to the Cloud Bigtable cluster
- * indicated by the given options, and using any other specified customizations.
- *
- * <p>Does not modify this object.
- */
- public Write withBigtableOptions(BigtableOptions options) {
- checkNotNull(options, "options");
- return withBigtableOptions(options.toBuilder());
- }
-
- /**
- * Returns a new {@link BigtableIO.Write} that will write to the Cloud Bigtable cluster
- * indicated by the given options, and using any other specified customizations.
- *
- * <p>Clones the given {@link BigtableOptions} builder so that any further changes
- * will have no effect on the returned {@link BigtableIO.Write}.
- *
- * <p>Does not modify this object.
- */
- public Write withBigtableOptions(BigtableOptions.Builder optionsBuilder) {
- checkNotNull(optionsBuilder, "optionsBuilder");
- // TODO: is there a better way to clone a Builder? Want it to be immune from user changes.
- BigtableOptions.Builder clonedBuilder = optionsBuilder.build().toBuilder();
- BigtableOptions optionsWithAgent = clonedBuilder.setUserAgent(getUserAgent()).build();
- return new Write(optionsWithAgent, tableId, bigtableService);
- }
-
- /**
- * Returns a new {@link BigtableIO.Write} that will write to the specified table.
- *
- * <p>Does not modify this object.
- */
- public Write withTableId(String tableId) {
- checkNotNull(tableId, "tableId");
- return new Write(options, tableId, bigtableService);
- }
-
- /**
- * Returns the Google Cloud Bigtable cluster being written to, and other parameters.
- */
- public BigtableOptions getBigtableOptions() {
- return options;
- }
-
- /**
- * Returns the table being written to.
- */
- public String getTableId() {
- return tableId;
- }
-
- @Override
- public PDone apply(PCollection<KV<ByteString, Iterable<Mutation>>> input) {
- Sink sink = new Sink(tableId, getBigtableService());
- return input.apply(com.google.cloud.dataflow.sdk.io.Write.to(sink));
- }
-
- @Override
- public void validate(PCollection<KV<ByteString, Iterable<Mutation>>> input) {
- checkArgument(options != null, "BigtableOptions not specified");
- checkArgument(!tableId.isEmpty(), "Table ID not specified");
- try {
- checkArgument(
- getBigtableService().tableExists(tableId), "Table %s does not exist", tableId);
- } catch (IOException e) {
- logger.warn("Error checking whether table {} exists; proceeding.", tableId, e);
- }
- }
-
- /**
- * Returns a new {@link BigtableIO.Write} that will write using the given Cloud Bigtable
- * service implementation.
- *
- * <p>This is used for testing.
- *
- * <p>Does not modify this object.
- */
- Write withBigtableService(BigtableService bigtableService) {
- checkNotNull(bigtableService, "bigtableService");
- return new Write(options, tableId, bigtableService);
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(Write.class)
- .add("options", options)
- .add("tableId", tableId)
- .toString();
- }
-
- /**
- * Helper function that either returns the mock Bigtable service supplied by
- * {@link #withBigtableService} or creates and returns an implementation that talks to
- * {@code Cloud Bigtable}.
- */
- private BigtableService getBigtableService() {
- if (bigtableService != null) {
- return bigtableService;
- }
- return new BigtableServiceImpl(options);
- }
- }
-
- //////////////////////////////////////////////////////////////////////////////////////////
- /** Disallow construction of utility class. */
- private BigtableIO() {}
-
- static class BigtableSource extends BoundedSource<Row> {
- public BigtableSource(
- BigtableService service,
- String tableId,
- @Nullable RowFilter filter,
- ByteKeyRange range,
- Long estimatedSizeBytes) {
- this.service = service;
- this.tableId = tableId;
- this.filter = filter;
- this.range = range;
- this.estimatedSizeBytes = estimatedSizeBytes;
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(BigtableSource.class)
- .add("tableId", tableId)
- .add("filter", filter)
- .add("range", range)
- .add("estimatedSizeBytes", estimatedSizeBytes)
- .toString();
- }
-
- ////// Private state and internal implementation details //////
- private final BigtableService service;
- @Nullable private final String tableId;
- @Nullable private final RowFilter filter;
- private final ByteKeyRange range;
- @Nullable private Long estimatedSizeBytes;
- @Nullable private transient List<SampleRowKeysResponse> sampleRowKeys;
-
- protected BigtableSource withStartKey(ByteKey startKey) {
- checkNotNull(startKey, "startKey");
- return new BigtableSource(
- service, tableId, filter, range.withStartKey(startKey), estimatedSizeBytes);
- }
-
- protected BigtableSource withEndKey(ByteKey endKey) {
- checkNotNull(endKey, "endKey");
- return new BigtableSource(
- service, tableId, filter, range.withEndKey(endKey), estimatedSizeBytes);
- }
-
- protected BigtableSource withEstimatedSizeBytes(Long estimatedSizeBytes) {
- checkNotNull(estimatedSizeBytes, "estimatedSizeBytes");
- return new BigtableSource(service, tableId, filter, range, estimatedSizeBytes);
- }
-
- /**
- * Makes an API call to the Cloud Bigtable service that gives information about tablet key
- * boundaries and estimated sizes. We can use these samples to ensure that splits are on
- * different tablets, and possibly generate sub-splits within tablets.
- */
- private List<SampleRowKeysResponse> getSampleRowKeys() throws IOException {
- return service.getSampleRowKeys(this);
- }
-
- @Override
- public List<BigtableSource> splitIntoBundles(
- long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
- // Update the desiredBundleSizeBytes in order to limit the
- // number of splits to maximumNumberOfSplits.
- long maximumNumberOfSplits = 4000;
- long sizeEstimate = getEstimatedSizeBytes(options);
- desiredBundleSizeBytes =
- Math.max(sizeEstimate / maximumNumberOfSplits, desiredBundleSizeBytes);
-
- // Delegate to testable helper.
- return splitIntoBundlesBasedOnSamples(desiredBundleSizeBytes, getSampleRowKeys());
- }
-
- /** Helper that splits this source into bundles based on Cloud Bigtable sampled row keys. */
- private List<BigtableSource> splitIntoBundlesBasedOnSamples(
- long desiredBundleSizeBytes, List<SampleRowKeysResponse> sampleRowKeys) {
- // There are no regions, or no samples available. Just scan the entire range.
- if (sampleRowKeys.isEmpty()) {
- logger.info("Not splitting source {} because no sample row keys are available.", this);
- return Collections.singletonList(this);
- }
-
- logger.info(
- "About to split into bundles of size {} with sampleRowKeys length {} first element {}",
- desiredBundleSizeBytes,
- sampleRowKeys.size(),
- sampleRowKeys.get(0));
-
- // Loop through all sampled responses and generate splits from the ones that overlap the
- // scan range. The main complication is that we must track the end range of the previous
- // sample to generate good ranges.
- ByteKey lastEndKey = ByteKey.EMPTY;
- long lastOffset = 0;
- ImmutableList.Builder<BigtableSource> splits = ImmutableList.builder();
- for (SampleRowKeysResponse response : sampleRowKeys) {
- ByteKey responseEndKey = ByteKey.of(response.getRowKey());
- long responseOffset = response.getOffsetBytes();
- checkState(
- responseOffset >= lastOffset,
- "Expected response byte offset %s to come after the last offset %s",
- responseOffset,
- lastOffset);
-
- if (!range.overlaps(ByteKeyRange.of(lastEndKey, responseEndKey))) {
- // This region does not overlap the scan, so skip it.
- lastOffset = responseOffset;
- lastEndKey = responseEndKey;
- continue;
- }
-
- // Calculate the beginning of the split as the larger of startKey and the end of the last
- // split. Unspecified start is smallest key so is correctly treated as earliest key.
- ByteKey splitStartKey = lastEndKey;
- if (splitStartKey.compareTo(range.getStartKey()) < 0) {
- splitStartKey = range.getStartKey();
- }
-
- // Calculate the end of the split as the smaller of endKey and the end of this sample. Note
- // that range.containsKey handles the case when range.getEndKey() is empty.
- ByteKey splitEndKey = responseEndKey;
- if (!range.containsKey(splitEndKey)) {
- splitEndKey = range.getEndKey();
- }
-
- // We know this region overlaps the desired key range, and we know a rough estimate of its
- // size. Split the key range into bundle-sized chunks and then add them all as splits.
- long sampleSizeBytes = responseOffset - lastOffset;
- List<BigtableSource> subSplits =
- splitKeyRangeIntoBundleSizedSubranges(
- sampleSizeBytes,
- desiredBundleSizeBytes,
- ByteKeyRange.of(splitStartKey, splitEndKey));
- splits.addAll(subSplits);
-
- // Move to the next region.
- lastEndKey = responseEndKey;
- lastOffset = responseOffset;
- }
-
- // We must add one more region after the end of the samples if both these conditions hold:
- // 1. we did not scan to the end yet (lastEndKey is concrete, not 0-length).
- // 2. we want to scan to the end (endKey is empty) or farther (lastEndKey < endKey).
- if (!lastEndKey.isEmpty()
- && (range.getEndKey().isEmpty() || lastEndKey.compareTo(range.getEndKey()) < 0)) {
- splits.add(this.withStartKey(lastEndKey).withEndKey(range.getEndKey()));
- }
-
- List<BigtableSource> ret = splits.build();
- logger.info("Generated {} splits. First split: {}", ret.size(), ret.get(0));
- return ret;
- }
-
- @Override
- public long getEstimatedSizeBytes(PipelineOptions options) throws IOException {
- // Delegate to testable helper.
- if (estimatedSizeBytes == null) {
- estimatedSizeBytes = getEstimatedSizeBytesBasedOnSamples(getSampleRowKeys());
- }
- return estimatedSizeBytes;
- }
-
- /**
- * Computes the estimated size in bytes based on the total size of all samples that overlap
- * the key range this source will scan.
- */
- private long getEstimatedSizeBytesBasedOnSamples(List<SampleRowKeysResponse> samples) {
- long estimatedSizeBytes = 0;
- long lastOffset = 0;
- ByteKey currentStartKey = ByteKey.EMPTY;
- // Compute the total estimated size as the size of each sample that overlaps the scan range.
- // TODO: In future, Bigtable service may provide finer grained APIs, e.g., to sample given a
- // filter or to sample on a given key range.
- for (SampleRowKeysResponse response : samples) {
- ByteKey currentEndKey = ByteKey.of(response.getRowKey());
- long currentOffset = response.getOffsetBytes();
- if (!currentStartKey.isEmpty() && currentStartKey.equals(currentEndKey)) {
- // Skip an empty region.
- lastOffset = currentOffset;
- continue;
- } else if (range.overlaps(ByteKeyRange.of(currentStartKey, currentEndKey))) {
- estimatedSizeBytes += currentOffset - lastOffset;
- }
- currentStartKey = currentEndKey;
- lastOffset = currentOffset;
- }
- return estimatedSizeBytes;
- }
-
- /**
- * Cloud Bigtable returns query results ordered by key.
- */
- @Override
- public boolean producesSortedKeys(PipelineOptions options) throws Exception {
- return true;
- }
-
- @Override
- public BoundedReader<Row> createReader(PipelineOptions options) throws IOException {
- return new BigtableReader(this, service);
- }
-
- @Override
- public void validate() {
- checkArgument(!tableId.isEmpty(), "tableId cannot be empty");
- }
-
- @Override
- public Coder<Row> getDefaultOutputCoder() {
- return Proto2Coder.of(Row.class);
- }
-
- /** Helper that splits the specified range in this source into bundles. */
- private List<BigtableSource> splitKeyRangeIntoBundleSizedSubranges(
- long sampleSizeBytes, long desiredBundleSizeBytes, ByteKeyRange range) {
- // Catch the trivial cases. Split is small enough already, or this is the last region.
- logger.debug(
- "Subsplit for sampleSizeBytes {} and desiredBundleSizeBytes {}",
- sampleSizeBytes,
- desiredBundleSizeBytes);
- if (sampleSizeBytes <= desiredBundleSizeBytes) {
- return Collections.singletonList(
- this.withStartKey(range.getStartKey()).withEndKey(range.getEndKey()));
- }
-
- checkArgument(
- sampleSizeBytes > 0, "Sample size %s bytes must be greater than 0.", sampleSizeBytes);
- checkArgument(
- desiredBundleSizeBytes > 0,
- "Desired bundle size %s bytes must be greater than 0.",
- desiredBundleSizeBytes);
-
- int splitCount = (int) Math.ceil(((double) sampleSizeBytes) / (desiredBundleSizeBytes));
- List<ByteKey> splitKeys = range.split(splitCount);
- ImmutableList.Builder<BigtableSource> splits = ImmutableList.builder();
- Iterator<ByteKey> keys = splitKeys.iterator();
- ByteKey prev = keys.next();
- while (keys.hasNext()) {
- ByteKey next = keys.next();
- splits.add(
- this
- .withStartKey(prev)
- .withEndKey(next)
- .withEstimatedSizeBytes(sampleSizeBytes / splitCount));
- prev = next;
- }
- return splits.build();
- }
-
- public ByteKeyRange getRange() {
- return range;
- }
-
- public RowFilter getRowFilter() {
- return filter;
- }
-
- public String getTableId() {
- return tableId;
- }
- }
-
- private static class BigtableReader extends BoundedReader<Row> {
- // Thread-safety: source is protected via synchronization and is only accessed or modified
- // inside a synchronized block (or constructor, which is the same).
- private BigtableSource source;
- private BigtableService service;
- private BigtableService.Reader reader;
- private final ByteKeyRangeTracker rangeTracker;
- private long recordsReturned;
-
- public BigtableReader(BigtableSource source, BigtableService service) {
- this.source = source;
- this.service = service;
- rangeTracker = ByteKeyRangeTracker.of(source.getRange());
- }
-
- @Override
- public boolean start() throws IOException {
- reader = service.createReader(getCurrentSource());
- boolean hasRecord =
- reader.start()
- && rangeTracker.tryReturnRecordAt(true, ByteKey.of(reader.getCurrentRow().getKey()));
- if (hasRecord) {
- ++recordsReturned;
- }
- return hasRecord;
- }
-
- @Override
- public synchronized BigtableSource getCurrentSource() {
- return source;
- }
-
- @Override
- public boolean advance() throws IOException {
- boolean hasRecord =
- reader.advance()
- && rangeTracker.tryReturnRecordAt(true, ByteKey.of(reader.getCurrentRow().getKey()));
- if (hasRecord) {
- ++recordsReturned;
- }
- return hasRecord;
- }
-
- @Override
- public Row getCurrent() throws NoSuchElementException {
- return reader.getCurrentRow();
- }
-
- @Override
- public void close() throws IOException {
- logger.info("Closing reader after reading {} records.", recordsReturned);
- if (reader != null) {
- reader.close();
- reader = null;
- }
- }
-
- @Override
- public final Double getFractionConsumed() {
- return rangeTracker.getFractionConsumed();
- }
-
- @Override
- public final synchronized BigtableSource splitAtFraction(double fraction) {
- ByteKey splitKey; // key at which the current range is proposed to be divided
- try {
- splitKey = source.getRange().interpolateKey(fraction);
- } catch (IllegalArgumentException e) {
- logger.info("{}: Failed to interpolate key for fraction {}.", source.getRange(), fraction);
- return null; // no split is performed when the fraction cannot be mapped to a key
- }
- logger.debug(
- "Proposing to split {} at fraction {} (key {})", rangeTracker, fraction, splitKey);
- if (!rangeTracker.trySplitAtPosition(splitKey)) {
- return null; // the range tracker rejected the proposed split position
- }
- BigtableSource primary = source.withEndKey(splitKey); // what this reader keeps reading
- BigtableSource residual = source.withStartKey(splitKey); // what is handed back to the caller
- this.source = primary;
- return residual;
- }
- }
-
- private static class Sink
- extends com.google.cloud.dataflow.sdk.io.Sink<KV<ByteString, Iterable<Mutation>>> {
-
- public Sink(String tableId, BigtableService bigtableService) {
- this.tableId = checkNotNull(tableId, "tableId");
- this.bigtableService = checkNotNull(bigtableService, "bigtableService");
- }
-
- public String getTableId() {
- return tableId;
- }
-
- public BigtableService getBigtableService() {
- return bigtableService;
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(Sink.class)
- .add("bigtableService", bigtableService)
- .add("tableId", tableId)
- .toString();
- }
-
- ///////////////////////////////////////////////////////////////////////////////
- private final String tableId;
- private final BigtableService bigtableService;
-
- @Override
- public WriteOperation<KV<ByteString, Iterable<Mutation>>, Long> createWriteOperation(
- PipelineOptions options) {
- return new BigtableWriteOperation(this);
- }
-
- /** Does nothing, as it is redundant with {@link Write#validate}. */
- @Override
- public void validate(PipelineOptions options) {}
- }
-
- private static class BigtableWriteOperation
- extends WriteOperation<KV<ByteString, Iterable<Mutation>>, Long> {
- private final Sink sink;
-
- public BigtableWriteOperation(Sink sink) {
- this.sink = sink;
- }
-
- @Override
- public Writer<KV<ByteString, Iterable<Mutation>>, Long> createWriter(PipelineOptions options)
- throws Exception {
- return new BigtableWriter(this);
- }
-
- @Override
- public void initialize(PipelineOptions options) {}
-
- @Override
- public void finalize(Iterable<Long> writerResults, PipelineOptions options) {
- long count = 0; // total number of records reported by all writers
- for (Long value : writerResults) {
- count += value; // accumulate into the total (previously the loop variable was updated)
- }
- logger.debug("Wrote {} elements to BigtableIO.Sink {}", count, sink);
- }
-
- @Override
- public Sink getSink() {
- return sink;
- }
-
- @Override
- public Coder<Long> getWriterResultCoder() {
- return VarLongCoder.of();
- }
- }
-
- private static class BigtableWriter extends Writer<KV<ByteString, Iterable<Mutation>>, Long> {
- private final BigtableWriteOperation writeOperation;
- private final Sink sink;
- private BigtableService.Writer bigtableWriter;
- private long recordsWritten;
- private final ConcurrentLinkedQueue<BigtableWriteException> failures;
-
- public BigtableWriter(BigtableWriteOperation writeOperation) {
- this.writeOperation = writeOperation;
- this.sink = writeOperation.getSink();
- this.failures = new ConcurrentLinkedQueue<>();
- }
-
- @Override
- public void open(String uId) throws Exception {
- bigtableWriter = sink.getBigtableService().openForWriting(sink.getTableId());
- recordsWritten = 0;
- }
-
- /**
- * If any write has asynchronously failed, fail the bundle with a useful error.
- */
- private void checkForFailures() throws IOException {
- // Note that this function is never called by multiple threads and is the only place that
- // we remove from failures, so this code is safe.
- if (failures.isEmpty()) {
- return;
- }
-
- StringBuilder logEntry = new StringBuilder();
- int i = 0;
- for (; i < 10 && !failures.isEmpty(); ++i) {
- BigtableWriteException exc = failures.remove();
- logEntry.append("\n").append(exc.getMessage());
- if (exc.getCause() != null) {
- logEntry.append(": ").append(exc.getCause().getMessage());
- }
- }
- String message =
- String.format(
- "At least %d errors occurred writing to Bigtable. First %d errors: %s",
- i + failures.size(),
- i,
- logEntry.toString());
- logger.error(message);
- throw new IOException(message);
- }
-
- @Override
- public void write(KV<ByteString, Iterable<Mutation>> rowMutations) throws Exception {
- checkForFailures();
- Futures.addCallback(
- bigtableWriter.writeRecord(rowMutations), new WriteExceptionCallback(rowMutations));
- ++recordsWritten;
- }
-
- @Override
- public Long close() throws Exception {
- bigtableWriter.close();
- bigtableWriter = null;
- checkForFailures();
- logger.info("Wrote {} records", recordsWritten);
- return recordsWritten;
- }
-
- @Override
- public WriteOperation<KV<ByteString, Iterable<Mutation>>, Long> getWriteOperation() {
- return writeOperation;
- }
-
- private class WriteExceptionCallback implements FutureCallback<Empty> {
- private final KV<ByteString, Iterable<Mutation>> value;
-
- public WriteExceptionCallback(KV<ByteString, Iterable<Mutation>> value) {
- this.value = value;
- }
-
- @Override
- public void onFailure(Throwable cause) {
- failures.add(new BigtableWriteException(value, cause));
- }
-
- @Override
- public void onSuccess(Empty produced) {}
- }
- }
-
- /**
- * An exception that puts information about the failed record being written in its message.
- */
- static class BigtableWriteException extends IOException {
- public BigtableWriteException(KV<ByteString, Iterable<Mutation>> record, Throwable cause) {
- super(
- String.format(
- "Error mutating row %s with mutations %s",
- record.getKey().toStringUtf8(),
- record.getValue()),
- cause);
- }
- }
-
- /**
- * A helper function to produce a Cloud Bigtable user agent string.
- */
- private static String getUserAgent() {
- String javaVersion = System.getProperty("java.specification.version");
- DataflowReleaseInfo info = DataflowReleaseInfo.getReleaseInfo();
- return String.format(
- "%s/%s (%s); %s",
- info.getName(),
- info.getVersion(),
- javaVersion,
- "0.2.3" /* TODO get Bigtable client version directly from jar. */);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/BigtableService.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/BigtableService.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/BigtableService.java
deleted file mode 100644
index 85d706c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/BigtableService.java
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.io.bigtable;
-
-import com.google.bigtable.v1.Mutation;
-import com.google.bigtable.v1.Row;
-import com.google.bigtable.v1.SampleRowKeysResponse;
-import com.google.cloud.dataflow.sdk.io.bigtable.BigtableIO.BigtableSource;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.common.util.concurrent.ListenableFuture;
-import com.google.protobuf.ByteString;
-import com.google.protobuf.Empty;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.List;
-import java.util.NoSuchElementException;
-
-/**
- * An interface for real or fake implementations of Cloud Bigtable.
- */
-interface BigtableService extends Serializable {
-
- /**
- * The interface of a class that can write to Cloud Bigtable.
- */
- interface Writer {
- /**
- * Writes a single row transaction to Cloud Bigtable. The key of the {@code record} is the
- * row key to be mutated and the iterable of mutations represent the changes to be made to the
- * row.
- *
- * @throws IOException if there is an error submitting the write.
- */
- ListenableFuture<Empty> writeRecord(KV<ByteString, Iterable<Mutation>> record)
- throws IOException;
-
- /**
- * Closes the writer.
- *
- * @throws IOException if any writes did not succeed
- */
- void close() throws IOException;
- }
-
- /**
- * The interface of a class that reads from Cloud Bigtable.
- */
- interface Reader {
- /**
- * Reads the first element (including initialization, such as opening a network connection) and
- * returns true if an element was found.
- */
- boolean start() throws IOException;
-
- /**
- * Attempts to read the next element, and returns true if an element has been read.
- */
- boolean advance() throws IOException;
-
- /**
- * Closes the reader.
- *
- * @throws IOException if there is an error.
- */
- void close() throws IOException;
-
- /**
- * Returns the last row read by a successful start() or advance(), or throws if there is no
- * current row because the last such call was unsuccessful.
- */
- Row getCurrentRow() throws NoSuchElementException;
- }
-
- /**
- * Returns {@code true} if the table with the given name exists.
- */
- boolean tableExists(String tableId) throws IOException;
-
- /**
- * Returns a {@link Reader} that will read from the specified source.
- */
- Reader createReader(BigtableSource source) throws IOException;
-
- /**
- * Returns a {@link Writer} that will write to the specified table.
- */
- Writer openForWriting(String tableId) throws IOException;
-
- /**
- * Returns a set of row keys sampled from the underlying table. These contain information about
- * the distribution of keys within the table.
- */
- List<SampleRowKeysResponse> getSampleRowKeys(BigtableSource source) throws IOException;
-}
[57/67] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/JoinExamples.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/JoinExamples.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/JoinExamples.java
new file mode 100644
index 0000000..745c5d6
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/JoinExamples.java
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.cookbook;
+
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.options.Validation;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.join.CoGbkResult;
+import com.google.cloud.dataflow.sdk.transforms.join.CoGroupByKey;
+import com.google.cloud.dataflow.sdk.transforms.join.KeyedPCollectionTuple;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.TupleTag;
+
+/**
+ * This example shows how to do a join on two collections.
+ * It uses a sample of the GDELT 'world event' data (http://goo.gl/OB6oin), joining the event
+ * 'action' country code against a table that maps country codes to country names.
+ *
+ * <p>Concepts: Join operation; multiple input sources.
+ *
+ * <p>To execute this pipeline locally, specify general pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * }
+ * </pre>
+ * and a local output file or output prefix on GCS:
+ * <pre>{@code
+ * --output=[YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PREFIX]
+ * }</pre>
+ *
+ * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=BlockingDataflowPipelineRunner
+ * }
+ * </pre>
+ * and an output prefix on GCS:
+ * <pre>{@code
+ * --output=gs://YOUR_OUTPUT_PREFIX
+ * }</pre>
+ */
+public class JoinExamples {
+
+ // A 1000-row sample of the GDELT data here: gdelt-bq:full.events.
+ private static final String GDELT_EVENTS_TABLE =
+ "clouddataflow-readonly:samples.gdelt_sample";
+ // A table that maps country codes to country names.
+ private static final String COUNTRY_CODES =
+ "gdelt-bq:full.crosswalk_geocountrycodetohuman";
+
+ /**
+ * Join two collections, using country code as the key.
+ */
+ static PCollection<String> joinEvents(PCollection<TableRow> eventsTable,
+ PCollection<TableRow> countryCodes) throws Exception {
+
+ final TupleTag<String> eventInfoTag = new TupleTag<String>();
+ final TupleTag<String> countryInfoTag = new TupleTag<String>();
+
+ // transform both input collections to tuple collections, where the keys are country
+ // codes in both cases.
+ PCollection<KV<String, String>> eventInfo = eventsTable.apply(
+ ParDo.of(new ExtractEventDataFn()));
+ PCollection<KV<String, String>> countryInfo = countryCodes.apply(
+ ParDo.of(new ExtractCountryInfoFn()));
+
+ // country code 'key' -> CGBKR (<event info>, <country name>)
+ PCollection<KV<String, CoGbkResult>> kvpCollection = KeyedPCollectionTuple
+ .of(eventInfoTag, eventInfo)
+ .and(countryInfoTag, countryInfo)
+ .apply(CoGroupByKey.<String>create());
+
+ // Process the CoGbkResult elements generated by the CoGroupByKey transform.
+ // country code 'key' -> string of <event info>, <country name>
+ PCollection<KV<String, String>> finalResultCollection =
+ kvpCollection.apply(ParDo.named("Process").of(
+ new DoFn<KV<String, CoGbkResult>, KV<String, String>>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ KV<String, CoGbkResult> e = c.element();
+ String countryCode = e.getKey();
+ // Fall back to "none" when no country name was joined for this country code.
+ String countryName = e.getValue().getOnly(countryInfoTag, "none");
+ for (String eventInfo : e.getValue().getAll(eventInfoTag)) {
+ // Generate a string that combines information from both collection values
+ c.output(KV.of(countryCode, "Country name: " + countryName
+ + ", Event info: " + eventInfo));
+ }
+ }
+ }));
+
+ // write to GCS
+ PCollection<String> formattedResults = finalResultCollection
+ .apply(ParDo.named("Format").of(new DoFn<KV<String, String>, String>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ String outputstring = "Country code: " + c.element().getKey()
+ + ", " + c.element().getValue();
+ c.output(outputstring);
+ }
+ }));
+ return formattedResults;
+ }
+
+ /**
+ * Examines each row (event) in the input table. Output a KV with the key the country
+ * code of the event, and the value a string encoding event information.
+ */
+ static class ExtractEventDataFn extends DoFn<TableRow, KV<String, String>> {
+ @Override
+ public void processElement(ProcessContext c) {
+ TableRow row = c.element();
+ String countryCode = (String) row.get("ActionGeo_CountryCode");
+ String sqlDate = (String) row.get("SQLDATE");
+ String actor1Name = (String) row.get("Actor1Name");
+ String sourceUrl = (String) row.get("SOURCEURL");
+ String eventInfo = "Date: " + sqlDate + ", Actor1: " + actor1Name + ", url: " + sourceUrl;
+ c.output(KV.of(countryCode, eventInfo));
+ }
+ }
+
+
+ /**
+ * Examines each row (country info) in the input table. Output a KV with the key the country
+ * code, and the value the country name.
+ */
+ static class ExtractCountryInfoFn extends DoFn<TableRow, KV<String, String>> {
+ @Override
+ public void processElement(ProcessContext c) {
+ TableRow row = c.element();
+ String countryCode = (String) row.get("FIPSCC");
+ String countryName = (String) row.get("HumanName");
+ c.output(KV.of(countryCode, countryName));
+ }
+ }
+
+
+ /**
+ * Options supported by {@link JoinExamples}.
+ *
+ * <p>Inherits standard configuration options.
+ */
+ private static interface Options extends PipelineOptions {
+ @Description("Path of the file to write to")
+ @Validation.Required
+ String getOutput();
+ void setOutput(String value);
+ }
+
+ public static void main(String[] args) throws Exception {
+ Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
+ Pipeline p = Pipeline.create(options);
+ // the following two 'applys' create multiple inputs to our pipeline, one for each
+ // of our two input sources.
+ PCollection<TableRow> eventsTable = p.apply(BigQueryIO.Read.from(GDELT_EVENTS_TABLE));
+ PCollection<TableRow> countryCodes = p.apply(BigQueryIO.Read.from(COUNTRY_CODES));
+ PCollection<String> formattedResults = joinEvents(eventsTable, countryCodes);
+ formattedResults.apply(TextIO.Write.to(options.getOutput()));
+ p.run();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamples.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamples.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamples.java
new file mode 100644
index 0000000..1c26d0f
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamples.java
@@ -0,0 +1,173 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.cookbook;
+
+import com.google.api.services.bigquery.model.TableFieldSchema;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.TableSchema;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.options.Validation;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.Max;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * An example that reads the public samples of weather data from BigQuery, and finds
+ * the maximum temperature ('mean_temp') for each month.
+ *
+ * <p>Concepts: The 'Max' statistical combination function, and how to find the max per
+ * key group.
+ *
+ * <p>Note: Before running this example, you must create a BigQuery dataset to contain your output
+ * table.
+ *
+ * <p>To execute this pipeline locally, specify general pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * }
+ * </pre>
+ * and the BigQuery table for the output, with the form
+ * <pre>{@code
+ * --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
+ * }</pre>
+ *
+ * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
+ * <pre>{@code
+ * --project=YOUR_PROJECT_ID
+ * --stagingLocation=gs://YOUR_STAGING_DIRECTORY
+ * --runner=BlockingDataflowPipelineRunner
+ * }
+ * </pre>
+ * and the BigQuery table for the output:
+ * <pre>{@code
+ * --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
+ * }</pre>
+ *
+ * <p>The BigQuery input table defaults to {@code clouddataflow-readonly:samples.weather_stations}
+ * and can be overridden with {@code --input}.
+ */
+public class MaxPerKeyExamples {
+ // Default to using a 1000 row subset of the public weather station table publicdata:samples.gsod.
+ private static final String WEATHER_SAMPLES_TABLE =
+ "clouddataflow-readonly:samples.weather_stations";
+
+ /**
+ * Examines each row (weather reading) in the input table. Outputs the month of the reading
+ * as the key and the mean_temp as the value.
+ *
+ * <p>Both fields are parsed from their string form, so each may be supplied either as a
+ * {@code String} or as a numeric object.
+ */
+ static class ExtractTempFn extends DoFn<TableRow, KV<Integer, Double>> {
+ @Override
+ public void processElement(ProcessContext c) {
+ TableRow row = c.element();
+ // String.valueOf (rather than a String cast) also accepts a numeric "month" value instead
+ // of failing with a ClassCastException; a missing field still yields a
+ // NumberFormatException, as before.
+ Integer month = Integer.parseInt(String.valueOf(row.get("month")));
+ Double meanTemp = Double.parseDouble(row.get("mean_temp").toString());
+ c.output(KV.of(month, meanTemp));
+ }
+ }
+
+ /**
+ * Formats each (month, max mean_temp) pair as a TableRow with the columns {@code month} and
+ * {@code max_mean_temp}, to save to BigQuery.
+ */
+ static class FormatMaxesFn extends DoFn<KV<Integer, Double>, TableRow> {
+ @Override
+ public void processElement(ProcessContext c) {
+ TableRow row = new TableRow()
+ .set("month", c.element().getKey())
+ .set("max_mean_temp", c.element().getValue());
+ c.output(row);
+ }
+ }
+
+ /**
+ * Reads rows from a weather data table, and finds the max mean_temp for each
+ * month via the 'Max' statistical combination function.
+ */
+ static class MaxMeanTemp
+ extends PTransform<PCollection<TableRow>, PCollection<TableRow>> {
+ @Override
+ public PCollection<TableRow> apply(PCollection<TableRow> rows) {
+
+ // row... => <month, mean_temp> ...
+ PCollection<KV<Integer, Double>> temps = rows.apply(
+ ParDo.of(new ExtractTempFn()));
+
+ // <month, mean_temp>... => <month, max mean temp>...
+ PCollection<KV<Integer, Double>> tempMaxes =
+ temps.apply(Max.<Integer>doublesPerKey());
+
+ // <month, max>... => row...
+ PCollection<TableRow> results = tempMaxes.apply(
+ ParDo.of(new FormatMaxesFn()));
+
+ return results;
+ }
+ }
+
+ /**
+ * Options supported by {@link MaxPerKeyExamples}.
+ *
+ * <p>Inherits standard configuration options.
+ */
+ private static interface Options extends PipelineOptions {
+ @Description("Table to read from, specified as "
+ + "<project_id>:<dataset_id>.<table_id>")
+ @Default.String(WEATHER_SAMPLES_TABLE)
+ String getInput();
+ void setInput(String value);
+
+ @Description("Table to write to, specified as "
+ + "<project_id>:<dataset_id>.<table_id>")
+ @Validation.Required
+ String getOutput();
+ void setOutput(String value);
+ }
+
+ /**
+ * Builds and runs the pipeline: reads the input table, computes the max mean_temp per month,
+ * and writes the result to the output table, truncating any existing contents.
+ */
+ public static void main(String[] args) throws Exception {
+
+ Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
+ Pipeline p = Pipeline.create(options);
+
+ // Build the table schema for the output table.
+ List<TableFieldSchema> fields = new ArrayList<>();
+ fields.add(new TableFieldSchema().setName("month").setType("INTEGER"));
+ fields.add(new TableFieldSchema().setName("max_mean_temp").setType("FLOAT"));
+ TableSchema schema = new TableSchema().setFields(fields);
+
+ p.apply(BigQueryIO.Read.from(options.getInput()))
+ .apply(new MaxMeanTemp())
+ .apply(BigQueryIO.Write
+ .to(options.getOutput())
+ .withSchema(schema)
+ .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
+ .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
+
+ p.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/README.md
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/README.md b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/README.md
new file mode 100644
index 0000000..99f3080
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/README.md
@@ -0,0 +1,55 @@
+
+# "Cookbook" Examples
+
+This directory holds simple "cookbook" examples, which show how to define
+commonly-used data analysis patterns that you would likely incorporate into a
+larger Dataflow pipeline. They include:
+
+ <ul>
+ <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoes.java">BigQueryTornadoes</a>
+ — An example that reads the public samples of weather data from Google
+ BigQuery, counts the number of tornadoes that occur in each month, and
+ writes the results to BigQuery. Demonstrates reading/writing BigQuery,
+ counting a <code>PCollection</code>, and user-defined <code>PTransforms</code>.</li>
+ <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamples.java">CombinePerKeyExamples</a>
+ — An example that reads the public "Shakespeare" data, and for
+ each word in the dataset that exceeds a given length, generates a string
+ containing the list of play names in which that word appears.
+ Demonstrates the <code>Combine.perKey</code>
+ transform, which lets you combine the values in a key-grouped
+ <code>PCollection</code>.
+ </li>
+ <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/DatastoreWordCount.java">DatastoreWordCount</a>
+ — An example that shows you how to read from Google Cloud Datastore.</li>
+ <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/DeDupExample.java">DeDupExample</a>
+ — An example that uses Shakespeare's plays as plain text files, and
+ removes duplicate lines across all the files. Demonstrates the
+ <code>RemoveDuplicates</code>, <code>TextIO.Read</code>,
+ and <code>TextIO.Write</code> transforms, and how to wire transforms together.
+ </li>
+ <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/FilterExamples.java">FilterExamples</a>
+ — An example that shows different approaches to filtering, including
+ selection and projection. It also shows how to dynamically set parameters
+ by defining and using new pipeline options, and how to use a value derived
+ by a pipeline. Demonstrates the <code>Mean</code> transform,
+ <code>Options</code> configuration, and using pipeline-derived data as a side
+ input.
+ </li>
+ <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/JoinExamples.java">JoinExamples</a>
+ — An example that shows how to join two collections. It uses a
+ sample of the <a href="http://goo.gl/OB6oin">GDELT "world event"
+ data</a>, joining the event <code>action</code> country code against a table
+ that maps country codes to country names. Demonstrates the <code>Join</code>
+ operation, and using multiple input sources.
+ </li>
+ <li><a href="https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/src/main/java/com/google/cloud/dataflow/examples/cookbook/MaxPerKeyExamples.java">MaxPerKeyExamples</a>
+ — An example that reads the public samples of weather data from BigQuery,
+ and finds the maximum temperature (<code>mean_temp</code>) for each month.
+ Demonstrates the <code>Max</code> statistical combination transform, and how to
+ find the max-per-key group.
+ </li>
+ </ul>
+
+See the [documentation](https://cloud.google.com/dataflow/getting-started) and the [Examples
+README](../../../../../../../../../README.md) for
+information about how to run these examples.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/TriggerExample.java
----------------------------------------------------------------------
diff --git a/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/TriggerExample.java b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/TriggerExample.java
new file mode 100644
index 0000000..ce5e08e
--- /dev/null
+++ b/examples/java/src/main/java/com/google/cloud/dataflow/examples/cookbook/TriggerExample.java
@@ -0,0 +1,564 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.cookbook;
+
+import com.google.api.services.bigquery.model.TableFieldSchema;
+import com.google.api.services.bigquery.model.TableReference;
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.TableSchema;
+import com.google.cloud.dataflow.examples.common.DataflowExampleOptions;
+import com.google.cloud.dataflow.examples.common.DataflowExampleUtils;
+import com.google.cloud.dataflow.examples.common.ExampleBigQueryTableOptions;
+import com.google.cloud.dataflow.examples.common.ExamplePubsubTopicOptions;
+import com.google.cloud.dataflow.examples.common.PubsubFileInjector;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.PipelineResult;
+import com.google.cloud.dataflow.sdk.io.BigQueryIO;
+import com.google.cloud.dataflow.sdk.io.PubsubIO;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.DoFn.RequiresWindowAccess;
+import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
+import com.google.cloud.dataflow.sdk.transforms.IntraBundleParallelization;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.windowing.AfterEach;
+import com.google.cloud.dataflow.sdk.transforms.windowing.AfterProcessingTime;
+import com.google.cloud.dataflow.sdk.transforms.windowing.AfterWatermark;
+import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
+import com.google.cloud.dataflow.sdk.transforms.windowing.Repeatedly;
+import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.PCollectionList;
+
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * This example illustrates the basic concepts behind triggering. It shows how to use different
+ * trigger definitions to produce partial (speculative) results before all the data is processed and
+ * to control when updated results are produced for late data. The example performs a streaming
+ * analysis of the data coming in from PubSub and writes the results to BigQuery. It divides the
+ * data into {@link Window windows} to be processed, and demonstrates using various kinds of {@link
+ * Trigger triggers} to control when the results for each window are emitted.
+ *
+ * <p> This example uses a portion of real traffic data from San Diego freeways. It contains
+ * readings from sensor stations set up along each freeway. Each sensor reading includes a
+ * calculation of the 'total flow' across all lanes in that freeway direction.
+ *
+ * <p> Concepts:
+ * <pre>
+ * 1. The default triggering behavior
+ * 2. Late data with the default trigger
+ * 3. How to get speculative estimates
+ * 4. Combining late data and speculative estimates
+ * </pre>
+ *
+ * <p> Before running this example, it will be useful to familiarize yourself with Dataflow triggers
+ * and understand the concept of 'late data',
+ * See: <a href="https://cloud.google.com/dataflow/model/triggers">
+ * https://cloud.google.com/dataflow/model/triggers </a> and
+ * <a href="https://cloud.google.com/dataflow/model/windowing#Advanced">
+ * https://cloud.google.com/dataflow/model/windowing#Advanced </a>
+ *
+ * <p> The example pipeline reads data from a Pub/Sub topic. By default, running the example will
+ * also run an auxiliary pipeline to inject data from the default {@code --input} file to the
+ * {@code --pubsubTopic}. The auxiliary pipeline puts a timestamp on the injected data so that the
+ * example pipeline can operate on <i>event time</i> (rather than arrival time). The auxiliary
+ * pipeline also randomly simulates late data, by setting the timestamps of some of the data
+ * elements to be in the past. You may override the default {@code --input} with the file of your
+ * choosing or set {@code --input=""} which will disable the automatic Pub/Sub injection, and allow
+ * you to use a separate tool to publish to the given topic.
+ *
+ * <p> The example is configured to use the default Pub/Sub topic and the default BigQuery table
+ * from the example common package (there are no defaults for a general Dataflow pipeline).
+ * You can override them by using the {@code --pubsubTopic}, {@code --bigQueryDataset}, and
+ * {@code --bigQueryTable} options. If the Pub/Sub topic or the BigQuery table do not exist,
+ * the example will try to create them.
+ *
+ * <p> The pipeline outputs its results to a BigQuery table.
+ * Here are some queries you can use to see interesting results:
+ * Replace {@code <enter_table_name>} in the query below with the name of the BigQuery table.
+ * Replace {@code <enter_window_interval>} in the query below with the window interval.
+ *
+ * <p> To see the results of the default trigger,
+ * Note: When you start up your pipeline, you'll initially see results from 'late' data. Wait after
+ * the window duration, until the first pane of non-late data has been emitted, to see more
+ * interesting results.
+ * {@code SELECT * FROM <enter_table_name> WHERE trigger_type = "default" ORDER BY window DESC}
+ *
+ * <p> To see the late data, i.e. the data dropped by the default trigger,
+ * {@code SELECT * FROM <enter_table_name> WHERE trigger_type = "withAllowedLateness" and
+ * (timing = "LATE" or timing = "ON_TIME") and freeway = "5" ORDER BY window DESC, processing_time}
+ *
+ * <p> To see the difference between accumulation mode and discarding mode,
+ * {@code SELECT * FROM <enter_table_name> WHERE (timing = "LATE" or timing = "ON_TIME") AND
+ * (trigger_type = "withAllowedLateness" or trigger_type = "sequential") and freeway = "5" ORDER BY
+ * window DESC, processing_time}
+ *
+ * <p> To see speculative results every minute,
+ * {@code SELECT * FROM <enter_table_name> WHERE trigger_type = "speculative" and freeway = "5"
+ * ORDER BY window DESC, processing_time}
+ *
+ * <p> To see speculative results every five minutes after the end of the window
+ * {@code SELECT * FROM <enter_table_name> WHERE trigger_type = "sequential" and timing != "EARLY"
+ * and freeway = "5" ORDER BY window DESC, processing_time}
+ *
+ * <p> To see the first and the last pane for a freeway in a window for all the trigger types,
+ * {@code SELECT * FROM <enter_table_name> WHERE (isFirst = true or isLast = true) ORDER BY window}
+ *
+ * <p> To reduce the number of results for each query we can add additional where clauses.
+ * For example, to see the results of the default trigger,
+ * {@code SELECT * FROM <enter_table_name> WHERE trigger_type = "default" AND freeway = "5" AND
+ * window = "<enter_window_interval>"}
+ *
+ * <p> The example will try to cancel the pipelines on the signal to terminate the process (CTRL-C)
+ * and then exits.
+ */
+
+public class TriggerExample {
+ // Default duration of the fixed windows, in minutes (default of the windowDuration option).
+ public static final int WINDOW_DURATION = 30;
+ // Constants used in triggers.
+ // Speeding up ONE_MINUTE or FIVE_MINUTES helps you get an early approximation of results.
+ // ONE_MINUTE is the processing-time delay between speculative firings.
+ public static final Duration ONE_MINUTE = Duration.standardMinutes(1);
+ // FIVE_MINUTES is the processing-time delay between firings after the end of the window.
+ public static final Duration FIVE_MINUTES = Duration.standardMinutes(5);
+ // ONE_DAY is used to specify the amount of lateness allowed for the data elements.
+ public static final Duration ONE_DAY = Duration.standardDays(1);
+
+ /**
+ * This transform demonstrates using triggers to control when data is produced for each window
+ * Consider an example to understand the results generated by each type of trigger.
+ * The example uses "freeway" as the key. Event time is the timestamp associated with the data
+ * element and processing time is the time when the data element gets processed in the pipeline.
+ * For freeway 5, suppose there are 10 elements in the [10:00:00, 10:30:00) window.
+ * Key (freeway) | Value (total_flow) | event time | processing time
+ * 5 | 50 | 10:00:03 | 10:00:47
+ * 5 | 30 | 10:01:00 | 10:01:03
+ * 5 | 30 | 10:02:00 | 11:07:00
+ * 5 | 20 | 10:04:10 | 10:05:15
+ * 5 | 60 | 10:05:00 | 11:03:00
+ * 5 | 20 | 10:05:01 | 11:07:30
+ * 5 | 60 | 10:15:00 | 10:27:15
+ * 5 | 40 | 10:26:40 | 10:26:43
+ * 5 | 60 | 10:27:20 | 10:27:25
+ * 5 | 60 | 10:29:00 | 11:11:00
+ *
+ * <p> Dataflow tracks a watermark which records up to what point in event time the data is
+ * complete. For the purposes of the example, we'll assume the watermark is approximately 15m
+ * behind the current processing time. In practice, the actual value would vary over time based
+ * on the system's knowledge of the current PubSub delay and contents of the backlog (data
+ * that has not yet been processed).
+ *
+ * <p> If the watermark is 15m behind, then the window [10:00:00, 10:30:00) (in event time) would
+ * close at 10:44:59, when the watermark passes 10:30:00.
+ */
+ static class CalculateTotalFlow
+ extends PTransform <PCollection<KV<String, Integer>>, PCollectionList<TableRow>> {
+ // Duration of each fixed window, in minutes.
+ private int windowDuration;
+
+ CalculateTotalFlow(int windowDuration) {
+ this.windowDuration = windowDuration;
+ }
+
+ /**
+ * Windows the input four times, once per trigger configuration, and returns the four
+ * result collections in the order: default, withAllowedLateness, speculative, sequential.
+ */
+ @Override
+ public PCollectionList<TableRow> apply(PCollection<KV<String, Integer>> flowInfo) {
+
+ // Concept #1: The default triggering behavior
+ // By default Dataflow uses a trigger which fires when the watermark has passed the end of the
+ // window. This would be written {@code Repeatedly.forever(AfterWatermark.pastEndOfWindow())}.
+
+ // The system also defaults to dropping late data -- data which arrives after the watermark
+ // has passed the event timestamp of the arriving element. This means that the default trigger
+ // will only fire once.
+
+ // Each pane produced by the default trigger with no allowed lateness will be the first and
+ // last pane in the window, and will be ON_TIME.
+
+ // The results for the example above with the default trigger and zero allowed lateness
+ // would be:
+ // Key (freeway) | Value (total_flow) | number_of_records | isFirst | isLast | timing
+ // 5 | 260 | 6 | true | true | ON_TIME
+
+ // At 11:03:00 (processing time) the system watermark may have advanced to 10:54:00. As a
+ // result, when the data record with event time 10:05:00 arrives at 11:03:00, it is considered
+ // late, and dropped.
+
+ PCollection<TableRow> defaultTriggerResults = flowInfo
+ .apply("Default", Window
+ // The default window duration values work well if you're running the default input
+ // file. You may want to adjust the window duration otherwise.
+ .<KV<String, Integer>>into(FixedWindows.of(Duration.standardMinutes(windowDuration)))
+ // The default trigger first emits output when the system's watermark passes the end
+ // of the window.
+ .triggering(Repeatedly.forever(AfterWatermark.pastEndOfWindow()))
+ // Late data is dropped
+ .withAllowedLateness(Duration.ZERO)
+ // Discard elements after emitting each pane.
+ // With no allowed lateness and the specified trigger there will only be a single
+ // pane, so this doesn't have a noticeable effect. See concept 2 for more details.
+ .discardingFiredPanes())
+ .apply(new TotalFlow("default"));
+
+ // Concept #2: Late data with the default trigger
+ // This uses the same trigger as concept #1, but allows data that is up to ONE_DAY late. This
+ // leads to each window staying open for ONE_DAY after the watermark has passed the end of the
+ // window. Any late data will result in an additional pane being fired for that same window.
+
+ // The first pane produced will be ON_TIME and the remaining panes will be LATE.
+ // To definitely get the last pane when the window closes, use
+ // .withAllowedLateness(ONE_DAY, ClosingBehavior.FIRE_ALWAYS).
+
+ // The results for the example above with the default trigger and ONE_DAY allowed lateness
+ // would be:
+ // Key (freeway) | Value (total_flow) | number_of_records | isFirst | isLast | timing
+ // 5 | 260 | 6 | true | false | ON_TIME
+ // 5 | 60 | 1 | false | false | LATE
+ // 5 | 30 | 1 | false | false | LATE
+ // 5 | 20 | 1 | false | false | LATE
+ // 5 | 60 | 1 | false | false | LATE
+ PCollection<TableRow> withAllowedLatenessResults = flowInfo
+ .apply("WithLateData", Window
+ .<KV<String, Integer>>into(FixedWindows.of(Duration.standardMinutes(windowDuration)))
+ // Late data is emitted as it arrives
+ .triggering(Repeatedly.forever(AfterWatermark.pastEndOfWindow()))
+ // Once the output is produced, the pane is dropped and we start preparing the next
+ // pane for the window
+ .discardingFiredPanes()
+ // Late data is handled up to one day
+ .withAllowedLateness(ONE_DAY))
+ .apply(new TotalFlow("withAllowedLateness"));
+
+ // Concept #3: How to get speculative estimates
+ // We can specify a trigger that fires independent of the watermark, for instance after
+ // ONE_MINUTE of processing time. This allows us to produce speculative estimates before
+ // all the data is available. Since we don't have any triggers that depend on the watermark
+ // we don't get an ON_TIME firing. Instead, all panes are either EARLY or LATE.
+
+ // We also use accumulatingFiredPanes to build up the results across each pane firing.
+
+ // The results for the example above for this trigger would be:
+ // Key (freeway) | Value (total_flow) | number_of_records | isFirst | isLast | timing
+ // 5 | 80 | 2 | true | false | EARLY
+ // 5 | 100 | 3 | false | false | EARLY
+ // 5 | 260 | 6 | false | false | EARLY
+ // 5 | 320 | 7 | false | false | LATE
+ // 5 | 370 | 9 | false | false | LATE
+ // 5 | 430 | 10 | false | false | LATE
+ PCollection<TableRow> speculativeResults = flowInfo
+ .apply("Speculative" , Window
+ .<KV<String, Integer>>into(FixedWindows.of(Duration.standardMinutes(windowDuration)))
+ // Trigger fires every minute.
+ .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()
+ // Speculative every ONE_MINUTE
+ .plusDelayOf(ONE_MINUTE)))
+ // After emitting each pane, it will continue accumulating the elements so that each
+ // approximation includes all of the previous data in addition to the newly arrived
+ // data.
+ .accumulatingFiredPanes()
+ .withAllowedLateness(ONE_DAY))
+ .apply(new TotalFlow("speculative"));
+
+ // Concept #4: Combining late data and speculative estimates
+ // We can put the previous concepts together to get EARLY estimates, an ON_TIME result,
+ // and LATE updates based on late data.
+
+ // Each time a triggering condition is satisfied it advances to the next trigger.
+ // If there are new elements, this trigger emits a pane under the following conditions:
+ // > Early approximations every minute till the end of the window.
+ // > An on-time firing when the watermark has passed the end of the window
+ // > Every five minutes of late data.
+
+ // Every pane produced will either be EARLY, ON_TIME or LATE.
+
+ // The results for the example above for this trigger would be:
+ // Key (freeway) | Value (total_flow) | number_of_records | isFirst | isLast | timing
+ // 5 | 80 | 2 | true | false | EARLY
+ // 5 | 100 | 3 | false | false | EARLY
+ // 5 | 260 | 6 | false | false | EARLY
+ // [First pane fired after the end of the window]
+ // 5 | 320 | 7 | false | false | ON_TIME
+ // 5 | 430 | 10 | false | false | LATE
+
+ // For more possibilities of how to build advanced triggers, see {@link Trigger}.
+ PCollection<TableRow> sequentialResults = flowInfo
+ .apply("Sequential", Window
+ .<KV<String, Integer>>into(FixedWindows.of(Duration.standardMinutes(windowDuration)))
+ .triggering(AfterEach.inOrder(
+ Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()
+ // Speculative every ONE_MINUTE
+ .plusDelayOf(ONE_MINUTE)).orFinally(AfterWatermark.pastEndOfWindow()),
+ Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()
+ // Late data every FIVE_MINUTES
+ .plusDelayOf(FIVE_MINUTES))))
+ .accumulatingFiredPanes()
+ // For up to ONE_DAY
+ .withAllowedLateness(ONE_DAY))
+ .apply(new TotalFlow("sequential"));
+
+ // Adds the results generated by each trigger type to a PCollectionList.
+ PCollectionList<TableRow> resultsList = PCollectionList.of(defaultTriggerResults)
+ .and(withAllowedLatenessResults)
+ .and(speculativeResults)
+ .and(sequentialResults);
+
+ return resultsList;
+ }
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////////////////
+ // The remaining parts of the pipeline are needed to produce the output for each
+ // concept above. Not directly relevant to understanding the trigger examples.
+
+ /**
+ * Calculate total flow and number of records for each freeway and format the results to TableRow
+ * objects, to save to BigQuery.
+ */
+ static class TotalFlow extends
+ PTransform <PCollection<KV<String, Integer>>, PCollection<TableRow>> {
+ private String triggerType;
+
+ public TotalFlow(String triggerType) {
+ this.triggerType = triggerType;
+ }
+
+ @Override
+ public PCollection<TableRow> apply(PCollection<KV<String, Integer>> flowInfo) {
+ PCollection<KV<String, Iterable<Integer>>> flowPerFreeway = flowInfo
+ .apply(GroupByKey.<String, Integer>create());
+
+ PCollection<KV<String, String>> results = flowPerFreeway.apply(ParDo.of(
+ new DoFn <KV<String, Iterable<Integer>>, KV<String, String>>() {
+
+ @Override
+ public void processElement(ProcessContext c) throws Exception {
+ Iterable<Integer> flows = c.element().getValue();
+ Integer sum = 0;
+ Long numberOfRecords = 0L;
+ for (Integer value : flows) {
+ sum += value;
+ numberOfRecords++;
+ }
+ c.output(KV.of(c.element().getKey(), sum + "," + numberOfRecords));
+ }
+ }));
+ PCollection<TableRow> output = results.apply(ParDo.of(new FormatTotalFlow(triggerType)));
+ return output;
+ }
+ }
+
+ /**
+ * Format the results of the Total flow calculation to a TableRow, to save to BigQuery.
+ * Adds the triggerType, pane information, processing time and the window timestamp.
+ * */
+ static class FormatTotalFlow extends DoFn<KV<String, String>, TableRow>
+ implements RequiresWindowAccess {
+ private String triggerType;
+
+ public FormatTotalFlow(String triggerType) {
+ this.triggerType = triggerType;
+ }
+
+ @Override
+ public void processElement(ProcessContext c) throws Exception {
+ // The value is a comma-separated pair: total flow first, number of records second.
+ KV<String, String> flowSummary = c.element();
+ String[] parts = flowSummary.getValue().split(",");
+ int totalFlow = Integer.parseInt(parts[0]);
+ long recordCount = Long.parseLong(parts[1]);
+
+ // Columns are set in the same order as the original fluent chain.
+ TableRow row = new TableRow();
+ row.set("trigger_type", triggerType);
+ row.set("freeway", flowSummary.getKey());
+ row.set("total_flow", totalFlow);
+ row.set("number_of_records", recordCount);
+ row.set("window", c.window().toString());
+ row.set("isFirst", c.pane().isFirst());
+ row.set("isLast", c.pane().isLast());
+ row.set("timing", c.pane().getTiming().toString());
+ row.set("event_time", c.timestamp().toString());
+ // Wall-clock time at which this row was formatted.
+ row.set("processing_time", Instant.now().toString());
+ c.output(row);
+ }
+ }
+
+ /**
+ * Extract the freeway and total flow in a reading.
+ * Freeway is used as key since we are calculating the total flow for each freeway.
+ */
+ static class ExtractFlowInfo extends DoFn<String, KV<String, Integer>> {
+ @Override
+ public void processElement(ProcessContext c) throws Exception {
+ String[] fields = c.element().split(",");
+ // Drop the header row, and any record with fewer than 48 columns (invalid input).
+ boolean isHeader = fields[0].equals("timestamp");
+ if (isHeader || fields.length < 48) {
+ return;
+ }
+ // Column 2 is used as the freeway and column 7 as the total flow.
+ Integer totalFlow = tryIntegerParse(fields[7]);
+ // Emit only strictly positive flows: zero-flow records are ignored to make the trigger
+ // output easier to follow, and unparseable or negative (e.g. -1) values are invalid input.
+ if (totalFlow != null && totalFlow > 0) {
+ c.output(KV.of(fields[2], totalFlow));
+ }
+ }
+ }
+
+ /**
+ * Inherits standard configuration options.
+ */
+ public interface TrafficFlowOptions
+ extends ExamplePubsubTopicOptions, ExampleBigQueryTableOptions, DataflowExampleOptions {
+
+ /** File whose contents are injected into the Pub/Sub topic; empty skips the injection. */
+ @Default.String("gs://dataflow-samples/traffic_sensor/"
+ + "Freeways-5Minaa2010-01-01_to_2010-02-15.csv")
+ @Description("Input file to inject to Pub/Sub topic")
+ String getInput();
+
+ void setInput(String value);
+
+ /** Duration of the fixed windows, in minutes. */
+ @Default.Integer(WINDOW_DURATION)
+ @Description("Numeric value of window duration for fixed windows, in minutes")
+ Integer getWindowDuration();
+
+ void setWindowDuration(Integer value);
+ }
+
+ // Label key passed both to the Pub/Sub reader (timestampLabel) and to the file injector
+ // (withTimestampLabelKey), so both sides agree on where the record timestamp is carried.
+ private static final String PUBSUB_TIMESTAMP_LABEL_KEY = "timestamp_ms";
+
+ /**
+ * Entry point. Builds and runs the streaming pipeline that reads from Pub/Sub, computes the
+ * total flow and writes the results to BigQuery. When an input file is configured, it also
+ * starts a second, non-streaming pipeline that injects the file into the Pub/Sub topic.
+ *
+ * @param args command-line arguments, parsed into {@link TrafficFlowOptions}
+ * @throws Exception declared by the method; nothing is caught here
+ */
+ public static void main(String[] args) throws Exception {
+ TrafficFlowOptions options = PipelineOptionsFactory.fromArgs(args)
+ .withValidation()
+ .as(TrafficFlowOptions.class);
+ options.setStreaming(true);
+
+ // In order to cancel the pipelines automatically,
+ // {@code DataflowPipelineRunner} is forced to be used.
+ options.setRunner(DataflowPipelineRunner.class);
+ options.setBigQuerySchema(getSchema());
+
+ // The helper is given the fully configured options and set up before the pipeline is created.
+ DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options);
+ dataflowUtils.setup();
+
+ Pipeline pipeline = Pipeline.create(options);
+
+ TableReference tableRef = getTableReference(options.getProject(),
+ options.getBigQueryDataset(), options.getBigQueryTable());
+
+ // Read from Pub/Sub, extract (freeway, flow) pairs, then compute the total flow.
+ // CalculateTotalFlow returns a list of result collections.
+ PCollectionList<TableRow> resultList = pipeline.apply(PubsubIO.Read.named("ReadPubsubInput")
+ .timestampLabel(PUBSUB_TIMESTAMP_LABEL_KEY)
+ .topic(options.getPubsubTopic()))
+ .apply(ParDo.of(new ExtractFlowInfo()))
+ .apply(new CalculateTotalFlow(options.getWindowDuration()));
+
+ // Every result collection is written to the same BigQuery table with the same schema.
+ for (int i = 0; i < resultList.size(); i++){
+ resultList.get(i).apply(BigQueryIO.Write.to(tableRef).withSchema(getSchema()));
+ }
+
+ // The main pipeline is started first; the injector is launched only afterwards.
+ PipelineResult result = pipeline.run();
+ if (!options.getInput().isEmpty()){
+ //Inject the data into the pubsub topic
+ dataflowUtils.runInjectorPipeline(runInjector(options));
+ }
+ // dataflowUtils will try to cancel the pipeline and the injector before the program exits.
+ dataflowUtils.waitToFinish(result);
+ }
+
+ /**
+  * Builds the pipeline that publishes the input file to the Pub/Sub topic.
+  *
+  * <p>The options are cloned, switched to non-streaming, given the injector worker count and
+  * a job name with an {@code -injector} suffix. Despite the method name, the pipeline is only
+  * constructed here; it is returned without being run.
+  *
+  * @param options options of the main pipeline; supplies the input path, topic and job name
+  * @return the assembled injector pipeline
+  */
+ private static Pipeline runInjector(TrafficFlowOptions options) {
+   DataflowPipelineOptions injectorOptions = options.cloneAs(DataflowPipelineOptions.class);
+   injectorOptions.setStreaming(false);
+   injectorOptions.setNumWorkers(
+       options.as(DataflowExampleOptions.class).getInjectorNumWorkers());
+   String injectorJobName = options.getJobName() + "-injector";
+   injectorOptions.setJobName(injectorJobName);
+
+   Pipeline injector = Pipeline.create(injectorOptions);
+   injector
+       .apply(TextIO.Read.named("ReadMyFile").from(options.getInput()))
+       .apply(ParDo.named("InsertRandomDelays").of(new InsertDelays()))
+       .apply(IntraBundleParallelization
+           .of(PubsubFileInjector
+               .withTimestampLabelKey(PUBSUB_TIMESTAMP_LABEL_KEY)
+               .publish(options.getPubsubTopic()))
+           .withMaxParallelism(20));
+   return injector;
+ }
+
+ /**
+  * Stamps each record with the current time.
+  *
+  * <p>To demo the triggers, a record is made late with probability {@code THRESHOLD}: its
+  * timestamp is moved into the past by a whole number of minutes, from {@code MIN_DELAY} up to
+  * but not including {@code MAX_DELAY}.
+  */
+ public static class InsertDelays extends DoFn<String, String> {
+   private static final double THRESHOLD = 0.001;
+   // MIN_DELAY and MAX_DELAY in minutes.
+   private static final int MIN_DELAY = 1;
+   private static final int MAX_DELAY = 100;
+
+   @Override
+   public void processElement(ProcessContext c) throws Exception {
+     Instant now = Instant.now();
+     boolean onTime = !(Math.random() < THRESHOLD);
+     if (onTime) {
+       c.outputWithTimestamp(c.element(), now);
+       return;
+     }
+     // Late record: pick the lateness and shift the timestamp backwards.
+     int span = MAX_DELAY - MIN_DELAY;
+     int minutesLate = (int) (Math.random() * span) + MIN_DELAY;
+     long millisLate = TimeUnit.MINUTES.toMillis(minutesLate);
+     c.outputWithTimestamp(c.element(), new Instant(now.getMillis() - millisLate));
+   }
+ }
+
+
+ /**
+  * Creates a table reference for the given project, dataset and table.
+  *
+  * @param project project id to set on the reference
+  * @param dataset dataset id to set on the reference
+  * @param table table id to set on the reference
+  * @return a new {@link TableReference} carrying the three ids
+  */
+ private static TableReference getTableReference(String project, String dataset, String table) {
+   return new TableReference()
+       .setProjectId(project)
+       .setDatasetId(dataset)
+       .setTableId(table);
+ }
+
+ /**
+  * Defines the BigQuery schema used for the output.
+  *
+  * @return a schema with one field per column, in the order listed below
+  */
+ private static TableSchema getSchema() {
+   // Column name and BigQuery type of each output field, in output order.
+   String[][] columns = {
+     {"trigger_type", "STRING"},
+     {"freeway", "STRING"},
+     {"total_flow", "INTEGER"},
+     {"number_of_records", "INTEGER"},
+     {"window", "STRING"},
+     {"isFirst", "BOOLEAN"},
+     {"isLast", "BOOLEAN"},
+     {"timing", "STRING"},
+     {"event_time", "TIMESTAMP"},
+     {"processing_time", "TIMESTAMP"},
+   };
+   List<TableFieldSchema> fields = new ArrayList<>();
+   for (String[] column : columns) {
+     fields.add(new TableFieldSchema().setName(column[0]).setType(column[1]));
+   }
+   return new TableSchema().setFields(fields);
+ }
+
+ /**
+  * Parses a decimal integer without throwing on malformed input.
+  *
+  * @param number text to parse
+  * @return the parsed value, or {@code null} when the text is not a valid integer
+  */
+ private static Integer tryIntegerParse(String number) {
+   Integer parsed;
+   try {
+     parsed = Integer.valueOf(number);
+   } catch (NumberFormatException e) {
+     // Not a number: signal it with null rather than propagating the exception.
+     parsed = null;
+   }
+   return parsed;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/test/java/com/google/cloud/dataflow/examples/DebuggingWordCountTest.java
----------------------------------------------------------------------
diff --git a/examples/java/src/test/java/com/google/cloud/dataflow/examples/DebuggingWordCountTest.java b/examples/java/src/test/java/com/google/cloud/dataflow/examples/DebuggingWordCountTest.java
new file mode 100644
index 0000000..77d7bc8
--- /dev/null
+++ b/examples/java/src/test/java/com/google/cloud/dataflow/examples/DebuggingWordCountTest.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples;
+
+import com.google.common.io.Files;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.io.File;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Tests for {@link DebuggingWordCount}.
+ */
+@RunWith(JUnit4.class)
+public class DebuggingWordCountTest {
+ @Rule public TemporaryFolder tmpFolder = new TemporaryFolder();
+
+ @Test
+ public void testDebuggingWordCount() throws Exception {
+ File file = tmpFolder.newFile();
+ Files.write("stomach secret Flourish message Flourish here Flourish", file,
+ StandardCharsets.UTF_8);
+ DebuggingWordCount.main(new String[]{"--inputFile=" + file.getAbsolutePath()});
+ }
+}
+
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/test/java/com/google/cloud/dataflow/examples/WordCountTest.java
----------------------------------------------------------------------
diff --git a/examples/java/src/test/java/com/google/cloud/dataflow/examples/WordCountTest.java b/examples/java/src/test/java/com/google/cloud/dataflow/examples/WordCountTest.java
new file mode 100644
index 0000000..4542c48
--- /dev/null
+++ b/examples/java/src/test/java/com/google/cloud/dataflow/examples/WordCountTest.java
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples;
+
+import com.google.cloud.dataflow.examples.WordCount.CountWords;
+import com.google.cloud.dataflow.examples.WordCount.ExtractWordsFn;
+import com.google.cloud.dataflow.examples.WordCount.FormatAsTextFn;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.transforms.Create;
+import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
+import com.google.cloud.dataflow.sdk.transforms.MapElements;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import org.hamcrest.CoreMatchers;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Tests of WordCount.
+ */
+@RunWith(JUnit4.class)
+public class WordCountTest {
+
+  /** Example test that tests a specific DoFn. */
+  @Test
+  public void testExtractWordsFn() {
+    DoFnTester<String, String> tester = DoFnTester.of(new ExtractWordsFn());
+
+    List<String> fromOneLine = tester.processBatch(" some input words ");
+    Assert.assertThat(fromOneLine, CoreMatchers.hasItems("some", "input", "words"));
+
+    // NOTE(review): hasItems() with no expected items accepts any output, so this only checks
+    // that a blank line is processed without throwing.
+    List<String> fromBlankLine = tester.processBatch(" ");
+    Assert.assertThat(fromBlankLine, CoreMatchers.<String>hasItems());
+
+    List<String> fromThreeLines = tester.processBatch(" some ", " input", " words");
+    Assert.assertThat(fromThreeLines, CoreMatchers.hasItems("some", "input", "words"));
+  }
+
+  // Input lines for testCountWords; one of them is empty.
+  static final String[] WORDS_ARRAY = {
+    "hi there", "hi", "hi sue bob",
+    "hi sue", "", "bob hi"
+  };
+
+  static final List<String> WORDS = Arrays.asList(WORDS_ARRAY);
+
+  // Expected formatted output for WORDS, in no particular order.
+  static final String[] COUNTS_ARRAY = {
+    "hi: 5", "there: 1", "sue: 2", "bob: 2"
+  };
+
+  /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */
+  @Test
+  @Category(RunnableOnService.class)
+  public void testCountWords() throws Exception {
+    Pipeline pipeline = TestPipeline.create();
+
+    PCollection<String> lines =
+        pipeline.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));
+
+    PCollection<String> formattedCounts =
+        lines
+            .apply(new CountWords())
+            .apply(MapElements.via(new FormatAsTextFn()));
+
+    DataflowAssert.that(formattedCounts).containsInAnyOrder(COUNTS_ARRAY);
+    pipeline.run();
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/test/java/com/google/cloud/dataflow/examples/complete/AutoCompleteTest.java
----------------------------------------------------------------------
diff --git a/examples/java/src/test/java/com/google/cloud/dataflow/examples/complete/AutoCompleteTest.java b/examples/java/src/test/java/com/google/cloud/dataflow/examples/complete/AutoCompleteTest.java
new file mode 100644
index 0000000..aec1557
--- /dev/null
+++ b/examples/java/src/test/java/com/google/cloud/dataflow/examples/complete/AutoCompleteTest.java
@@ -0,0 +1,181 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete;
+
+import com.google.cloud.dataflow.examples.complete.AutoComplete.CompletionCandidate;
+import com.google.cloud.dataflow.examples.complete.AutoComplete.ComputeTopCompletions;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.transforms.Create;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.Filter;
+import com.google.cloud.dataflow.sdk.transforms.PTransform;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
+import com.google.cloud.dataflow.sdk.transforms.windowing.SlidingWindows;
+import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+import com.google.cloud.dataflow.sdk.values.TimestampedValue;
+
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+
+/**
+ * Tests of AutoComplete.
+ *
+ * <p>The class is parameterized: every test runs once with {@code recursive} set to
+ * {@code true} and once with {@code false}, and the flag is passed to
+ * {@link ComputeTopCompletions}. The expected output is the same for both values.
+ */
+@RunWith(Parameterized.class)
+public class AutoCompleteTest implements Serializable {
+ // Second constructor argument of ComputeTopCompletions in every test.
+ private boolean recursive;
+
+ public AutoCompleteTest(Boolean recursive) {
+ this.recursive = recursive;
+ }
+
+ // Supplies the two parameter sets: recursive = true and recursive = false.
+ @Parameterized.Parameters
+ public static Collection<Object[]> testRecursive() {
+ return Arrays.asList(new Object[][] {
+ { true },
+ { false }
+ });
+ }
+
+ // Checks the completions computed for a small word list, restricted to prefixes of at most
+ // two characters.
+ @Test
+ public void testAutoComplete() {
+ List<String> words = Arrays.asList(
+ "apple",
+ "apple",
+ "apricot",
+ "banana",
+ "blackberry",
+ "blackberry",
+ "blackberry",
+ "blueberry",
+ "blueberry",
+ "cherry");
+
+ Pipeline p = TestPipeline.create();
+
+ PCollection<String> input = p.apply(Create.of(words));
+
+ // Keep only entries whose key (the prefix) has length <= 2 so the expected set stays small.
+ PCollection<KV<String, List<CompletionCandidate>>> output =
+ input.apply(new ComputeTopCompletions(2, recursive))
+ .apply(Filter.byPredicate(
+ new SerializableFunction<KV<String, List<CompletionCandidate>>, Boolean>() {
+ @Override
+ public Boolean apply(KV<String, List<CompletionCandidate>> element) {
+ return element.getKey().length() <= 2;
+ }
+ }));
+
+ DataflowAssert.that(output).containsInAnyOrder(
+ KV.of("a", parseList("apple:2", "apricot:1")),
+ KV.of("ap", parseList("apple:2", "apricot:1")),
+ KV.of("b", parseList("blackberry:3", "blueberry:2")),
+ KV.of("ba", parseList("banana:1")),
+ KV.of("bl", parseList("blackberry:3", "blueberry:2")),
+ KV.of("c", parseList("cherry:1")),
+ KV.of("ch", parseList("cherry:1")));
+ p.run();
+ }
+
+ // Checks the unfiltered output for words that are prefixes of one another.
+ @Test
+ public void testTinyAutoComplete() {
+ List<String> words = Arrays.asList("x", "x", "x", "xy", "xy", "xyz");
+
+ Pipeline p = TestPipeline.create();
+
+ PCollection<String> input = p.apply(Create.of(words));
+
+ PCollection<KV<String, List<CompletionCandidate>>> output =
+ input.apply(new ComputeTopCompletions(2, recursive));
+
+ DataflowAssert.that(output).containsInAnyOrder(
+ KV.of("x", parseList("x:3", "xy:2")),
+ KV.of("xy", parseList("xy:2", "xyz:1")),
+ KV.of("xyz", parseList("xyz:1")));
+ p.run();
+ }
+
+ // Checks that completions are computed per window when the input is placed in sliding
+ // windows of size 2.
+ @Test
+ public void testWindowedAutoComplete() {
+ List<TimestampedValue<String>> words = Arrays.asList(
+ TimestampedValue.of("xA", new Instant(1)),
+ TimestampedValue.of("xA", new Instant(1)),
+ TimestampedValue.of("xB", new Instant(1)),
+ TimestampedValue.of("xB", new Instant(2)),
+ TimestampedValue.of("xB", new Instant(2)));
+
+ Pipeline p = TestPipeline.create();
+
+ // ReifyTimestamps turns each TimestampedValue into a plain element carrying that timestamp.
+ PCollection<String> input = p
+ .apply(Create.of(words))
+ .apply(new ReifyTimestamps<String>());
+
+ PCollection<KV<String, List<CompletionCandidate>>> output =
+ input.apply(Window.<String>into(SlidingWindows.of(new Duration(2))))
+ .apply(new ComputeTopCompletions(2, recursive));
+
+ DataflowAssert.that(output).containsInAnyOrder(
+ // Window [0, 2)
+ KV.of("x", parseList("xA:2", "xB:1")),
+ KV.of("xA", parseList("xA:2")),
+ KV.of("xB", parseList("xB:1")),
+
+ // Window [1, 3)
+ KV.of("x", parseList("xB:3", "xA:2")),
+ KV.of("xA", parseList("xA:2")),
+ KV.of("xB", parseList("xB:3")),
+
+ // Window [2, 4): holds only the elements with timestamp 2
+ KV.of("x", parseList("xB:2")),
+ KV.of("xB", parseList("xB:2")));
+ p.run();
+ }
+
+ // Builds the expected candidate list from "text:count" entries. Despite the local name
+ // countValue, index 0 is the text before the colon and index 1 is the integer after it.
+ private static List<CompletionCandidate> parseList(String... entries) {
+ List<CompletionCandidate> all = new ArrayList<>();
+ for (String s : entries) {
+ String[] countValue = s.split(":");
+ all.add(new CompletionCandidate(countValue[0], Integer.valueOf(countValue[1])));
+ }
+ return all;
+ }
+
+ // Re-emits the value of each TimestampedValue with that value's timestamp as the element
+ // timestamp.
+ private static class ReifyTimestamps<T>
+ extends PTransform<PCollection<TimestampedValue<T>>, PCollection<T>> {
+ @Override
+ public PCollection<T> apply(PCollection<TimestampedValue<T>> input) {
+ return input.apply(ParDo.of(new DoFn<TimestampedValue<T>, T>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ c.outputWithTimestamp(c.element().getValue(), c.element().getTimestamp());
+ }
+ }));
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/test/java/com/google/cloud/dataflow/examples/complete/TfIdfTest.java
----------------------------------------------------------------------
diff --git a/examples/java/src/test/java/com/google/cloud/dataflow/examples/complete/TfIdfTest.java b/examples/java/src/test/java/com/google/cloud/dataflow/examples/complete/TfIdfTest.java
new file mode 100644
index 0000000..5ee136c
--- /dev/null
+++ b/examples/java/src/test/java/com/google/cloud/dataflow/examples/complete/TfIdfTest.java
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete;
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.coders.StringDelegateCoder;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.transforms.Create;
+import com.google.cloud.dataflow.sdk.transforms.Keys;
+import com.google.cloud.dataflow.sdk.transforms.RemoveDuplicates;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.net.URI;
+import java.util.Arrays;
+
+/**
+ * Tests of {@link TfIdf}.
+ */
+@RunWith(JUnit4.class)
+public class TfIdfTest {
+
+  /**
+   * Test that the example runs: feeds three small documents through
+   * {@link TfIdf.ComputeTfIdf} and checks the set of distinct words in the result.
+   */
+  @Test
+  @Category(RunnableOnService.class)
+  public void testTfIdf() throws Exception {
+    Pipeline p = TestPipeline.create();
+
+    // URI needs a registered coder before it can be used as an element type.
+    p.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));
+
+    PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf =
+        p.apply(Create.of(
+                KV.of(new URI("x"), "a b c d"),
+                KV.of(new URI("y"), "a b c"),
+                KV.of(new URI("z"), "a m n")))
+            .apply(new TfIdf.ComputeTfIdf());
+
+    PCollection<String> allWords = wordToUriAndTfIdf.apply(Keys.<String>create());
+    PCollection<String> distinctWords = allWords.apply(RemoveDuplicates.<String>create());
+
+    DataflowAssert.that(distinctWords)
+        .containsInAnyOrder(Arrays.asList("a", "m", "n", "b", "c", "d"));
+
+    p.run();
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/test/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessionsTest.java
----------------------------------------------------------------------
diff --git a/examples/java/src/test/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessionsTest.java b/examples/java/src/test/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessionsTest.java
new file mode 100644
index 0000000..ce9de51
--- /dev/null
+++ b/examples/java/src/test/java/com/google/cloud/dataflow/examples/complete/TopWikipediaSessionsTest.java
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.complete;
+
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.transforms.Create;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.util.Arrays;
+
+/** Unit tests for {@link TopWikipediaSessions}. */
+@RunWith(JUnit4.class)
+public class TopWikipediaSessionsTest {
+
+  /** Builds an input row with the given timestamp and contributor name. */
+  private static TableRow edit(int timestamp, String username) {
+    return new TableRow().set("timestamp", timestamp).set("contributor_username", username);
+  }
+
+  /** Runs {@code ComputeTopSessions} over a handful of edits and checks the formatted output. */
+  @Test
+  @Category(RunnableOnService.class)
+  public void testComputeTopUsers() {
+    Pipeline pipeline = TestPipeline.create();
+
+    PCollection<String> topSessions =
+        pipeline
+            .apply(Create.of(Arrays.asList(
+                edit(0, "user1"),
+                edit(1, "user1"),
+                edit(2, "user1"),
+                edit(0, "user2"),
+                edit(1, "user2"),
+                edit(3601, "user2"),
+                edit(3602, "user2"),
+                edit(35 * 24 * 3600, "user3"))))
+            .apply(new TopWikipediaSessions.ComputeTopSessions(1.0));
+
+    DataflowAssert.that(topSessions).containsInAnyOrder(Arrays.asList(
+        "user1 : [1970-01-01T00:00:00.000Z..1970-01-01T01:00:02.000Z)"
+            + " : 3 : 1970-01-01T00:00:00.000Z",
+        "user3 : [1970-02-05T00:00:00.000Z..1970-02-05T01:00:00.000Z)"
+            + " : 1 : 1970-02-01T00:00:00.000Z"));
+
+    pipeline.run();
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoesTest.java
----------------------------------------------------------------------
diff --git a/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoesTest.java b/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoesTest.java
new file mode 100644
index 0000000..6dce4ed
--- /dev/null
+++ b/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/BigQueryTornadoesTest.java
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.cookbook;
+
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.cloud.dataflow.examples.cookbook.BigQueryTornadoes.ExtractTornadoesFn;
+import com.google.cloud.dataflow.examples.cookbook.BigQueryTornadoes.FormatCountsFn;
+import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
+import com.google.cloud.dataflow.sdk.values.KV;
+
+import org.hamcrest.CoreMatchers;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.util.List;
+
+/**
+ * Test case for {@link BigQueryTornadoes}.
+ */
+@RunWith(JUnit4.class)
+public class BigQueryTornadoesTest {
+
+ @Test
+ public void testExtractTornadoes() throws Exception {
+ TableRow row = new TableRow()
+ .set("month", "6")
+ .set("tornado", true);
+ DoFnTester<TableRow, Integer> extractWordsFn =
+ DoFnTester.of(new ExtractTornadoesFn());
+ Assert.assertThat(extractWordsFn.processBatch(row),
+ CoreMatchers.hasItems(6));
+ }
+
+ @Test
+ public void testNoTornadoes() throws Exception {
+ TableRow row = new TableRow()
+ .set("month", 6)
+ .set("tornado", false);
+ DoFnTester<TableRow, Integer> extractWordsFn =
+ DoFnTester.of(new ExtractTornadoesFn());
+ Assert.assertTrue(extractWordsFn.processBatch(row).isEmpty());
+ }
+
+ @Test
+ @SuppressWarnings({"rawtypes", "unchecked"})
+ public void testFormatCounts() throws Exception {
+ DoFnTester<KV<Integer, Long>, TableRow> formatCountsFn =
+ DoFnTester.of(new FormatCountsFn());
+ KV empty[] = {};
+ List<TableRow> results = formatCountsFn.processBatch(empty);
+ Assert.assertTrue(results.size() == 0);
+ KV input[] = { KV.of(3, 0L),
+ KV.of(4, Long.MAX_VALUE),
+ KV.of(5, Long.MIN_VALUE) };
+ results = formatCountsFn.processBatch(input);
+ Assert.assertEquals(results.size(), 3);
+ Assert.assertEquals(results.get(0).get("month"), 3);
+ Assert.assertEquals(results.get(0).get("tornado_count"), 0L);
+ Assert.assertEquals(results.get(1).get("month"), 4);
+ Assert.assertEquals(results.get(1).get("tornado_count"), Long.MAX_VALUE);
+ Assert.assertEquals(results.get(2).get("month"), 5);
+ Assert.assertEquals(results.get(2).get("tornado_count"), Long.MIN_VALUE);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamplesTest.java
----------------------------------------------------------------------
diff --git a/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamplesTest.java b/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamplesTest.java
new file mode 100644
index 0000000..fe4823d
--- /dev/null
+++ b/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/CombinePerKeyExamplesTest.java
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.cookbook;
+
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.cloud.dataflow.examples.cookbook.CombinePerKeyExamples.ExtractLargeWordsFn;
+import com.google.cloud.dataflow.examples.cookbook.CombinePerKeyExamples.FormatShakespeareOutputFn;
+import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
+import com.google.cloud.dataflow.sdk.values.KV;
+
+import org.hamcrest.CoreMatchers;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.util.List;
+
+/** Unit tests for {@link CombinePerKeyExamples}. */
+@RunWith(JUnit4.class)
+public class CombinePerKeyExamplesTest {
+
+  // Input rows: three with long words, two with short ones ("bob", "hi").
+  private static final TableRow ROW_1 =
+      new TableRow().set("corpus", "king_lear").set("word", "snuffleupaguses");
+  private static final TableRow ROW_2 =
+      new TableRow().set("corpus", "macbeth").set("word", "antidisestablishmentarianism");
+  private static final TableRow ROW_3 =
+      new TableRow().set("corpus", "king_lear").set("word", "antidisestablishmentarianism");
+  private static final TableRow ROW_4 =
+      new TableRow().set("corpus", "macbeth").set("word", "bob");
+  private static final TableRow ROW_5 =
+      new TableRow().set("corpus", "king_lear").set("word", "hi");
+
+  static final TableRow[] ROWS_ARRAY = {ROW_1, ROW_2, ROW_3, ROW_4, ROW_5};
+
+  // (word, corpus) pairs expected from the extraction step.
+  private static final KV<String, String> TUPLE_1 = KV.of("snuffleupaguses", "king_lear");
+  private static final KV<String, String> TUPLE_2 =
+      KV.of("antidisestablishmentarianism", "macbeth");
+  private static final KV<String, String> TUPLE_3 =
+      KV.of("antidisestablishmentarianism", "king_lear");
+
+  // (word, comma-separated corpora) pairs fed to the formatting step.
+  private static final KV<String, String> COMBINED_TUPLE_1 =
+      KV.of("antidisestablishmentarianism", "king_lear,macbeth");
+  private static final KV<String, String> COMBINED_TUPLE_2 =
+      KV.of("snuffleupaguses", "king_lear");
+
+  @SuppressWarnings({"unchecked", "rawtypes"})
+  static final KV<String, String>[] COMBINED_TUPLES_ARRAY = new KV[] {
+    COMBINED_TUPLE_1, COMBINED_TUPLE_2
+  };
+
+  // Rows expected from the formatting step.
+  private static final TableRow RESULT_ROW_1 =
+      new TableRow().set("word", "snuffleupaguses").set("all_plays", "king_lear");
+  private static final TableRow RESULT_ROW_2 =
+      new TableRow()
+          .set("word", "antidisestablishmentarianism")
+          .set("all_plays", "king_lear,macbeth");
+
+  /** The extraction step emits a (word, corpus) pair for each of the three long words. */
+  @Test
+  public void testExtractLargeWordsFn() {
+    DoFnTester<TableRow, KV<String, String>> tester =
+        DoFnTester.of(new ExtractLargeWordsFn());
+    List<KV<String, String>> extracted = tester.processBatch(ROWS_ARRAY);
+    Assert.assertThat(extracted, CoreMatchers.hasItem(TUPLE_1));
+    Assert.assertThat(extracted, CoreMatchers.hasItem(TUPLE_2));
+    Assert.assertThat(extracted, CoreMatchers.hasItem(TUPLE_3));
+  }
+
+  /** The formatting step turns each combined pair into a row with word and all_plays. */
+  @Test
+  public void testFormatShakespeareOutputFn() {
+    DoFnTester<KV<String, String>, TableRow> tester =
+        DoFnTester.of(new FormatShakespeareOutputFn());
+    List<TableRow> formatted = tester.processBatch(COMBINED_TUPLES_ARRAY);
+    Assert.assertThat(formatted, CoreMatchers.hasItem(RESULT_ROW_1));
+    Assert.assertThat(formatted, CoreMatchers.hasItem(RESULT_ROW_2));
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/DeDupExampleTest.java
----------------------------------------------------------------------
diff --git a/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/DeDupExampleTest.java b/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/DeDupExampleTest.java
new file mode 100644
index 0000000..bce6b11
--- /dev/null
+++ b/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/DeDupExampleTest.java
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.cookbook;
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.transforms.Create;
+import com.google.cloud.dataflow.sdk.transforms.RemoveDuplicates;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.util.Arrays;
+import java.util.List;
+
+/** Unit tests for {@link DeDupExample}. */
+@RunWith(JUnit4.class)
+public class DeDupExampleTest {
+
+ @Test
+ @Category(RunnableOnService.class)
+ public void testRemoveDuplicates() {
+ List<String> strings = Arrays.asList(
+ "k1",
+ "k5",
+ "k5",
+ "k2",
+ "k1",
+ "k2",
+ "k3");
+
+ Pipeline p = TestPipeline.create();
+
+ PCollection<String> input =
+ p.apply(Create.of(strings)
+ .withCoder(StringUtf8Coder.of()));
+
+ PCollection<String> output =
+ input.apply(RemoveDuplicates.<String>create());
+
+ DataflowAssert.that(output)
+ .containsInAnyOrder("k1", "k5", "k2", "k3");
+ p.run();
+ }
+
+ @Test
+ @Category(RunnableOnService.class)
+ public void testRemoveDuplicatesEmpty() {
+ List<String> strings = Arrays.asList();
+
+ Pipeline p = TestPipeline.create();
+
+ PCollection<String> input =
+ p.apply(Create.of(strings)
+ .withCoder(StringUtf8Coder.of()));
+
+ PCollection<String> output =
+ input.apply(RemoveDuplicates.<String>create());
+
+ DataflowAssert.that(output).empty();
+ p.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/FilterExamplesTest.java
----------------------------------------------------------------------
diff --git a/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/FilterExamplesTest.java b/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/FilterExamplesTest.java
new file mode 100644
index 0000000..6d822f9
--- /dev/null
+++ b/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/FilterExamplesTest.java
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.cookbook;
+
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.cloud.dataflow.examples.cookbook.FilterExamples.FilterSingleMonthDataFn;
+import com.google.cloud.dataflow.examples.cookbook.FilterExamples.ProjectionFn;
+import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
+
+import org.hamcrest.CoreMatchers;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.util.Arrays;
+import java.util.List;
+
+/** Unit tests for {@link FilterExamples}. */
+@RunWith(JUnit4.class)
+public class FilterExamplesTest {
+
+ private static final TableRow row1 = new TableRow()
+ .set("month", "6").set("day", "21")
+ .set("year", "2014").set("mean_temp", "85.3")
+ .set("tornado", true);
+ private static final TableRow row2 = new TableRow()
+ .set("month", "7").set("day", "20")
+ .set("year", "2014").set("mean_temp", "75.4")
+ .set("tornado", false);
+ private static final TableRow row3 = new TableRow()
+ .set("month", "6").set("day", "18")
+ .set("year", "2014").set("mean_temp", "45.3")
+ .set("tornado", true);
+ static final TableRow[] ROWS_ARRAY = new TableRow[] {
+ row1, row2, row3
+ };
+ static final List<TableRow> ROWS = Arrays.asList(ROWS_ARRAY);
+
+ private static final TableRow outRow1 = new TableRow()
+ .set("year", 2014).set("month", 6)
+ .set("day", 21).set("mean_temp", 85.3);
+ private static final TableRow outRow2 = new TableRow()
+ .set("year", 2014).set("month", 7)
+ .set("day", 20).set("mean_temp", 75.4);
+ private static final TableRow outRow3 = new TableRow()
+ .set("year", 2014).set("month", 6)
+ .set("day", 18).set("mean_temp", 45.3);
+ private static final TableRow[] PROJROWS_ARRAY = new TableRow[] {
+ outRow1, outRow2, outRow3
+ };
+
+
+ @Test
+ public void testProjectionFn() {
+ DoFnTester<TableRow, TableRow> projectionFn =
+ DoFnTester.of(new ProjectionFn());
+ List<TableRow> results = projectionFn.processBatch(ROWS_ARRAY);
+ Assert.assertThat(results, CoreMatchers.hasItem(outRow1));
+ Assert.assertThat(results, CoreMatchers.hasItem(outRow2));
+ Assert.assertThat(results, CoreMatchers.hasItem(outRow3));
+ }
+
+ @Test
+ public void testFilterSingleMonthDataFn() {
+ DoFnTester<TableRow, TableRow> filterSingleMonthDataFn =
+ DoFnTester.of(new FilterSingleMonthDataFn(7));
+ List<TableRow> results = filterSingleMonthDataFn.processBatch(PROJROWS_ARRAY);
+ Assert.assertThat(results, CoreMatchers.hasItem(outRow2));
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/2eaa709c/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/JoinExamplesTest.java
----------------------------------------------------------------------
diff --git a/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/JoinExamplesTest.java b/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/JoinExamplesTest.java
new file mode 100644
index 0000000..db3ae34
--- /dev/null
+++ b/examples/java/src/test/java/com/google/cloud/dataflow/examples/cookbook/JoinExamplesTest.java
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.google.cloud.dataflow.examples.cookbook;
+
+import com.google.api.services.bigquery.model.TableRow;
+import com.google.cloud.dataflow.examples.cookbook.JoinExamples.ExtractCountryInfoFn;
+import com.google.cloud.dataflow.examples.cookbook.JoinExamples.ExtractEventDataFn;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.transforms.Create;
+import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
+import com.google.cloud.dataflow.sdk.values.KV;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import org.hamcrest.CoreMatchers;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.util.Arrays;
+import java.util.List;
+
+/** Unit tests for {@link JoinExamples}. */
+@RunWith(JUnit4.class)
+public class JoinExamplesTest {
+
+ private static final TableRow row1 = new TableRow()
+ .set("ActionGeo_CountryCode", "VM").set("SQLDATE", "20141212")
+ .set("Actor1Name", "BANGKOK").set("SOURCEURL", "http://cnn.com");
+ private static final TableRow row2 = new TableRow()
+ .set("ActionGeo_CountryCode", "VM").set("SQLDATE", "20141212")
+ .set("Actor1Name", "LAOS").set("SOURCEURL", "http://www.chicagotribune.com");
+ private static final TableRow row3 = new TableRow()
+ .set("ActionGeo_CountryCode", "BE").set("SQLDATE", "20141213")
+ .set("Actor1Name", "AFGHANISTAN").set("SOURCEURL", "http://cnn.com");
+ static final TableRow[] EVENTS = new TableRow[] {
+ row1, row2, row3
+ };
+ static final List<TableRow> EVENT_ARRAY = Arrays.asList(EVENTS);
+
+ private static final KV<String, String> kv1 = KV.of("VM",
+ "Date: 20141212, Actor1: LAOS, url: http://www.chicagotribune.com");
+ private static final KV<String, String> kv2 = KV.of("BE",
+ "Date: 20141213, Actor1: AFGHANISTAN, url: http://cnn.com");
+ private static final KV<String, String> kv3 = KV.of("BE", "Belgium");
+ private static final KV<String, String> kv4 = KV.of("VM", "Vietnam");
+
+ private static final TableRow cc1 = new TableRow()
+ .set("FIPSCC", "VM").set("HumanName", "Vietnam");
+ private static final TableRow cc2 = new TableRow()
+ .set("FIPSCC", "BE").set("HumanName", "Belgium");
+ static final TableRow[] CCS = new TableRow[] {
+ cc1, cc2
+ };
+ static final List<TableRow> CC_ARRAY = Arrays.asList(CCS);
+
+ static final String[] JOINED_EVENTS = new String[] {
+ "Country code: VM, Country name: Vietnam, Event info: Date: 20141212, Actor1: LAOS, "
+ + "url: http://www.chicagotribune.com",
+ "Country code: VM, Country name: Vietnam, Event info: Date: 20141212, Actor1: BANGKOK, "
+ + "url: http://cnn.com",
+ "Country code: BE, Country name: Belgium, Event info: Date: 20141213, Actor1: AFGHANISTAN, "
+ + "url: http://cnn.com"
+ };
+
+ @Test
+ public void testExtractEventDataFn() {
+ DoFnTester<TableRow, KV<String, String>> extractEventDataFn =
+ DoFnTester.of(new ExtractEventDataFn());
+ List<KV<String, String>> results = extractEventDataFn.processBatch(EVENTS);
+ Assert.assertThat(results, CoreMatchers.hasItem(kv1));
+ Assert.assertThat(results, CoreMatchers.hasItem(kv2));
+ }
+
+ @Test
+ public void testExtractCountryInfoFn() {
+ DoFnTester<TableRow, KV<String, String>> extractCountryInfoFn =
+ DoFnTester.of(new ExtractCountryInfoFn());
+ List<KV<String, String>> results = extractCountryInfoFn.processBatch(CCS);
+ Assert.assertThat(results, CoreMatchers.hasItem(kv3));
+ Assert.assertThat(results, CoreMatchers.hasItem(kv4));
+ }
+
+
+ @Test
+ @Category(RunnableOnService.class)
+ public void testJoin() throws java.lang.Exception {
+ Pipeline p = TestPipeline.create();
+ PCollection<TableRow> input1 = p.apply("CreateEvent", Create.of(EVENT_ARRAY));
+ PCollection<TableRow> input2 = p.apply("CreateCC", Create.of(CC_ARRAY));
+
+ PCollection<String> output = JoinExamples.joinEvents(input1, input2);
+ DataflowAssert.that(output).containsInAnyOrder(JOINED_EVENTS);
+ p.run();
+ }
+}
[15/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/WindowFn.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/WindowFn.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/WindowFn.java
deleted file mode 100644
index d51fc7e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/WindowFn.java
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms.windowing;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.common.collect.Ordering;
-
-import org.joda.time.Instant;
-
-import java.io.Serializable;
-import java.util.Collection;
-
-/**
- * The argument to the {@link Window} transform used to assign elements into
- * windows and to determine how windows are merged. See {@link Window} for more
- * information on how {@code WindowFn}s are used and for a library of
- * predefined {@code WindowFn}s.
- *
- * <p>Users will generally want to use the predefined
- * {@code WindowFn}s, but it is also possible to create new
- * subclasses.
- *
- * <p>To create a custom {@code WindowFn}, inherit from this class and override all required
- * methods. If no merging is required, inherit from {@link NonMergingWindowFn}
- * instead. If no merging is required and each element is assigned to a single window, inherit from
- * {@code PartitioningWindowFn}. Inheriting from the most specific subclass will enable more
- * optimizations in the runner.
- *
- * @param <T> type of elements being windowed
- * @param <W> {@link BoundedWindow} subclass used to represent the
- * windows used by this {@code WindowFn}
- */
-public abstract class WindowFn<T, W extends BoundedWindow>
- implements Serializable {
- /**
- * Information available when running {@link #assignWindows}.
- */
- public abstract class AssignContext {
- /**
- * Returns the current element.
- */
- public abstract T element();
-
- /**
- * Returns the timestamp of the current element.
- */
- public abstract Instant timestamp();
-
- /**
- * Returns the windows the current element was in, prior to this
- * {@code WindowFn} being called.
- */
- public abstract Collection<? extends BoundedWindow> windows();
- }
-
- /**
- * Given a timestamp and element, returns the set of windows into which it
- * should be placed.
- */
- public abstract Collection<W> assignWindows(AssignContext c) throws Exception;
-
- /**
- * Information available when running {@link #mergeWindows}.
- */
- public abstract class MergeContext {
- /**
- * Returns the current set of windows.
- */
- public abstract Collection<W> windows();
-
- /**
- * Signals to the framework that the windows in {@code toBeMerged} should
- * be merged together to form {@code mergeResult}.
- *
- * <p>{@code toBeMerged} should be a subset of {@link #windows}
- * and disjoint from the {@code toBeMerged} set of previous calls
- * to {@code merge}.
- *
- * <p>{@code mergeResult} must either not be in {@link #windows} or be in
- * {@code toBeMerged}.
- *
- * @throws IllegalArgumentException if any elements of toBeMerged are not
- * in windows(), or have already been merged
- */
- public abstract void merge(Collection<W> toBeMerged, W mergeResult)
- throws Exception;
- }
-
- /**
- * Does whatever merging of windows is necessary.
- *
- * <p>See {@link MergeOverlappingIntervalWindows#mergeWindows} for an
- * example of how to override this method.
- */
- public abstract void mergeWindows(MergeContext c) throws Exception;
-
- /**
- * Returns whether this performs the same merging as the given
- * {@code WindowFn}.
- */
- public abstract boolean isCompatible(WindowFn<?, ?> other);
-
- /**
- * Returns the {@link Coder} used for serializing the windows used
- * by this windowFn.
- */
- public abstract Coder<W> windowCoder();
-
- /**
- * Returns the window of the side input corresponding to the given window of
- * the main input.
- *
- * <p>Authors of custom {@code WindowFn}s should override this.
- */
- public abstract W getSideInputWindow(final BoundedWindow window);
-
- /**
- * @deprecated Implement {@link #getOutputTimeFn} to return one of the appropriate
- * {@link OutputTimeFns}, or a custom {@link OutputTimeFn} extending
- * {@link OutputTimeFn.Defaults}.
- */
- @Deprecated
- @Experimental(Kind.OUTPUT_TIME)
- public Instant getOutputTime(Instant inputTimestamp, W window) {
- return getOutputTimeFn().assignOutputTime(inputTimestamp, window);
- }
-
- /**
- * Provides a default implementation for {@link WindowingStrategy#getOutputTimeFn()}.
- * See the full specification there.
- *
- * <p>If this {@link WindowFn} doesn't produce overlapping windows, this need not (and probably
- * should not) override any of the default implementations in {@link OutputTimeFn.Defaults}.
- *
- * <p>If this {@link WindowFn} does produce overlapping windows that can be predicted here, it is
- * suggested that the result in later overlapping windows is past the end of earlier windows so
- * that the later windows don't prevent the watermark from progressing past the end of the earlier
- * window.
- *
- * <p>For example, a timestamp in a sliding window should be moved past the beginning of the next
- * sliding window. See {@link SlidingWindows#getOutputTimeFn}.
- */
- @Experimental(Kind.OUTPUT_TIME)
- public OutputTimeFn<? super W> getOutputTimeFn() {
- return new OutputAtEarliestAssignedTimestamp<>(this);
- }
-
- /**
- * Returns true if this {@code WindowFn} never needs to merge any windows.
- */
- public boolean isNonMerging() {
- return false;
- }
-
- /**
- * Returns true if this {@code WindowFn} assigns each element to a single window.
- */
- public boolean assignsToSingleWindow() {
- return false;
- }
-
- /**
- * A compatibility adapter that will return the assigned timestamps according to the
- * {@link WindowFn}, which was the prior policy. Specifying the assigned output timestamps
- * on the {@link WindowFn} is now deprecated.
- */
- private static class OutputAtEarliestAssignedTimestamp<W extends BoundedWindow>
- extends OutputTimeFn.Defaults<W> {
-
- private final WindowFn<?, W> windowFn;
-
- public OutputAtEarliestAssignedTimestamp(WindowFn<?, W> windowFn) {
- this.windowFn = windowFn;
- }
-
- /**
- * {@inheritDoc}
- *
- * @return the result of {@link WindowFn#getOutputTime windowFn.getOutputTime()}.
- */
- @Override
- @SuppressWarnings("deprecation") // this is an adapter for the deprecated behavior
- public Instant assignOutputTime(Instant timestamp, W window) {
- return windowFn.getOutputTime(timestamp, window);
- }
-
- @Override
- public Instant combine(Instant outputTime, Instant otherOutputTime) {
- return Ordering.natural().min(outputTime, otherOutputTime);
- }
-
- /**
- * {@inheritDoc}
- *
- * @return {@code true}. When the {@link OutputTimeFn} is not overridden by {@link WindowFn}
- * or {@link WindowingStrategy}, the minimum output timestamp is taken, which depends
- * only on the minimum input timestamp by monotonicity of {@link #assignOutputTime}.
- */
- @Override
- public boolean dependsOnlyOnEarliestInputTimestamp() {
- return true;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/package-info.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/package-info.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/package-info.java
deleted file mode 100644
index 65ccf71..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/package-info.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-/**
- * Defines the {@link com.google.cloud.dataflow.sdk.transforms.windowing.Window} transform
- * for dividing the elements in a PCollection into windows, and the
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.Trigger} for controlling when those
- * elements are output.
- *
- * <p>{@code Window} logically divides up or groups the elements of a
- * {@link com.google.cloud.dataflow.sdk.values.PCollection} into finite windows according to a
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}.
- * The output of {@code Window} contains the same elements as input, but they
- * have been logically assigned to windows. The next
- * {@link com.google.cloud.dataflow.sdk.transforms.GroupByKey}s, including one
- * within composite transforms, will group by the combination of keys and
- * windows.
- *
- * <p>Windowing a {@code PCollection} allows chunks of it to be processed
- * individually, before the entire {@code PCollection} is available. This is
- * especially important for {@code PCollection}s with unbounded size, since the full
- * {@code PCollection} is never available at once.
- *
- * <p>For {@code PCollection}s with a bounded size, by default, all data is implicitly in a
- * single window, and this replicates conventional batch mode. However, windowing can still be a
- * convenient way to express time-sliced algorithms over bounded {@code PCollection}s.
- *
- * <p>As elements are assigned to a window, they are placed into a pane. When the trigger fires,
- * all of the elements in the current pane are output.
- *
- * <p>The {@link com.google.cloud.dataflow.sdk.transforms.windowing.DefaultTrigger} will output a
- * window when the system watermark passes the end of the window. See
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.AfterWatermark} for details on the
- * watermark.
- */
-package com.google.cloud.dataflow.sdk.transforms.windowing;
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ActiveWindowSet.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ActiveWindowSet.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ActiveWindowSet.java
deleted file mode 100644
index 69350cb..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ActiveWindowSet.java
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
-
-import java.util.Collection;
-import java.util.Set;
-
-import javax.annotation.Nullable;
-
-/**
- * Track which active windows have their state associated with merged-away windows.
- *
- * <p>When windows are merged we must track which state previously associated with the merged windows
- * must now be associated with the result window. Some of that state may be combined eagerly when
- * the windows are merged. The rest is combined lazily when the final state is actually
- * required when emitting a pane. We keep track of this using an {@link ActiveWindowSet}.
- *
- * <p>An {@link ActiveWindowSet} considers a window to be in one of the following states:
- *
- * <ol>
- * <li><b>NEW</b>: The initial state for a window on an incoming element; we do not yet know
- * if it should be merged into an ACTIVE window, or whether it is already present as an
- * ACTIVE window, since we have not yet called
- * {@link WindowFn#mergeWindows}.</li>
- * <li><b>ACTIVE</b>: A window that has state associated with it and has not itself been merged
- * away. The window may have one or more <i>state address</i> windows under which its
- * non-empty state is stored. A state value for an ACTIVE window must be derived by reading
- * the state in all of its state address windows.</li>
- * <li><b>EPHEMERAL</b>: A NEW window that has been merged into an ACTIVE window before any state
- * has been associated with that window. Thus the window is neither ACTIVE nor MERGED. These
- * windows are not persistently represented since if they reappear the merge function should
- * again redirect them to an ACTIVE window. EPHEMERAL windows are an optimization for
- * the common case of in-order events and {@link Sessions session windows} by never associating
- * state with windows that are created and immediately merged away.</li>
- * <li><b>MERGED</b>: An ACTIVE window has been merged into another ACTIVE window after it had
- * state associated with it. The window will thus appear as a state address window for exactly
- * one ACTIVE window.</li>
- * <li><b>EXPIRED</b>: The window has expired and may have been garbage collected. No new elements
- * (even late elements) will ever be assigned to that window. These windows are not explicitly
- * represented anywhere; it is expected that the user of {@link ActiveWindowSet} will store
- * no state associated with the window.</li>
- * </ol>
- *
- * <p>
- *
- * <p>If no windows will ever be merged we can use the trivial implementation {@link
- * NonMergingActiveWindowSet}. Otherwise, the actual implementation of this data structure is in
- * {@link MergingActiveWindowSet}.
- *
- * @param <W> the type of window being managed
- */
-public interface ActiveWindowSet<W extends BoundedWindow> {
- /**
- * Callback for {@link #merge}.
- */
- public interface MergeCallback<W extends BoundedWindow> {
- /**
- * Called when windows are about to be merged, but before any {@link #onMerge} callback
- * has been made.
- */
- void prefetchOnMerge(Collection<W> toBeMerged, Collection<W> activeToBeMerged, W mergeResult)
- throws Exception;
-
- /**
- * Called when windows are about to be merged, after all {@link #prefetchOnMerge} calls
- * have been made, but before the active window set has been updated to reflect the merge.
- *
- * @param toBeMerged the windows about to be merged.
- * @param activeToBeMerged the subset of {@code toBeMerged} corresponding to windows which
- * are currently ACTIVE (and about to be merged). The remaining windows have been deemed
- * EPHEMERAL, and thus have no state associated with them.
- * @param mergeResult the result window, either a member of {@code toBeMerged} or new.
- */
- void onMerge(Collection<W> toBeMerged, Collection<W> activeToBeMerged, W mergeResult)
- throws Exception;
- }
-
- /**
- * Remove EPHEMERAL windows since we only need to know about them while processing new elements.
- */
- void removeEphemeralWindows();
-
- /**
- * Save any state changes needed.
- */
- void persist();
-
- /**
- * Return the ACTIVE window into which {@code window} has been merged.
- * Return {@code window} itself if it is ACTIVE. Return null if {@code window} has not
- * yet been seen.
- */
- @Nullable
- W representative(W window);
-
- /**
- * Return (a view of) the set of currently ACTIVE windows.
- */
- Set<W> getActiveWindows();
-
- /**
- * Return {@code true} if {@code window} is ACTIVE.
- */
- boolean isActive(W window);
-
- /**
- * If {@code window} is not already known to be ACTIVE, MERGED or EPHEMERAL then add it
- * as NEW. All NEW windows will be accounted for as ACTIVE, MERGED or EPHEMERAL by a call
- * to {@link #merge}.
- */
- void addNew(W window);
-
- /**
- * If {@code window} is not already known to be ACTIVE, MERGED or EPHEMERAL then add it
- * as ACTIVE.
- */
- void addActive(W window);
-
- /**
- * Remove {@code window} from the set.
- */
- void remove(W window);
-
- /**
- * Invoke {@link WindowFn#mergeWindows} on the {@code WindowFn} associated with this window set,
- * merging as many of the active windows as possible. {@code mergeCallback} will be invoked for
- * each group of windows that are merged. After this no NEW windows will remain, all merge
- * result windows will be ACTIVE, and all windows which have been merged away will not be ACTIVE.
- */
- void merge(MergeCallback<W> mergeCallback) throws Exception;
-
- /**
- * Signal that all state in {@link #readStateAddresses} for {@code window} has been merged into
- * the {@link #writeStateAddress} for {@code window}.
- */
- void merged(W window);
-
- /**
- * Return the state address windows for ACTIVE {@code window} from which all state associated
- * should be read and merged.
- */
- Set<W> readStateAddresses(W window);
-
- /**
- * Return the state address window of ACTIVE {@code window} into which all new state should be
- * written. Always one of the results of {@link #readStateAddresses}.
- */
- W writeStateAddress(W window);
-
- /**
- * Return the state address window into which all new state should be written after
- * ACTIVE windows {@code toBeMerged} have been merged into {@code mergeResult}.
- */
- W mergedWriteStateAddress(Collection<W> toBeMerged, W mergeResult);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ApiSurface.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ApiSurface.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ApiSurface.java
deleted file mode 100644
index 7a9c877..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ApiSurface.java
+++ /dev/null
@@ -1,642 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.common.base.Joiner;
-import com.google.common.base.Supplier;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import com.google.common.collect.Multimap;
-import com.google.common.collect.Multimaps;
-import com.google.common.collect.Sets;
-import com.google.common.reflect.ClassPath;
-import com.google.common.reflect.ClassPath.ClassInfo;
-import com.google.common.reflect.Invokable;
-import com.google.common.reflect.Parameter;
-import com.google.common.reflect.TypeToken;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.lang.annotation.Annotation;
-import java.lang.reflect.Constructor;
-import java.lang.reflect.Field;
-import java.lang.reflect.GenericArrayType;
-import java.lang.reflect.Method;
-import java.lang.reflect.Modifier;
-import java.lang.reflect.ParameterizedType;
-import java.lang.reflect.Type;
-import java.lang.reflect.TypeVariable;
-import java.lang.reflect.WildcardType;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.Set;
-import java.util.regex.Pattern;
-
-/**
- * Represents the API surface of a package prefix. Used for accessing public classes,
- * methods, and the types they reference, to control what dependencies are re-exported.
- *
- * <p>For the purposes of calculating the public API surface, exposure includes any public
- * or protected occurrence of:
- *
- * <ul>
- * <li>superclasses
- * <li>interfaces implemented
- * <li>actual type arguments to generic types
- * <li>array component types
- * <li>method return types
- * <li>method parameter types
- * <li>type variable bounds
- * <li>wildcard bounds
- * </ul>
- *
- * <p>Exposure is a transitive property. The resulting map excludes primitives
- * and array classes themselves.
- *
- * <p>It is prudent (though not required) to prune prefixes like "java" via the builder
- * method {@link #pruningPrefix} to halt the traversal so it does not uselessly catalog references
- * that are not interesting.
- */
-@SuppressWarnings("rawtypes")
-public class ApiSurface {
- private static Logger logger = LoggerFactory.getLogger(ApiSurface.class);
-
- /**
- * Returns an empty {@link ApiSurface}.
- */
- public static ApiSurface empty() {
- logger.debug("Returning an empty ApiSurface");
- return new ApiSurface(Collections.<Class<?>>emptySet(), Collections.<Pattern>emptySet());
- }
-
- /**
- * Returns an {@link ApiSurface} object representing the given package and all subpackages.
- */
- public static ApiSurface ofPackage(String packageName) throws IOException {
- return ApiSurface.empty().includingPackage(packageName);
- }
-
- /**
- * Returns an {@link ApiSurface} object representing just the surface of the given class.
- */
- public static ApiSurface ofClass(Class<?> clazz) {
- return ApiSurface.empty().includingClass(clazz);
- }
-
- /**
- * Returns an {@link ApiSurface} like this one, but also including the named
- * package and all of its subpackages.
- */
- public ApiSurface includingPackage(String packageName) throws IOException {
- ClassPath classPath = ClassPath.from(ClassLoader.getSystemClassLoader());
-
- Set<Class<?>> newRootClasses = Sets.newHashSet();
- for (ClassInfo classInfo : classPath.getTopLevelClassesRecursive(packageName)) {
- Class clazz = classInfo.load();
- if (exposed(clazz.getModifiers())) {
- newRootClasses.add(clazz);
- }
- }
- logger.debug("Including package {} and subpackages: {}", packageName, newRootClasses);
- newRootClasses.addAll(rootClasses);
-
- return new ApiSurface(newRootClasses, patternsToPrune);
- }
-
- /**
- * Returns an {@link ApiSurface} like this one, but also including the given class.
- */
- public ApiSurface includingClass(Class<?> clazz) {
- Set<Class<?>> newRootClasses = Sets.newHashSet();
- logger.debug("Including class {}", clazz);
- newRootClasses.add(clazz);
- newRootClasses.addAll(rootClasses);
- return new ApiSurface(newRootClasses, patternsToPrune);
- }
-
- /**
- * Returns an {@link ApiSurface} like this one, but pruning transitive
- * references from classes whose full name (including package) begins with the provided prefix.
- */
- public ApiSurface pruningPrefix(String prefix) {
- return pruningPattern(Pattern.compile(Pattern.quote(prefix) + ".*"));
- }
-
- /**
- * Returns an {@link ApiSurface} like this one, but pruning references from the named
- * class.
- */
- public ApiSurface pruningClassName(String className) {
- return pruningPattern(Pattern.compile(Pattern.quote(className)));
- }
-
- /**
- * Returns an {@link ApiSurface} like this one, but pruning references from the
- * provided class.
- */
- public ApiSurface pruningClass(Class<?> clazz) {
- return pruningClassName(clazz.getName());
- }
-
- /**
- * Returns an {@link ApiSurface} like this one, but pruning transitive
- * references from classes whose full name (including package) matches the provided pattern.
- */
- public ApiSurface pruningPattern(Pattern pattern) {
- Set<Pattern> newPatterns = Sets.newHashSet();
- newPatterns.addAll(patternsToPrune);
- newPatterns.add(pattern);
- return new ApiSurface(rootClasses, newPatterns);
- }
-
- /**
- * See {@link #pruningPattern(Pattern)}.
- */
- public ApiSurface pruningPattern(String patternString) {
- return pruningPattern(Pattern.compile(patternString));
- }
-
- /**
- * Returns all public classes originally belonging to the package
- * in the {@link ApiSurface}.
- */
- public Set<Class<?>> getRootClasses() {
- return rootClasses;
- }
-
- /**
- * Returns the exposed classes in this surface. Primitives, array classes, and
- * classes matching a pruning pattern are never included.
- */
- public Set<Class<?>> getExposedClasses() {
- return getExposedToExposers().keySet();
- }
-
- /**
- * Returns a path from an exposed class to a root class. There may be many, but this
- * gives only one.
- *
- * <p>If there are only cycles, with no path back to a root class, throws
- * IllegalArgumentException.
- */
- public List<Class<?>> getAnyExposurePath(Class<?> exposedClass) {
- Set<Class<?>> excluded = Sets.newHashSet();
- excluded.add(exposedClass);
- List<Class<?>> path = getAnyExposurePath(exposedClass, excluded);
- if (path == null) {
- throw new IllegalArgumentException(
- "Class " + exposedClass + " has no path back to any root class."
- + " It should never have been considered exposed.");
- } else {
- return path;
- }
- }
-
- /**
- * Returns a path from an exposed class to a root class. There may be many, but this
- * gives only one. It will not return a path that crosses the excluded classes.
- *
- * <p>If there are only cycles or paths through the excluded classes, returns null.
- *
- * <p>If the class is not actually in the exposure map, throws IllegalArgumentException
- */
- private List<Class<?>> getAnyExposurePath(Class<?> exposedClass, Set<Class<?>> excluded) {
- List<Class<?>> exposurePath = Lists.newArrayList();
- exposurePath.add(exposedClass);
-
- Collection<Class<?>> exposers = getExposedToExposers().get(exposedClass);
- if (exposers.isEmpty()) {
- throw new IllegalArgumentException("Class " + exposedClass + " is not exposed.");
- }
-
- for (Class<?> exposer : exposers) {
- if (excluded.contains(exposer)) {
- continue;
- }
-
- // A null exposer means this is already a root class.
- if (exposer == null) {
- return exposurePath;
- }
-
- List<Class<?>> restOfPath = getAnyExposurePath(
- exposer,
- Sets.union(excluded, Sets.newHashSet(exposer)));
-
- if (restOfPath != null) {
- exposurePath.addAll(restOfPath);
- return exposurePath;
- }
- }
- return null;
- }
-
- ////////////////////////////////////////////////////////////////////
-
- // Fields initialized upon construction
- private final Set<Class<?>> rootClasses;
- private final Set<Pattern> patternsToPrune;
-
- // Fields computed on-demand
- private Multimap<Class<?>, Class<?>> exposedToExposers = null;
- private Pattern prunedPattern = null;
- private Set<Type> visited = null;
-
- private ApiSurface(Set<Class<?>> rootClasses, Set<Pattern> patternsToPrune) {
- this.rootClasses = rootClasses;
- this.patternsToPrune = patternsToPrune;
- }
-
- /**
- * A map from exposed types to the places where they are exposed, in the sense of being a part
- * of a public-facing API surface.
- *
- * <p>This map is the adjacency list representation of a directed graph, where an edge from type
- * {@code T1} to type {@code T2} indicates that {@code T2} directly exposes {@code T1} in its API
- * surface.
- *
- * <p>The traversal methods in this class are designed to avoid repeatedly processing types, since
- * there will almost always be cyclic references.
- */
- private Multimap<Class<?>, Class<?>> getExposedToExposers() {
- if (exposedToExposers == null) {
- constructExposedToExposers();
- }
- return exposedToExposers;
- }
-
- /**
- * See {@link #getExposedToExposers}.
- */
- private void constructExposedToExposers() {
- visited = Sets.newHashSet();
- exposedToExposers = Multimaps.newSetMultimap(
- Maps.<Class<?>, Collection<Class<?>>>newHashMap(),
- new Supplier<Set<Class<?>>>() {
- @Override
- public Set<Class<?>> get() {
- return Sets.newHashSet();
- }
- });
-
- for (Class<?> clazz : rootClasses) {
- addExposedTypes(clazz, null);
- }
- }
-
- /**
- * A combined {@code Pattern} that implements all the pruning specified.
- */
- private Pattern getPrunedPattern() {
- if (prunedPattern == null) {
- constructPrunedPattern();
- }
- return prunedPattern;
- }
-
- /**
- * See {@link #getPrunedPattern}.
- */
- private void constructPrunedPattern() {
- Set<String> prunedPatternStrings = Sets.newHashSet();
- for (Pattern patternToPrune : patternsToPrune) {
- prunedPatternStrings.add(patternToPrune.pattern());
- }
- prunedPattern = Pattern.compile("(" + Joiner.on(")|(").join(prunedPatternStrings) + ")");
- }
-
- /**
- * Whether a type and all that it references should be pruned from the graph.
- */
- private boolean pruned(Type type) {
- return pruned(TypeToken.of(type).getRawType());
- }
-
- /**
- * Whether a class and all that it references should be pruned from the graph.
- */
- private boolean pruned(Class<?> clazz) {
- return clazz.isPrimitive()
- || clazz.isArray()
- || getPrunedPattern().matcher(clazz.getName()).matches();
- }
-
- /**
- * Whether a type has already been sufficiently processed.
- */
- private boolean done(Type type) {
- return visited.contains(type);
- }
-
- private void recordExposure(Class<?> exposed, Class<?> cause) {
- exposedToExposers.put(exposed, cause);
- }
-
- private void recordExposure(Type exposed, Class<?> cause) {
- exposedToExposers.put(TypeToken.of(exposed).getRawType(), cause);
- }
-
- private void visit(Type type) {
- visited.add(type);
- }
-
- /**
- * See {@link #addExposedTypes(Type, Class)}.
- */
- private void addExposedTypes(TypeToken type, Class<?> cause) {
- logger.debug(
- "Adding exposed types from {}, which is the type in type token {}", type.getType(), type);
- addExposedTypes(type.getType(), cause);
- }
-
- /**
- * Adds any references learned by following a link from {@code cause} to {@code type}.
- * This will dispatch according to the concrete {@code Type} implementation. See the
- * other overloads of {@code addExposedTypes} for their details.
- */
- private void addExposedTypes(Type type, Class<?> cause) {
- if (type instanceof TypeVariable) {
- logger.debug("Adding exposed types from {}, which is a type variable", type);
- addExposedTypes((TypeVariable) type, cause);
- } else if (type instanceof WildcardType) {
- logger.debug("Adding exposed types from {}, which is a wildcard type", type);
- addExposedTypes((WildcardType) type, cause);
- } else if (type instanceof GenericArrayType) {
- logger.debug("Adding exposed types from {}, which is a generic array type", type);
- addExposedTypes((GenericArrayType) type, cause);
- } else if (type instanceof ParameterizedType) {
- logger.debug("Adding exposed types from {}, which is a parameterized type", type);
- addExposedTypes((ParameterizedType) type, cause);
- } else if (type instanceof Class) {
- logger.debug("Adding exposed types from {}, which is a class", type);
- addExposedTypes((Class) type, cause);
- } else {
- throw new IllegalArgumentException("Unknown implementation of Type");
- }
- }
-
- /**
- * Adds any types exposed to this set. These will
- * come from the (possibly absent) bounds on the
- * type variable.
- */
- private void addExposedTypes(TypeVariable type, Class<?> cause) {
- if (done(type)) {
- return;
- }
- visit(type);
- for (Type bound : type.getBounds()) {
- logger.debug("Adding exposed types from {}, which is a type bound on {}", bound, type);
- addExposedTypes(bound, cause);
- }
- }
-
- /**
- * Adds any types exposed to this set. These will come from the (possibly absent) bounds on the
- * wildcard.
- */
- private void addExposedTypes(WildcardType type, Class<?> cause) {
- visit(type);
- for (Type lowerBound : type.getLowerBounds()) {
- logger.debug(
- "Adding exposed types from {}, which is a type lower bound on wildcard type {}",
- lowerBound,
- type);
- addExposedTypes(lowerBound, cause);
- }
- for (Type upperBound : type.getUpperBounds()) {
- logger.debug(
- "Adding exposed types from {}, which is a type upper bound on wildcard type {}",
- upperBound,
- type);
- addExposedTypes(upperBound, cause);
- }
- }
-
- /**
- * Adds any types exposed from the given array type. The array type itself is not added. The
- * cause of the exposure of the underlying type is considered whatever type exposed the array
- * type.
- */
- private void addExposedTypes(GenericArrayType type, Class<?> cause) {
- if (done(type)) {
- return;
- }
- visit(type);
- logger.debug(
- "Adding exposed types from {}, which is the component type on generic array type {}",
- type.getGenericComponentType(),
- type);
- addExposedTypes(type.getGenericComponentType(), cause);
- }
-
- /**
- * Adds any types exposed to this set. Even if the
- * root type is to be pruned, the actual type arguments
- * are processed.
- */
- private void addExposedTypes(ParameterizedType type, Class<?> cause) {
- // Even if the type is already done, this link to it may be new
- boolean alreadyDone = done(type);
- if (!pruned(type)) {
- visit(type);
- recordExposure(type, cause);
- }
- if (alreadyDone) {
- return;
- }
-
- // For a parameterized type, pruning does not take place
- // here, only for the raw class.
- // The type parameters themselves may not be pruned,
- // for example with List<MyApiType> probably the
- // standard List is pruned, but MyApiType is not.
- logger.debug(
- "Adding exposed types from {}, which is the raw type on parameterized type {}",
- type.getRawType(),
- type);
- addExposedTypes(type.getRawType(), cause);
- for (Type typeArg : type.getActualTypeArguments()) {
- logger.debug(
- "Adding exposed types from {}, which is a type argument on parameterized type {}",
- typeArg,
- type);
- addExposedTypes(typeArg, cause);
- }
- }
-
- /**
- * Adds a class and all of the types it exposes. The cause
- * of the class being exposed is given, and the cause
- * of everything within the class is that class itself.
- */
- private void addExposedTypes(Class<?> clazz, Class<?> cause) {
- if (pruned(clazz)) {
- return;
- }
- // Even if `clazz` has been visited, the link from `cause` may be new
- boolean alreadyDone = done(clazz);
- visit(clazz);
- recordExposure(clazz, cause);
- if (alreadyDone || pruned(clazz)) {
- return;
- }
-
- TypeToken<?> token = TypeToken.of(clazz);
- for (TypeToken<?> superType : token.getTypes()) {
- if (!superType.equals(token)) {
- logger.debug(
- "Adding exposed types from {}, which is a super type token on {}", superType, clazz);
- addExposedTypes(superType, clazz);
- }
- }
- for (Class innerClass : clazz.getDeclaredClasses()) {
- if (exposed(innerClass.getModifiers())) {
- logger.debug(
- "Adding exposed types from {}, which is an exposed inner class of {}",
- innerClass,
- clazz);
- addExposedTypes(innerClass, clazz);
- }
- }
- for (Field field : clazz.getDeclaredFields()) {
- if (exposed(field.getModifiers())) {
- logger.debug("Adding exposed types from {}, which is an exposed field on {}", field, clazz);
- addExposedTypes(field, clazz);
- }
- }
- for (Invokable invokable : getExposedInvokables(token)) {
- logger.debug(
- "Adding exposed types from {}, which is an exposed invokable on {}", invokable, clazz);
- addExposedTypes(invokable, clazz);
- }
- }
-
- private void addExposedTypes(Invokable<?, ?> invokable, Class<?> cause) {
- addExposedTypes(invokable.getReturnType(), cause);
- for (Annotation annotation : invokable.getAnnotations()) {
- logger.debug(
- "Adding exposed types from {}, which is an annotation on invokable {}",
- annotation,
- invokable);
- addExposedTypes(annotation.annotationType(), cause);
- }
- for (Parameter parameter : invokable.getParameters()) {
- logger.debug(
- "Adding exposed types from {}, which is a parameter on invokable {}",
- parameter,
- invokable);
- addExposedTypes(parameter, cause);
- }
- for (TypeToken<?> exceptionType : invokable.getExceptionTypes()) {
- logger.debug(
- "Adding exposed types from {}, which is an exception type on invokable {}",
- exceptionType,
- invokable);
- addExposedTypes(exceptionType, cause);
- }
- }
-
- private void addExposedTypes(Parameter parameter, Class<?> cause) {
- logger.debug(
- "Adding exposed types from {}, which is the type of parameter {}",
- parameter.getType(),
- parameter);
- addExposedTypes(parameter.getType(), cause);
- for (Annotation annotation : parameter.getAnnotations()) {
- logger.debug(
- "Adding exposed types from {}, which is an annotation on parameter {}",
- annotation,
- parameter);
- addExposedTypes(annotation.annotationType(), cause);
- }
- }
-
- private void addExposedTypes(Field field, Class<?> cause) {
- addExposedTypes(field.getGenericType(), cause);
- for (Annotation annotation : field.getDeclaredAnnotations()) {
- logger.debug(
- "Adding exposed types from {}, which is an annotation on field {}", annotation, field);
- addExposedTypes(annotation.annotationType(), cause);
- }
- }
-
- /**
- * Returns an {@link Invokable} for each public method or constructor of a type.
- */
- private Set<Invokable> getExposedInvokables(TypeToken<?> type) {
- Set<Invokable> invokables = Sets.newHashSet();
-
- for (Constructor constructor : type.getRawType().getConstructors()) {
- if (0 != (constructor.getModifiers() & (Modifier.PUBLIC | Modifier.PROTECTED))) {
- invokables.add(type.constructor(constructor));
- }
- }
-
- for (Method method : type.getRawType().getMethods()) {
- if (0 != (method.getModifiers() & (Modifier.PUBLIC | Modifier.PROTECTED))) {
- invokables.add(type.method(method));
- }
- }
-
- return invokables;
- }
-
- /**
- * Returns true if the given modifier bitmap indicates exposure (public or protected access).
- */
- private boolean exposed(int modifiers) {
- return 0 != (modifiers & (Modifier.PUBLIC | Modifier.PROTECTED));
- }
-
-
- ////////////////////////////////////////////////////////////////////////////
-
- public static ApiSurface getSdkApiSurface() throws IOException {
- return ApiSurface.ofPackage("com.google.cloud.dataflow")
- .pruningPattern("com[.]google[.]cloud[.]dataflow.*Test")
- .pruningPattern("com[.]google[.]cloud[.]dataflow.*Benchmark")
- .pruningPrefix("com.google.cloud.dataflow.integration")
- .pruningPrefix("java")
- .pruningPrefix("com.google.api")
- .pruningPrefix("com.google.auth")
- .pruningPrefix("com.google.bigtable.v1")
- .pruningPrefix("com.google.cloud.bigtable.config")
- .pruningPrefix("com.google.cloud.bigtable.grpc.Bigtable*Name")
- .pruningPrefix("com.google.protobuf")
- .pruningPrefix("org.joda.time")
- .pruningPrefix("org.apache.avro")
- .pruningPrefix("org.junit")
- .pruningPrefix("com.fasterxml.jackson.annotation");
- }
-
- public static void main(String[] args) throws Exception {
- List<String> names = Lists.newArrayList();
- for (Class clazz : getSdkApiSurface().getExposedClasses()) {
- names.add(clazz.getName());
- }
- List<String> sortedNames = Lists.newArrayList(names);
- Collections.sort(sortedNames);
-
- for (String name : sortedNames) {
- System.out.println(name);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AppEngineEnvironment.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AppEngineEnvironment.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AppEngineEnvironment.java
deleted file mode 100644
index c7fe4b4..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AppEngineEnvironment.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import java.lang.reflect.InvocationTargetException;
-
-/** Stores whether we are running within AppEngine or not. */
-public class AppEngineEnvironment {
- /**
- * True if running inside of AppEngine, false otherwise.
- */
- @Deprecated
- public static final boolean IS_APP_ENGINE = isAppEngine();
-
- /**
- * Attempts to detect whether we are inside of AppEngine.
- *
- * <p>Purposely copied and left private from private <a href="https://code.google.com/p/
- * guava-libraries/source/browse/guava/src/com/google/common/util/concurrent/
- * MoreExecutors.java#785">code.google.common.util.concurrent.MoreExecutors#isAppEngine</a>.
- *
- * @return true if we are inside of AppEngine, false otherwise.
- */
- static boolean isAppEngine() {
- if (System.getProperty("com.google.appengine.runtime.environment") == null) {
- return false;
- }
- try {
- // If the current environment is null, we're not inside AppEngine.
- return Class.forName("com.google.apphosting.api.ApiProxy")
- .getMethod("getCurrentEnvironment")
- .invoke(null) != null;
- } catch (ClassNotFoundException e) {
- // If ApiProxy doesn't exist, we're not on AppEngine at all.
- return false;
- } catch (InvocationTargetException e) {
- // If ApiProxy throws an exception, we're not in a proper AppEngine environment.
- return false;
- } catch (IllegalAccessException e) {
- // If the method isn't accessible, we're not on a supported version of AppEngine.
- return false;
- } catch (NoSuchMethodException e) {
- // If the method doesn't exist, we're not on a supported version of AppEngine.
- return false;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AppliedCombineFn.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AppliedCombineFn.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AppliedCombineFn.java
deleted file mode 100644
index 512d72d..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AppliedCombineFn.java
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.coders.KvCoder;
-import com.google.cloud.dataflow.sdk.transforms.CombineFnBase.PerKeyCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.KeyedCombineFnWithContext;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.common.annotations.VisibleForTesting;
-
-import java.io.Serializable;
-
-/**
- * A {@link KeyedCombineFnWithContext} with a fixed accumulator coder. This is created from a
- * specific application of the {@link KeyedCombineFnWithContext}.
- *
- * <p>Because the {@code AccumT} may reference {@code InputT}, the specific {@code Coder<AccumT>}
- * may depend on the {@code Coder<InputT>}.
- *
- * @param <K> type of keys
- * @param <InputT> type of input values
- * @param <AccumT> type of mutable accumulator values
- * @param <OutputT> type of output values
- */
-public class AppliedCombineFn<K, InputT, AccumT, OutputT> implements Serializable {
-
- private final PerKeyCombineFn<K, InputT, AccumT, OutputT> fn;
- private final Coder<AccumT> accumulatorCoder;
-
- private final Iterable<PCollectionView<?>> sideInputViews;
- private final KvCoder<K, InputT> kvCoder;
- private final WindowingStrategy<?, ?> windowingStrategy;
-
- private AppliedCombineFn(PerKeyCombineFn<K, InputT, AccumT, OutputT> fn,
- Coder<AccumT> accumulatorCoder, Iterable<PCollectionView<?>> sideInputViews,
- KvCoder<K, InputT> kvCoder, WindowingStrategy<?, ?> windowingStrategy) {
- this.fn = fn;
- this.accumulatorCoder = accumulatorCoder;
- this.sideInputViews = sideInputViews;
- this.kvCoder = kvCoder;
- this.windowingStrategy = windowingStrategy;
- }
-
- public static <K, InputT, AccumT, OutputT> AppliedCombineFn<K, InputT, AccumT, OutputT>
- withAccumulatorCoder(
- PerKeyCombineFn<? super K, ? super InputT, AccumT, OutputT> fn,
- Coder<AccumT> accumCoder) {
- return withAccumulatorCoder(fn, accumCoder, null, null, null);
- }
-
- public static <K, InputT, AccumT, OutputT> AppliedCombineFn<K, InputT, AccumT, OutputT>
- withAccumulatorCoder(
- PerKeyCombineFn<? super K, ? super InputT, AccumT, OutputT> fn,
- Coder<AccumT> accumCoder, Iterable<PCollectionView<?>> sideInputViews,
- KvCoder<K, InputT> kvCoder, WindowingStrategy<?, ?> windowingStrategy) {
- // Casting down the K and InputT is safe because they're only used as inputs.
- @SuppressWarnings("unchecked")
- PerKeyCombineFn<K, InputT, AccumT, OutputT> clonedFn =
- (PerKeyCombineFn<K, InputT, AccumT, OutputT>) SerializableUtils.clone(fn);
- return create(clonedFn, accumCoder, sideInputViews, kvCoder, windowingStrategy);
- }
-
- @VisibleForTesting
- public static <K, InputT, AccumT, OutputT> AppliedCombineFn<K, InputT, AccumT, OutputT>
- withInputCoder(PerKeyCombineFn<? super K, ? super InputT, AccumT, OutputT> fn,
- CoderRegistry registry, KvCoder<K, InputT> kvCoder) {
- return withInputCoder(fn, registry, kvCoder, null, null);
- }
-
- public static <K, InputT, AccumT, OutputT> AppliedCombineFn<K, InputT, AccumT, OutputT>
- withInputCoder(PerKeyCombineFn<? super K, ? super InputT, AccumT, OutputT> fn,
- CoderRegistry registry, KvCoder<K, InputT> kvCoder,
- Iterable<PCollectionView<?>> sideInputViews, WindowingStrategy<?, ?> windowingStrategy) {
- // Casting down the K and InputT is safe because they're only used as inputs.
- @SuppressWarnings("unchecked")
- PerKeyCombineFn<K, InputT, AccumT, OutputT> clonedFn =
- (PerKeyCombineFn<K, InputT, AccumT, OutputT>) SerializableUtils.clone(fn);
- try {
- Coder<AccumT> accumulatorCoder = clonedFn.getAccumulatorCoder(
- registry, kvCoder.getKeyCoder(), kvCoder.getValueCoder());
- return create(clonedFn, accumulatorCoder, sideInputViews, kvCoder, windowingStrategy);
- } catch (CannotProvideCoderException e) {
- throw new IllegalStateException("Could not determine coder for accumulator", e);
- }
- }
-
- private static <K, InputT, AccumT, OutputT> AppliedCombineFn<K, InputT, AccumT, OutputT> create(
- PerKeyCombineFn<K, InputT, AccumT, OutputT> fn,
- Coder<AccumT> accumulatorCoder, Iterable<PCollectionView<?>> sideInputViews,
- KvCoder<K, InputT> kvCoder, WindowingStrategy<?, ?> windowingStrategy) {
- return new AppliedCombineFn<>(
- fn, accumulatorCoder, sideInputViews, kvCoder, windowingStrategy);
- }
-
- public PerKeyCombineFn<K, InputT, AccumT, OutputT> getFn() {
- return fn;
- }
-
- public Iterable<PCollectionView<?>> getSideInputViews() {
- return sideInputViews;
- }
-
- public Coder<AccumT> getAccumulatorCoder() {
- return accumulatorCoder;
- }
-
- public KvCoder<K, InputT> getKvCoder() {
- return kvCoder;
- }
-
- public WindowingStrategy<?, ?> getWindowingStrategy() {
- return windowingStrategy;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AssignWindowsDoFn.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AssignWindowsDoFn.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AssignWindowsDoFn.java
deleted file mode 100644
index ca59c53..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AssignWindowsDoFn.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo;
-import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
-
-import org.joda.time.Instant;
-
-import java.util.Collection;
-
-/**
- * {@link DoFn} that tags elements of a PCollection with windows, according
- * to the provided {@link WindowFn}.
- * @param <T> Type of elements being windowed
- * @param <W> Window type
- */
-@SystemDoFnInternal
-public class AssignWindowsDoFn<T, W extends BoundedWindow> extends DoFn<T, T> {
- private WindowFn<? super T, W> fn;
-
- public AssignWindowsDoFn(WindowFn<? super T, W> fn) {
- this.fn = fn;
- }
-
- @Override
- @SuppressWarnings("unchecked")
- public void processElement(final ProcessContext c) throws Exception {
- Collection<W> windows =
- ((WindowFn<T, W>) fn).assignWindows(
- ((WindowFn<T, W>) fn).new AssignContext() {
- @Override
- public T element() {
- return c.element();
- }
-
- @Override
- public Instant timestamp() {
- return c.timestamp();
- }
-
- @Override
- public Collection<? extends BoundedWindow> windows() {
- return c.windowingInternals().windows();
- }
- });
-
- c.windowingInternals()
- .outputWindowedValue(c.element(), c.timestamp(), windows, PaneInfo.NO_FIRING);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AttemptAndTimeBoundedExponentialBackOff.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AttemptAndTimeBoundedExponentialBackOff.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AttemptAndTimeBoundedExponentialBackOff.java
deleted file mode 100644
index e94d414..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AttemptAndTimeBoundedExponentialBackOff.java
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.client.util.BackOff;
-import com.google.api.client.util.NanoClock;
-import com.google.common.base.Preconditions;
-
-import java.util.concurrent.TimeUnit;
-
-/**
- * Extension of {@link AttemptBoundedExponentialBackOff} that bounds the total time that the backoff
- * is happening as well as the number of retries. Acts exactly as an AttemptBoundedExponentialBackOff
- * unless the time interval has expired since the object was created. At this point, it will always
- * return BackOff.STOP. Calling reset() resets both the timer and the number of retry attempts,
- * unless a custom ResetPolicy (ResetPolicy.ATTEMPTS or ResetPolicy.TIMER) is passed to the
- * constructor.
- *
- * <p>Implementation is not thread-safe.
- */
-public class AttemptAndTimeBoundedExponentialBackOff extends AttemptBoundedExponentialBackOff {
- private long endTimeMillis;
- private long maximumTotalWaitTimeMillis;
- private ResetPolicy resetPolicy;
- private final NanoClock nanoClock;
- // NanoClock.SYSTEM has a max elapsed time of 292 years or 2^63 ns. Here, we choose 2^53 ns as
- // a smaller but still huge limit.
- private static final long MAX_ELAPSED_TIME_MILLIS = 1L << 53;
-
- /**
- * A ResetPolicy controls the behavior of this BackOff when reset() is called. By default, both
- * the number of attempts and the time bound for the BackOff are reset, but an alternative
- * ResetPolicy may be set to only reset one of these two.
- */
- public static enum ResetPolicy {
- ALL,
- ATTEMPTS,
- TIMER
- }
-
- /**
- * Constructs an instance of AttemptAndTimeBoundedExponentialBackOff.
- *
- * @param maximumNumberOfAttempts The maximum number of attempts it will make.
- * @param initialIntervalMillis The original interval to wait between attempts in milliseconds.
- * @param maximumTotalWaitTimeMillis The maximum total time that this object will
- * allow more attempts in milliseconds.
- */
- public AttemptAndTimeBoundedExponentialBackOff(
- int maximumNumberOfAttempts, long initialIntervalMillis, long maximumTotalWaitTimeMillis) {
- this(
- maximumNumberOfAttempts,
- initialIntervalMillis,
- maximumTotalWaitTimeMillis,
- ResetPolicy.ALL,
- NanoClock.SYSTEM);
- }
-
- /**
- * Constructs an instance of AttemptAndTimeBoundedExponentialBackOff.
- *
- * @param maximumNumberOfAttempts The maximum number of attempts it will make.
- * @param initialIntervalMillis The original interval to wait between attempts in milliseconds.
- * @param maximumTotalWaitTimeMillis The maximum total time that this object will
- * allow more attempts in milliseconds.
- * @param resetPolicy The ResetPolicy specifying the properties of this BackOff that are subject
- * to being reset.
- */
- public AttemptAndTimeBoundedExponentialBackOff(
- int maximumNumberOfAttempts,
- long initialIntervalMillis,
- long maximumTotalWaitTimeMillis,
- ResetPolicy resetPolicy) {
- this(
- maximumNumberOfAttempts,
- initialIntervalMillis,
- maximumTotalWaitTimeMillis,
- resetPolicy,
- NanoClock.SYSTEM);
- }
-
- /**
- * Constructs an instance of AttemptAndTimeBoundedExponentialBackOff.
- *
- * @param maximumNumberOfAttempts The maximum number of attempts it will make.
- * @param initialIntervalMillis The original interval to wait between attempts in milliseconds.
- * @param maximumTotalWaitTimeMillis The maximum total time that this object will
- * allow more attempts in milliseconds.
- * @param resetPolicy The ResetPolicy specifying the properties of this BackOff that are subject
- * to being reset.
- * @param nanoClock clock used to measure the time that has passed.
- */
- public AttemptAndTimeBoundedExponentialBackOff(
- int maximumNumberOfAttempts,
- long initialIntervalMillis,
- long maximumTotalWaitTimeMillis,
- ResetPolicy resetPolicy,
- NanoClock nanoClock) {
- super(maximumNumberOfAttempts, initialIntervalMillis);
- Preconditions.checkArgument(
- maximumTotalWaitTimeMillis > 0, "Maximum total wait time must be greater than zero.");
- Preconditions.checkArgument(
- maximumTotalWaitTimeMillis < MAX_ELAPSED_TIME_MILLIS,
- "Maximum total wait time must be less than " + MAX_ELAPSED_TIME_MILLIS + " milliseconds");
- Preconditions.checkArgument(resetPolicy != null, "resetPolicy may not be null");
- Preconditions.checkArgument(nanoClock != null, "nanoClock may not be null");
- this.maximumTotalWaitTimeMillis = maximumTotalWaitTimeMillis;
- this.resetPolicy = resetPolicy;
- this.nanoClock = nanoClock;
- // Set the end time for this BackOff. Note that we cannot simply call reset() here since the
- // resetPolicy may not be set to reset the time bound.
- endTimeMillis = getTimeMillis() + maximumTotalWaitTimeMillis;
- }
-
- @Override
- public void reset() {
- // reset() is called in the constructor of the parent class before resetPolicy and nanoClock are
- // set. In this case, we call the parent class's reset() method and return.
- if (resetPolicy == null) {
- super.reset();
- return;
- }
- // Reset the number of attempts.
- if (resetPolicy == ResetPolicy.ALL || resetPolicy == ResetPolicy.ATTEMPTS) {
- super.reset();
- }
- // Reset the time bound.
- if (resetPolicy == ResetPolicy.ALL || resetPolicy == ResetPolicy.TIMER) {
- endTimeMillis = getTimeMillis() + maximumTotalWaitTimeMillis;
- }
- }
-
- public void setEndtimeMillis(long endTimeMillis) {
- this.endTimeMillis = endTimeMillis;
- }
-
- @Override
- public long nextBackOffMillis() {
- if (atMaxAttempts()) {
- return BackOff.STOP;
- }
- long backoff = Math.min(super.nextBackOffMillis(), endTimeMillis - getTimeMillis());
- return (backoff > 0 ? backoff : BackOff.STOP);
- }
-
- private long getTimeMillis() {
- return TimeUnit.NANOSECONDS.toMillis(nanoClock.nanoTime());
- }
-
- @Override
- public boolean atMaxAttempts() {
- return super.atMaxAttempts() || getTimeMillis() >= endTimeMillis;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AttemptBoundedExponentialBackOff.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AttemptBoundedExponentialBackOff.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AttemptBoundedExponentialBackOff.java
deleted file mode 100644
index 613316e..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AttemptBoundedExponentialBackOff.java
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.client.util.BackOff;
-import com.google.common.base.Preconditions;
-
-/**
- * Implementation of {@link BackOff} that increases the back off period for each retry attempt
- * using a randomization function that grows exponentially.
- *
- * <p>Example: The initial interval is .5 seconds and the maximum number of retries is 10.
- * For 10 tries the sequence will be (values in seconds):
- *
- * <pre>
- * retry# retry_interval randomized_interval
- * 1 0.5 [0.25, 0.75]
- * 2 0.75 [0.375, 1.125]
- * 3 1.125 [0.562, 1.687]
- * 4 1.687 [0.8435, 2.53]
- * 5 2.53 [1.265, 3.795]
- * 6 3.795 [1.897, 5.692]
- * 7 5.692 [2.846, 8.538]
- * 8 8.538 [4.269, 12.807]
- * 9 12.807 [6.403, 19.210]
- * 10 {@link BackOff#STOP}
- * </pre>
- *
- * <p>Implementation is not thread-safe.
- */
-public class AttemptBoundedExponentialBackOff implements BackOff {
- public static final double DEFAULT_MULTIPLIER = 1.5;
- public static final double DEFAULT_RANDOMIZATION_FACTOR = 0.5;
- private final int maximumNumberOfAttempts;
- private final long initialIntervalMillis;
- private int currentAttempt;
-
- public AttemptBoundedExponentialBackOff(int maximumNumberOfAttempts, long initialIntervalMillis) {
- Preconditions.checkArgument(maximumNumberOfAttempts > 0,
- "Maximum number of attempts must be greater than zero.");
- Preconditions.checkArgument(initialIntervalMillis > 0,
- "Initial interval must be greater than zero.");
- this.maximumNumberOfAttempts = maximumNumberOfAttempts;
- this.initialIntervalMillis = initialIntervalMillis;
- reset();
- }
-
- @Override
- public void reset() {
- currentAttempt = 1;
- }
-
- @Override
- public long nextBackOffMillis() {
- if (currentAttempt >= maximumNumberOfAttempts) {
- return BackOff.STOP;
- }
- double currentIntervalMillis = initialIntervalMillis
- * Math.pow(DEFAULT_MULTIPLIER, currentAttempt - 1);
- double randomOffset = (Math.random() * 2 - 1)
- * DEFAULT_RANDOMIZATION_FACTOR * currentIntervalMillis;
- currentAttempt += 1;
- return Math.round(currentIntervalMillis + randomOffset);
- }
-
- public boolean atMaxAttempts() {
- return currentAttempt >= maximumNumberOfAttempts;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AvroUtils.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AvroUtils.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AvroUtils.java
deleted file mode 100644
index c3a4861..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/AvroUtils.java
+++ /dev/null
@@ -1,345 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import static com.google.common.base.MoreObjects.firstNonNull;
-import static com.google.common.base.Preconditions.checkNotNull;
-import static com.google.common.base.Verify.verify;
-
-import com.google.api.services.bigquery.model.TableFieldSchema;
-import com.google.api.services.bigquery.model.TableRow;
-import com.google.api.services.bigquery.model.TableSchema;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableMap;
-
-import org.apache.avro.Schema;
-import org.apache.avro.Schema.Field;
-import org.apache.avro.Schema.Type;
-import org.apache.avro.file.DataFileConstants;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.avro.io.BinaryDecoder;
-import org.apache.avro.io.DecoderFactory;
-import org.joda.time.format.DateTimeFormat;
-import org.joda.time.format.DateTimeFormatter;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.ByteBuffer;
-import java.nio.channels.Channels;
-import java.util.Arrays;
-import java.util.List;
-
-import javax.annotation.Nullable;
-
-/**
- * A set of utilities for working with Avro files.
- *
- * <p>These utilities are based on the <a
- * href="https://avro.apache.org/docs/1.7.7/spec.html">Avro 1.7.7</a> specification.
- */
-public class AvroUtils {
-
- /**
- * Avro file metadata.
- */
- public static class AvroMetadata {
- private byte[] syncMarker;
- private String codec;
- private String schemaString;
-
- AvroMetadata(byte[] syncMarker, String codec, String schemaString) {
- this.syncMarker = syncMarker;
- this.codec = codec;
- this.schemaString = schemaString;
- }
-
- /**
- * The JSON-encoded <a href="https://avro.apache.org/docs/1.7.7/spec.html#schemas">schema</a>
- * string for the file.
- */
- public String getSchemaString() {
- return schemaString;
- }
-
- /**
- * The <a href="https://avro.apache.org/docs/1.7.7/spec.html#Required+Codecs">codec</a> of the
- * file.
- */
- public String getCodec() {
- return codec;
- }
-
- /**
- * The 16-byte sync marker for the file. See the documentation for
- * <a href="https://avro.apache.org/docs/1.7.7/spec.html#Object+Container+Files">Object
- * Container File</a> for more information.
- */
- public byte[] getSyncMarker() {
- return syncMarker;
- }
- }
-
- /**
- * Reads the {@link AvroMetadata} from the header of an Avro file.
- *
- * <p>This method parses the header of an Avro
- * <a href="https://avro.apache.org/docs/1.7.7/spec.html#Object+Container+Files">
- * Object Container File</a>.
- *
- * @throws IOException if the file is an invalid format.
- */
- public static AvroMetadata readMetadataFromFile(String fileName) throws IOException {
- String codec = null;
- String schemaString = null;
- byte[] syncMarker;
- try (InputStream stream =
- Channels.newInputStream(IOChannelUtils.getFactory(fileName).open(fileName))) {
- BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(stream, null);
-
- // The header of an object container file begins with a four-byte magic number, followed
- // by the file metadata (including the schema and codec), encoded as a map. Finally, the
- // header ends with the file's 16-byte sync marker.
- // See https://avro.apache.org/docs/1.7.7/spec.html#Object+Container+Files for details on
- // the encoding of container files.
-
- // Read the magic number.
- byte[] magic = new byte[DataFileConstants.MAGIC.length];
- decoder.readFixed(magic);
- if (!Arrays.equals(magic, DataFileConstants.MAGIC)) {
- throw new IOException("Missing Avro file signature: " + fileName);
- }
-
- // Read the metadata to find the codec and schema.
- ByteBuffer valueBuffer = ByteBuffer.allocate(512);
- long numRecords = decoder.readMapStart();
- while (numRecords > 0) {
- for (long recordIndex = 0; recordIndex < numRecords; recordIndex++) {
- String key = decoder.readString();
- // readBytes() clears the buffer and returns a buffer where:
- // - position is the start of the bytes read
- // - limit is the end of the bytes read
- valueBuffer = decoder.readBytes(valueBuffer);
- byte[] bytes = new byte[valueBuffer.remaining()];
- valueBuffer.get(bytes);
- if (key.equals(DataFileConstants.CODEC)) {
- codec = new String(bytes, "UTF-8");
- } else if (key.equals(DataFileConstants.SCHEMA)) {
- schemaString = new String(bytes, "UTF-8");
- }
- }
- numRecords = decoder.mapNext();
- }
- if (codec == null) {
- codec = DataFileConstants.NULL_CODEC;
- }
-
- // Finally, read the sync marker.
- syncMarker = new byte[DataFileConstants.SYNC_SIZE];
- decoder.readFixed(syncMarker);
- }
- return new AvroMetadata(syncMarker, codec, schemaString);
- }
-
- /**
- * Formats BigQuery seconds-since-epoch into String matching JSON export. Thread-safe and
- * immutable.
- */
- private static final DateTimeFormatter DATE_AND_SECONDS_FORMATTER =
- DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withZoneUTC();
- // Package private for BigQueryTableRowIterator to use.
- static String formatTimestamp(String timestamp) {
- // timestamp is in "seconds since epoch" format, with scientific notation.
- // e.g., "1.45206229112345E9" to mean "2016-01-06 06:38:11.123456 UTC".
- // Separate into seconds and microseconds.
- double timestampDoubleMicros = Double.parseDouble(timestamp) * 1000000;
- long timestampMicros = (long) timestampDoubleMicros;
- long seconds = timestampMicros / 1000000;
- int micros = (int) (timestampMicros % 1000000);
- String dayAndTime = DATE_AND_SECONDS_FORMATTER.print(seconds * 1000);
-
- // No sub-second component.
- if (micros == 0) {
- return String.format("%s UTC", dayAndTime);
- }
-
- // Sub-second component.
- int digits = 6;
- int subsecond = micros;
- while (subsecond % 10 == 0) {
- digits--;
- subsecond /= 10;
- }
- String formatString = String.format("%%0%dd", digits);
- String fractionalSeconds = String.format(formatString, subsecond);
- return String.format("%s.%s UTC", dayAndTime, fractionalSeconds);
- }
-
- /**
- * Utility function to convert from an Avro {@link GenericRecord} to a BigQuery {@link TableRow}.
- *
- * See <a href="https://cloud.google.com/bigquery/exporting-data-from-bigquery#config">
- * "Avro format"</a> for more information.
- */
- public static TableRow convertGenericRecordToTableRow(GenericRecord record, TableSchema schema) {
- return convertGenericRecordToTableRow(record, schema.getFields());
- }
-
- private static TableRow convertGenericRecordToTableRow(
- GenericRecord record, List<TableFieldSchema> fields) {
- TableRow row = new TableRow();
- for (TableFieldSchema subSchema : fields) {
- // Per https://cloud.google.com/bigquery/docs/reference/v2/tables#schema, the name field
- // is required, so it may not be null.
- Field field = record.getSchema().getField(subSchema.getName());
- Object convertedValue =
- getTypedCellValue(field.schema(), subSchema, record.get(field.name()));
- if (convertedValue != null) {
- // To match the JSON files exported by BigQuery, do not include null values in the output.
- row.set(field.name(), convertedValue);
- }
- }
- return row;
- }
-
- @Nullable
- private static Object getTypedCellValue(Schema schema, TableFieldSchema fieldSchema, Object v) {
- // Per https://cloud.google.com/bigquery/docs/reference/v2/tables#schema, the mode field
- // is optional (and so it may be null), but defaults to "NULLABLE".
- String mode = firstNonNull(fieldSchema.getMode(), "NULLABLE");
- switch (mode) {
- case "REQUIRED":
- return convertRequiredField(schema.getType(), fieldSchema, v);
- case "REPEATED":
- return convertRepeatedField(schema, fieldSchema, v);
- case "NULLABLE":
- return convertNullableField(schema, fieldSchema, v);
- default:
- throw new UnsupportedOperationException(
- "Parsing a field with BigQuery field schema mode " + fieldSchema.getMode());
- }
- }
-
- private static List<Object> convertRepeatedField(
- Schema schema, TableFieldSchema fieldSchema, Object v) {
- Type arrayType = schema.getType();
- verify(
- arrayType == Type.ARRAY,
- "BigQuery REPEATED field %s should be Avro ARRAY, not %s",
- fieldSchema.getName(),
- arrayType);
- // REPEATED fields are represented as Avro arrays.
- if (v == null) {
- // Handle the case of an empty repeated field.
- return ImmutableList.of();
- }
- @SuppressWarnings("unchecked")
- List<Object> elements = (List<Object>) v;
- ImmutableList.Builder<Object> values = ImmutableList.builder();
- Type elementType = schema.getElementType().getType();
- for (Object element : elements) {
- values.add(convertRequiredField(elementType, fieldSchema, element));
- }
- return values.build();
- }
-
- private static Object convertRequiredField(
- Type avroType, TableFieldSchema fieldSchema, Object v) {
- // REQUIRED fields are represented as the corresponding Avro types. For example, a BigQuery
- // INTEGER type maps to an Avro LONG type.
- checkNotNull(v, "REQUIRED field %s should not be null", fieldSchema.getName());
- ImmutableMap<String, Type> fieldMap =
- ImmutableMap.<String, Type>builder()
- .put("STRING", Type.STRING)
- .put("INTEGER", Type.LONG)
- .put("FLOAT", Type.DOUBLE)
- .put("BOOLEAN", Type.BOOLEAN)
- .put("TIMESTAMP", Type.LONG)
- .put("RECORD", Type.RECORD)
- .build();
- // Per https://cloud.google.com/bigquery/docs/reference/v2/tables#schema, the type field
- // is required, so it may not be null.
- String bqType = fieldSchema.getType();
- Type expectedAvroType = fieldMap.get(bqType);
- verify(
- avroType == expectedAvroType,
- "Expected Avro schema type %s, not %s, for BigQuery %s field %s",
- expectedAvroType,
- avroType,
- bqType,
- fieldSchema.getName());
- switch (fieldSchema.getType()) {
- case "STRING":
- // Avro will use a CharSequence to represent String objects, but it may not always use
- // java.lang.String; for example, it may prefer org.apache.avro.util.Utf8.
- verify(v instanceof CharSequence, "Expected CharSequence (String), got %s", v.getClass());
- return v.toString();
- case "INTEGER":
- verify(v instanceof Long, "Expected Long, got %s", v.getClass());
- return ((Long) v).toString();
- case "FLOAT":
- verify(v instanceof Double, "Expected Double, got %s", v.getClass());
- return v;
- case "BOOLEAN":
- verify(v instanceof Boolean, "Expected Boolean, got %s", v.getClass());
- return v;
- case "TIMESTAMP":
- // TIMESTAMP data types are represented as Avro LONG types. They are converted back to
- // Strings with variable-precision (up to six digits) to match the JSON files exported
- // by BigQuery.
- verify(v instanceof Long, "Expected Long, got %s", v.getClass());
- Double doubleValue = ((Long) v) / 1000000.0;
- return formatTimestamp(doubleValue.toString());
- case "RECORD":
- verify(v instanceof GenericRecord, "Expected GenericRecord, got %s", v.getClass());
- return convertGenericRecordToTableRow((GenericRecord) v, fieldSchema.getFields());
- default:
- throw new UnsupportedOperationException(
- String.format(
- "Unexpected BigQuery field schema type %s for field named %s",
- fieldSchema.getType(),
- fieldSchema.getName()));
- }
- }
-
- @Nullable
- private static Object convertNullableField(
- Schema avroSchema, TableFieldSchema fieldSchema, Object v) {
- // NULLABLE fields are represented as an Avro Union of the corresponding type and "null".
- verify(
- avroSchema.getType() == Type.UNION,
- "Expected Avro schema type UNION, not %s, for BigQuery NULLABLE field %s",
- avroSchema.getType(),
- fieldSchema.getName());
- List<Schema> unionTypes = avroSchema.getTypes();
- verify(
- unionTypes.size() == 2,
- "BigQuery NULLABLE field %s should be an Avro UNION of NULL and another type, not %s",
- fieldSchema.getName(),
- unionTypes);
-
- if (v == null) {
- return null;
- }
-
- Type firstType = unionTypes.get(0).getType();
- if (!firstType.equals(Type.NULL)) {
- return convertRequiredField(firstType, fieldSchema, v);
- }
- return convertRequiredField(unionTypes.get(1).getType(), fieldSchema, v);
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BaseExecutionContext.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BaseExecutionContext.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BaseExecutionContext.java
deleted file mode 100644
index 6a0ccf3..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BaseExecutionContext.java
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.util.common.worker.StateSampler;
-import com.google.cloud.dataflow.sdk.util.state.StateInternals;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-
-import java.io.IOException;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * Base class for implementations of {@link ExecutionContext}.
- *
- * <p>A concrete subclass should implement {@link #createStepContext} to create the appropriate
- * {@link StepContext} implementation. Any {@code StepContext} created will
- * be cached for the lifetime of this {@link ExecutionContext}.
- *
- * <p>BaseExecutionContext is generic to allow implementing subclasses to return a concrete subclass
- * of {@link StepContext} from {@link #getOrCreateStepContext(String, String, StateSampler)} and
- * {@link #getAllStepContexts()} without forcing each subclass to override the method, e.g.
- * <pre>
- * {@literal @}Override
- * StreamingModeExecutionContext.StepContext getOrCreateStepContext(...) {
- * return (StreamingModeExecutionContext.StepContext) super.getOrCreateStepContext(...);
- * }
- * </pre>
- *
- * <p>When a subclass of {@code BaseExecutionContext} has been downcast, the return types of
- * {@link #createStepContext(String, String, StateSampler)},
- * {@link #getOrCreateStepContext(String, String, StateSampler)}, and {@link #getAllStepContexts()}
- * will be appropriately specialized.
- */
-public abstract class BaseExecutionContext<T extends ExecutionContext.StepContext>
- implements ExecutionContext {
-
- private Map<String, T> cachedStepContexts = new HashMap<>();
-
- /**
- * Implementations should override this to create the specific type
- * of {@link StepContext} they need.
- */
- protected abstract T createStepContext(
- String stepName, String transformName, StateSampler stateSampler);
-
-
- /**
- * Returns the {@link StepContext} associated with the given step.
- */
- @Override
- public T getOrCreateStepContext(
- String stepName, String transformName, StateSampler stateSampler) {
- T context = cachedStepContexts.get(stepName);
- if (context == null) {
- context = createStepContext(stepName, transformName, stateSampler);
- cachedStepContexts.put(stepName, context);
- }
- return context;
- }
-
- /**
- * Returns a collection view of all of the {@link StepContext}s.
- */
- @Override
- public Collection<? extends T> getAllStepContexts() {
- return Collections.unmodifiableCollection(cachedStepContexts.values());
- }
-
- /**
- * Hook for subclasses to implement that will be called whenever
- * {@link com.google.cloud.dataflow.sdk.transforms.DoFn.Context#output}
- * is called.
- */
- @Override
- public void noteOutput(WindowedValue<?> output) {}
-
- /**
- * Hook for subclasses to implement that will be called whenever
- * {@link com.google.cloud.dataflow.sdk.transforms.DoFn.Context#sideOutput}
- * is called.
- */
- @Override
- public void noteSideOutput(TupleTag<?> tag, WindowedValue<?> output) {}
-
- /**
- * Base class for implementations of {@link ExecutionContext.StepContext}.
- *
- * <p>To complete a concrete subclass, implement {@link #timerInternals} and
- * {@link #stateInternals}.
- */
- public abstract static class StepContext implements ExecutionContext.StepContext {
- private final ExecutionContext executionContext;
- private final String stepName;
- private final String transformName;
-
- public StepContext(ExecutionContext executionContext, String stepName, String transformName) {
- this.executionContext = executionContext;
- this.stepName = stepName;
- this.transformName = transformName;
- }
-
- @Override
- public String getStepName() {
- return stepName;
- }
-
- @Override
- public String getTransformName() {
- return transformName;
- }
-
- @Override
- public void noteOutput(WindowedValue<?> output) {
- executionContext.noteOutput(output);
- }
-
- @Override
- public void noteSideOutput(TupleTag<?> tag, WindowedValue<?> output) {
- executionContext.noteSideOutput(tag, output);
- }
-
- @Override
- public <T, W extends BoundedWindow> void writePCollectionViewData(
- TupleTag<?> tag,
- Iterable<WindowedValue<T>> data, Coder<Iterable<WindowedValue<T>>> dataCoder,
- W window, Coder<W> windowCoder) throws IOException {
- throw new UnsupportedOperationException("Not implemented.");
- }
-
- @Override
- public abstract StateInternals<?> stateInternals();
-
- @Override
- public abstract TimerInternals timerInternals();
- }
-}
[23/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Create.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Create.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Create.java
deleted file mode 100644
index a74e5bf..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Create.java
+++ /dev/null
@@ -1,426 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.coders.VoidCoder;
-import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.TimestampedValue;
-import com.google.cloud.dataflow.sdk.values.TimestampedValue.TimestampedValueCoder;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-import com.google.common.base.Function;
-import com.google.common.base.Optional;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Iterables;
-
-import org.joda.time.Instant;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Objects;
-
-/**
- * {@code Create<T>} takes a collection of elements of type {@code T}
- * known when the pipeline is constructed and returns a
- * {@code PCollection<T>} containing the elements.
- *
- * <p>Example of use:
- * <pre> {@code
- * Pipeline p = ...;
- *
- * PCollection<Integer> pc = p.apply(Create.of(3, 4, 5).withCoder(BigEndianIntegerCoder.of()));
- *
- * Map<String, Integer> map = ...;
- * PCollection<KV<String, Integer>> pt =
- * p.apply(Create.of(map)
- * .withCoder(KvCoder.of(StringUtf8Coder.of(),
- * BigEndianIntegerCoder.of())));
- * } </pre>
- *
- * <p>{@code Create} can automatically determine the {@code Coder} to use
- * if all elements have the same run-time class, and a default coder is registered for that
- * class. See {@link CoderRegistry} for details on how defaults are determined.
- *
- * <p>If a coder can not be inferred, {@link Create.Values#withCoder} must be called
- * explicitly to set the encoding of the resulting
- * {@code PCollection}.
- *
- * <p>A good use for {@code Create} is when a {@code PCollection}
- * needs to be created without dependencies on files or other external
- * entities. This is especially useful during testing.
- *
- * <p>Caveat: {@code Create} only supports small in-memory datasets,
- * particularly when submitting jobs to the Google Cloud Dataflow
- * service.
- *
- * @param <T> the type of the elements of the resulting {@code PCollection}
- */
-public class Create<T> {
- /**
- * Returns a new {@code Create.Values} transform that produces a
- * {@link PCollection} containing elements of the provided
- * {@code Iterable}.
- *
- * <p>The argument should not be modified after this is called.
- *
- * <p>The elements of the output {@link PCollection} will have a timestamp of negative infinity,
- * see {@link Create#timestamped} for a way of creating a {@code PCollection} with timestamped
- * elements.
- *
- * <p>By default, {@code Create.Values} can automatically determine the {@code Coder} to use
- * if all elements have the same non-parameterized run-time class, and a default coder is
- * registered for that class. See {@link CoderRegistry} for details on how defaults are
- * determined.
- * Otherwise, use {@link Create.Values#withCoder} to set the coder explicitly.
- */
- public static <T> Values<T> of(Iterable<T> elems) {
-   // No explicit coder yet; Values infers one when applied unless withCoder() is called.
-   Optional<Coder<T>> unspecifiedCoder = Optional.absent();
-   return new Values<>(elems, unspecifiedCoder);
- }
-
- /**
- * Returns a new {@code Create.Values} transform that produces a
- * {@link PCollection} containing the specified elements.
- *
- * <p>The elements will have a timestamp of negative infinity, see
- * {@link Create#timestamped} for a way of creating a {@code PCollection}
- * with timestamped elements.
- *
- * <p>The arguments should not be modified after this is called.
- *
- * <p>By default, {@code Create.Values} can automatically determine the {@code Coder} to use
- * if all elements have the same non-parameterized run-time class, and a default coder is
- * registered for that class. See {@link CoderRegistry} for details on how defaults are
- * determined.
- * Otherwise, use {@link Create.Values#withCoder} to set the coder explicitly.
- */
- @SafeVarargs
- public static <T> Values<T> of(T... elems) {
-   // Wrap the varargs array as a list and reuse the Iterable-based factory.
-   List<T> elemList = Arrays.asList(elems);
-   return of(elemList);
- }
-
- /**
- * Returns a new {@code Create.Values} transform that produces a
- * {@link PCollection} of {@link KV}s corresponding to the keys and
- * values of the specified {@code Map}.
- *
- * <p>The elements will have a timestamp of negative infinity, see
- * {@link Create#timestamped} for a way of creating a {@code PCollection}
- * with timestamped elements.
- *
- * <p>By default, {@code Create.Values} can automatically determine the {@code Coder} to use
- * if all elements have the same non-parameterized run-time class, and a default coder is
- * registered for that class. See {@link CoderRegistry} for details on how defaults are
- * determined.
- * Otherwise, use {@link Create.Values#withCoder} to set the coder explicitly.
- */
- public static <K, V> Values<KV<K, V>> of(Map<K, V> elems) {
-   // Convert each map entry into a KV, presizing the list to the map's size.
-   List<KV<K, V>> pairs = new ArrayList<>(elems.size());
-   for (Map.Entry<K, V> e : elems.entrySet()) {
-     K key = e.getKey();
-     V value = e.getValue();
-     pairs.add(KV.of(key, value));
-   }
-   return of(pairs);
- }
-
- /**
- * Returns a new {@link Create.TimestampedValues} transform that produces a
- * {@link PCollection} containing the elements of the provided {@code Iterable}
- * with the specified timestamps.
- *
- * <p>The argument should not be modified after this is called.
- *
- * <p>By default, {@code Create.TimestampedValues} can automatically determine the {@code Coder}
- * to use if all elements have the same non-parameterized run-time class, and a default coder is
- * registered for that class. See {@link CoderRegistry} for details on how defaults are
- * determined.
- * Otherwise, use {@link Create.TimestampedValues#withCoder} to set the coder explicitly.
- */
- public static <T> TimestampedValues<T> timestamped(Iterable<TimestampedValue<T>> elems) {
-   // No explicit coder yet; it can be supplied later through withCoder().
-   Optional<Coder<T>> unspecifiedCoder = Optional.absent();
-   return new TimestampedValues<>(elems, unspecifiedCoder);
- }
-
- /**
- * Returns a new {@link Create.TimestampedValues} transform that produces a {@link PCollection}
- * containing the specified elements with the specified timestamps.
- *
- * <p>The arguments should not be modified after this is called.
- */
- @SafeVarargs
- public static <T> TimestampedValues<T> timestamped(
-     @SuppressWarnings("unchecked") TimestampedValue<T>... elems) {
-   // Wrap the varargs array as a list and reuse the Iterable-based factory.
-   List<TimestampedValue<T>> elemList = Arrays.asList(elems);
-   return timestamped(elemList);
- }
-
- /**
- * Returns a new root transform that produces a {@link PCollection} containing
- * the specified elements with the specified timestamps.
- *
- * <p>The arguments should not be modified after this is called.
- *
- * <p>By default, {@code Create.TimestampedValues} can automatically determine the {@code Coder}
- * to use if all elements have the same non-parameterized run-time class, and a default coder
- * is registered for that class. See {@link CoderRegistry} for details on how defaults are
- * determined.
- * Otherwise, use {@link Create.TimestampedValues#withCoder} to set the coder explicitly.
- *
- * @throws IllegalArgumentException if there are a different number of values
- * and timestamps
- */
- public static <T> TimestampedValues<T> timestamped(
-     Iterable<T> values, Iterable<Long> timestamps) {
-   Iterator<T> valueIter = values.iterator();
-   Iterator<Long> timestampIter = timestamps.iterator();
-
-   // Pair values with timestamps positionally until either side runs out.
-   List<TimestampedValue<T>> stamped = new ArrayList<>();
-   while (valueIter.hasNext() && timestampIter.hasNext()) {
-     T value = valueIter.next();
-     // Kept as a boxed Long so the Instant constructor overload chosen is unchanged.
-     Long millis = timestampIter.next();
-     stamped.add(TimestampedValue.of(value, new Instant(millis)));
-   }
-
-   // Anything left over on either side means the two inputs had different lengths.
-   Preconditions.checkArgument(
-       !(valueIter.hasNext() || timestampIter.hasNext()),
-       "Expect sizes of values and timestamps are same.");
-   return timestamped(stamped);
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A {@code PTransform} that creates a {@code PCollection} from a set of in-memory objects.
- */
- public static class Values<T> extends PTransform<PInput, PCollection<T>> {
-   /**
-    * Returns a {@link Create.Values} PTransform like this one that uses the given
-    * {@code Coder<T>} to decode each of the objects into a
-    * value of type {@code T}.
-    *
-    * <p>By default, {@code Create.Values} can automatically determine the {@code Coder} to use
-    * if all elements have the same non-parameterized run-time class, and a default coder is
-    * registered for that class. See {@link CoderRegistry} for details on how defaults are
-    * determined.
-    *
-    * <p>Note that for {@link Create.Values} with no elements, the {@link VoidCoder} is used.
-    */
-   public Values<T> withCoder(Coder<T> coder) {
-     return new Values<>(elems, Optional.of(coder));
-   }
-
-   /** Returns the elements this transform was constructed with (the same iterable, no copy). */
-   public Iterable<T> getElements() {
-     return elems;
-   }
-
-   /**
-    * Creates the bounded, globally windowed output {@link PCollection} and sets its coder to
-    * the result of {@link #getDefaultOutputCoder}.
-    *
-    * @throws IllegalArgumentException if no coder was specified and none can be inferred
-    */
-   @Override
-   public PCollection<T> apply(PInput input) {
-     try {
-       Coder<T> outputCoder = getDefaultOutputCoder(input);
-       return PCollection
-           .<T>createPrimitiveOutputInternal(
-               input.getPipeline(),
-               WindowingStrategy.globalDefault(),
-               IsBounded.BOUNDED)
-           .setCoder(outputCoder);
-     } catch (CannotProvideCoderException e) {
-       throw new IllegalArgumentException("Unable to infer a coder and no Coder was specified. "
-           + "Please set a coder by invoking Create.withCoder() explicitly.", e);
-     }
-   }
-
-   /**
-    * Returns the explicitly specified coder if there is one; otherwise tries to infer a coder,
-    * first from the common run-time class of the non-null elements and then from the element
-    * values themselves.
-    *
-    * @throws CannotProvideCoderException if the elements are not all of the same class, if
-    *     their values do not all default to the same coder, or if no coder can be inferred
-    */
-   @Override
-   public Coder<T> getDefaultOutputCoder(PInput input) throws CannotProvideCoderException {
-     // An explicitly specified coder always takes precedence over inference.
-     if (coder.isPresent()) {
-       return coder.get();
-     }
-     // First try to deduce a coder using the types of the elements.
-     // Void.class doubles as the "no non-null element seen yet" marker.
-     Class<?> elementClazz = Void.class;
-     for (T elem : elems) {
-       if (elem == null) {
-         continue;
-       }
-       Class<?> clazz = elem.getClass();
-       if (elementClazz.equals(Void.class)) {
-         elementClazz = clazz;
-       } else if (!elementClazz.equals(clazz)) {
-         // Elements are not the same type, require a user-specified coder.
-         throw new CannotProvideCoderException(
-             "Cannot provide coder for Create: The elements are not all of the same class.");
-       }
-     }
-
-     // The class-based registry lookup is only attempted for non-parameterized classes.
-     if (elementClazz.getTypeParameters().length == 0) {
-       try {
-         @SuppressWarnings("unchecked") // elementClazz is a wildcard type
-         Coder<T> classCoder = (Coder<T>) input.getPipeline().getCoderRegistry()
-             .getDefaultCoder(TypeDescriptor.of(elementClazz));
-         return classCoder;
-       } catch (CannotProvideCoderException ignored) {
-         // Deliberately ignored: let the value-based stage below try.
-       }
-     }
-
-     // If that fails, try to deduce a coder using the elements themselves.
-     // NOTE(review): unlike the loop above, null elements are not skipped here; confirm that
-     // CoderRegistry#getDefaultCoder accepts a null value.
-     Optional<Coder<T>> valueCoder = Optional.absent();
-     for (T elem : elems) {
-       Coder<T> c = input.getPipeline().getCoderRegistry().getDefaultCoder(elem);
-       if (!valueCoder.isPresent()) {
-         valueCoder = Optional.of(c);
-       } else if (!Objects.equals(c, valueCoder.get())) {
-         throw new CannotProvideCoderException(
-             "Cannot provide coder for elements of " + Create.class.getSimpleName() + ":"
-             + " For their common class, no coder could be provided."
-             + " Based on their values, they do not all default to the same Coder.");
-       }
-     }
-
-     if (!valueCoder.isPresent()) {
-       // The original message stopped after "a coder for "; name the class that was tried.
-       throw new CannotProvideCoderException("Unable to infer a coder. Please register "
-           + "a coder for " + elementClazz.getName()
-           + " or set one explicitly with withCoder().");
-     }
-     return valueCoder.get();
-   }
-
-   /////////////////////////////////////////////////////////////////////////////
-
-   /** The elements of the resulting PCollection. */
-   private final transient Iterable<T> elems;
-
-   /** The coder used to encode the values to and from a binary representation. */
-   private final transient Optional<Coder<T>> coder;
-
-   /**
-    * Constructs a {@code Create.Values} transform that produces a
-    * {@link PCollection} containing the specified elements.
-    *
-    * <p>The arguments should not be modified after this is called.
-    */
-   private Values(Iterable<T> elems, Optional<Coder<T>> coder) {
-     this.elems = elems;
-     this.coder = coder;
-   }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A {@code PTransform} that creates a {@code PCollection} whose elements have
- * associated timestamps.
- */
- public static class TimestampedValues<T> extends Values<T> {
-   /**
-    * Returns a {@link Create.TimestampedValues} PTransform like this one that uses the given
-    * {@code Coder<T>} to decode each of the objects into a
-    * value of type {@code T}.
-    *
-    * <p>By default, {@code Create.TimestampedValues} can automatically determine the
-    * {@code Coder} to use if all elements have the same non-parameterized run-time class,
-    * and a default coder is registered for that class. See {@link CoderRegistry} for details
-    * on how defaults are determined.
-    *
-    * <p>Note that for {@link Create.TimestampedValues} with no elements, the {@link VoidCoder}
-    * is used.
-    */
-   @Override
-   public TimestampedValues<T> withCoder(Coder<T> coder) {
-     Optional<Coder<T>> explicitCoder = Optional.of(coder);
-     return new TimestampedValues<>(elems, explicitCoder);
-   }
-
-   /**
-    * Creates a {@code PCollection} of the timestamped elements, then re-emits each value with
-    * its own timestamp through a {@link ParDo}.
-    *
-    * @throws IllegalArgumentException if no coder was specified and none can be inferred
-    */
-   @Override
-   public PCollection<T> apply(PInput input) {
-     try {
-       Coder<T> valueCoder = getDefaultOutputCoder(input);
-
-       // Step 1: materialize the (value, timestamp) pairs with a matching coder.
-       Values<TimestampedValue<T>> createStamped =
-           Create.of(elems).withCoder(TimestampedValueCoder.of(valueCoder));
-       PCollection<TimestampedValue<T>> stamped = Pipeline.applyTransform(input, createStamped);
-
-       // Step 2: unwrap each pair, emitting the value at its timestamp.
-       PCollection<T> unwrapped = stamped.apply(ParDo.of(new ConvertTimestamps<T>()));
-       unwrapped.setCoder(valueCoder);
-       return unwrapped;
-     } catch (CannotProvideCoderException e) {
-       throw new IllegalArgumentException("Unable to infer a coder and no Coder was specified. "
-           + "Please set a coder by invoking CreateTimestamped.withCoder() explicitly.", e);
-     }
-   }
-
-   /////////////////////////////////////////////////////////////////////////////
-
-   /** The timestamped elements of the resulting PCollection. */
-   private final transient Iterable<TimestampedValue<T>> elems;
-
-   /**
-    * Hands the superclass an iterable over just the values, and keeps the timestamped
-    * elements for {@link #apply}.
-    */
-   private TimestampedValues(Iterable<TimestampedValue<T>> elems,
-       Optional<Coder<T>> coder) {
-     super(valuesOf(elems), coder);
-     this.elems = elems;
-   }
-
-   /** Returns an iterable over the values of the given timestamped elements. */
-   private static <T> Iterable<T> valuesOf(Iterable<TimestampedValue<T>> stamped) {
-     return Iterables.transform(stamped, new Function<TimestampedValue<T>, T>() {
-       @Override
-       public T apply(TimestampedValue<T> input) {
-         return input.getValue();
-       }
-     });
-   }
-
-   /** Emits the value of each input element at that element's own timestamp. */
-   private static class ConvertTimestamps<T> extends DoFn<TimestampedValue<T>, T> {
-     @Override
-     public void processElement(ProcessContext c) {
-       c.outputWithTimestamp(c.element().getValue(), c.element().getTimestamp());
-     }
-   }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- static {
-   // Register the evaluator as soon as this class is initialized.
-   registerDefaultTransformEvaluator();
- }
-
- @SuppressWarnings({"rawtypes", "unchecked"})
- private static void registerDefaultTransformEvaluator() {
-   // The evaluator is declared on the raw Values type and hands off to the generic helper.
-   DirectPipelineRunner.TransformEvaluator<Create.Values> evaluator =
-       new DirectPipelineRunner.TransformEvaluator<Create.Values>() {
-         @Override
-         public void evaluate(
-             Create.Values transform,
-             DirectPipelineRunner.EvaluationContext context) {
-           evaluateHelper(transform, context);
-         }
-       };
-   DirectPipelineRunner.registerDefaultTransformEvaluator(Create.Values.class, evaluator);
- }
-
- private static <T> void evaluateHelper(
-     Create.Values<T> transform,
-     DirectPipelineRunner.EvaluationContext context) {
-   // Copy the elements into a list, presized when the source is a Collection.
-   List<T> encodable =
-       (transform.elems instanceof Collection)
-           ? new ArrayList<T>(((Collection<T>) transform.elems).size())
-           : new ArrayList<T>();
-
-   // Each element is passed through ensureElementEncodable before being stored.
-   for (T elem : transform.elems) {
-     encodable.add(
-         context.ensureElementEncodable(context.getOutput(transform), elem));
-   }
-
-   context.setPCollection(context.getOutput(transform), encodable);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/DoFn.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/DoFn.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/DoFn.java
deleted file mode 100644
index 5ba9992..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/DoFn.java
+++ /dev/null
@@ -1,563 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkNotNull;
-import static com.google.common.base.Preconditions.checkState;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
-import com.google.cloud.dataflow.sdk.transforms.display.HasDisplayData;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo;
-import com.google.cloud.dataflow.sdk.util.WindowingInternals;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-import com.google.common.base.MoreObjects;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-
-import java.io.Serializable;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Objects;
-import java.util.UUID;
-
-/**
- * The argument to {@link ParDo} providing the code to use to process
- * elements of the input
- * {@link com.google.cloud.dataflow.sdk.values.PCollection}.
- *
- * <p>See {@link ParDo} for more explanation, examples of use, and
- * discussion of constraints on {@code DoFn}s, including their
- * serializability, lack of access to global shared mutable state,
- * requirements for failure tolerance, and benefits of optimization.
- *
- * <p>{@code DoFn}s can be tested in the context of a particular
- * {@code Pipeline} by running that {@code Pipeline} on sample input
- * and then checking its output. Unit testing of a {@code DoFn},
- * separately from any {@code ParDo} transform or {@code Pipeline},
- * can be done via the {@link DoFnTester} harness.
- *
- * <p>{@link DoFnWithContext} (currently experimental) offers an alternative
- * mechanism for accessing {@link ProcessContext#window()} without the need
- * to implement {@link RequiresWindowAccess}.
- *
- * <p>See also {@link #processElement} for details on implementing the transformation
- * from {@code InputT} to {@code OutputT}.
- *
- * @param <InputT> the type of the (main) input elements
- * @param <OutputT> the type of the (main) output elements
- */
-public abstract class DoFn<InputT, OutputT> implements Serializable, HasDisplayData {
-
- /**
- * Information accessible to all methods in this {@code DoFn}.
- * Used primarily to output elements.
- */
- public abstract class Context {
-
- /**
- * Returns the {@code PipelineOptions} specified with the
- * {@link com.google.cloud.dataflow.sdk.runners.PipelineRunner}
- * invoking this {@code DoFn}. The {@code PipelineOptions} will
- * be the default when running via {@link DoFnTester}.
- */
- public abstract PipelineOptions getPipelineOptions();
-
- /**
- * Adds the given element to the main output {@code PCollection}.
- *
- * <p>Once passed to {@code output} the element should be considered
- * immutable and not be modified in any way. It may be cached or retained
- * by the Dataflow runtime or later steps in the pipeline, or used in
- * other unspecified ways.
- *
- * <p>If invoked from {@link DoFn#processElement processElement}, the output
- * element will have the same timestamp and be in the same windows
- * as the input element passed to {@link DoFn#processElement processElement}.
- *
- * <p>If invoked from {@link #startBundle startBundle} or {@link #finishBundle finishBundle},
- * this will attempt to use the
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * of the input {@code PCollection} to determine what windows the element
- * should be in, throwing an exception if the {@code WindowFn} attempts
- * to access any information about the input element. The output element
- * will have a timestamp of negative infinity.
- */
- public abstract void output(OutputT output);
-
- /**
- * Adds the given element to the main output {@code PCollection},
- * with the given timestamp.
- *
- * <p>Once passed to {@code outputWithTimestamp} the element should not be
- * modified in any way.
- *
- * <p>If invoked from {@link DoFn#processElement processElement}, the timestamp
- * must not be older than the input element's timestamp minus
- * {@link DoFn#getAllowedTimestampSkew getAllowedTimestampSkew}. The output element will
- * be in the same windows as the input element.
- *
- * <p>If invoked from {@link #startBundle startBundle} or {@link #finishBundle finishBundle},
- * this will attempt to use the
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * of the input {@code PCollection} to determine what windows the element
- * should be in, throwing an exception if the {@code WindowFn} attempts
- * to access any information about the input element except for the
- * timestamp.
- */
- public abstract void outputWithTimestamp(OutputT output, Instant timestamp);
-
- /**
- * Adds the given element to the side output {@code PCollection} with the
- * given tag.
- *
- * <p>Once passed to {@code sideOutput} the element should not be modified
- * in any way.
- *
- * <p>The caller of {@code ParDo} uses {@link ParDo#withOutputTags withOutputTags} to
- * specify the tags of side outputs that it consumes. Non-consumed side
- * outputs, e.g., outputs for monitoring purposes only, don't necessarily
- * need to be specified.
- *
- * <p>If invoked from {@link DoFn#processElement processElement}, the output
- * element will have the same timestamp and be in the same windows as the input
- * element passed to {@link DoFn#processElement processElement}.
- *
- * <p>If invoked from {@link #startBundle startBundle} or {@link #finishBundle finishBundle},
- * this will attempt to use the
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * of the input {@code PCollection} to determine what windows the element
- * should be in, throwing an exception if the {@code WindowFn} attempts
- * to access any information about the input element. The output element
- * will have a timestamp of negative infinity.
- *
- * @see ParDo#withOutputTags
- */
- public abstract <T> void sideOutput(TupleTag<T> tag, T output);
-
- /**
- * Adds the given element to the specified side output {@code PCollection},
- * with the given timestamp.
- *
- * <p>Once passed to {@code sideOutputWithTimestamp} the element should not be
- * modified in any way.
- *
- * <p>If invoked from {@link DoFn#processElement processElement}, the timestamp
- * must not be older than the input element's timestamp minus
- * {@link DoFn#getAllowedTimestampSkew getAllowedTimestampSkew}. The output element will
- * be in the same windows as the input element.
- *
- * <p>If invoked from {@link #startBundle startBundle} or {@link #finishBundle finishBundle},
- * this will attempt to use the
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * of the input {@code PCollection} to determine what windows the element
- * should be in, throwing an exception if the {@code WindowFn} attempts
- * to access any information about the input element except for the
- * timestamp.
- *
- * @see ParDo#withOutputTags
- */
- public abstract <T> void sideOutputWithTimestamp(
- TupleTag<T> tag, T output, Instant timestamp);
-
- /**
- * Creates an {@link Aggregator} in the {@link DoFn} context with the
- * specified name and aggregation logic specified by {@link CombineFn}.
- *
- * <p>For internal use only.
- *
- * @param name the name of the aggregator
- * @param combiner the {@link CombineFn} to use in the aggregator
- * @return an aggregator for the provided name and {@link CombineFn} in this
- * context
- */
- @Experimental(Kind.AGGREGATOR)
- protected abstract <AggInputT, AggOutputT> Aggregator<AggInputT, AggOutputT>
- createAggregatorInternal(String name, CombineFn<AggInputT, ?, AggOutputT> combiner);
-
- /**
- * Sets up {@link Aggregator}s created by the {@link DoFn} so they are
- * usable within this context.
- *
- * <p>This method should be called by runners before {@link DoFn#startBundle}
- * is executed.
- */
- @Experimental(Kind.AGGREGATOR)
- protected final void setupDelegateAggregators() {
-   // Give every aggregator registered on the enclosing DoFn a delegate from this context.
-   Collection<DelegatingAggregator<?, ?>> registered = aggregators.values();
-   for (DelegatingAggregator<?, ?> pending : registered) {
-     setupDelegateAggregator(pending);
-   }
-
-   // From here on, createAggregator() on the enclosing DoFn fails its state check.
-   DoFn.this.aggregatorsAreFinal = true;
- }
-
- private final <AggInputT, AggOutputT> void setupDelegateAggregator(
-     DelegatingAggregator<AggInputT, AggOutputT> aggregator) {
-   // Ask this context for an aggregator with the same name and combine function ...
-   String aggregatorName = aggregator.getName();
-   Aggregator<AggInputT, AggOutputT> contextAggregator =
-       createAggregatorInternal(aggregatorName, aggregator.getCombineFn());
-
-   // ... and install it as the delegate of the DoFn-level aggregator.
-   aggregator.setDelegate(contextAggregator);
- }
- }
-
- /**
- * Information accessible when running {@link DoFn#processElement}.
- */
- public abstract class ProcessContext extends Context {
-
- /**
- * Returns the input element to be processed.
- *
- * <p>The element should be considered immutable. The Dataflow runtime will not mutate the
- * element, so it is safe to cache, etc. The element should not be mutated by any of the
- * {@link DoFn} methods, because it may be cached elsewhere, retained by the Dataflow runtime,
- * or used in other unspecified ways.
- */
- public abstract InputT element();
-
- /**
- * Returns the value of the side input for the window corresponding to the
- * window of the main input element.
- *
- * <p>See
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn#getSideInputWindow}
- * for how this corresponding window is determined.
- *
- * @throws IllegalArgumentException if this is not a side input
- * @see ParDo#withSideInputs
- */
- public abstract <T> T sideInput(PCollectionView<T> view);
-
- /**
- * Returns the timestamp of the input element.
- *
- * <p>See {@link com.google.cloud.dataflow.sdk.transforms.windowing.Window}
- * for more information.
- */
- public abstract Instant timestamp();
-
- /**
- * Returns the window into which the input element has been assigned.
- *
- * <p>See {@link com.google.cloud.dataflow.sdk.transforms.windowing.Window}
- * for more information.
- *
- * @throws UnsupportedOperationException if this {@link DoFn} does
- * not implement {@link RequiresWindowAccess}.
- */
- public abstract BoundedWindow window();
-
- /**
- * Returns information about the pane within this window into which the
- * input element has been assigned.
- *
- * <p>Generally all data is in a single, uninteresting pane unless custom
- * triggering and/or late data has been explicitly requested.
- * See {@link com.google.cloud.dataflow.sdk.transforms.windowing.Window}
- * for more information.
- */
- public abstract PaneInfo pane();
-
- /**
- * Returns the process context to use for implementing windowing.
- */
- @Experimental
- public abstract WindowingInternals<InputT, OutputT> windowingInternals();
- }
-
- /**
- * Returns the allowed timestamp skew duration, which is the maximum
- * duration that timestamps can be shifted backward in
- * {@link DoFn.Context#outputWithTimestamp}.
- *
- * <p>The default value is {@code Duration.ZERO}, in which case
- * timestamps can only be shifted forward into the future. For infinite
- * skew, return {@code Duration.millis(Long.MAX_VALUE)}.
- *
- * <p> Note that producing an element whose timestamp is less than the
- * current timestamp may result in late data, i.e. returning a non-zero
- * value here does not impact watermark calculations used for firing
- * windows.
- *
- * @deprecated does not interact well with the watermark.
- */
- @Deprecated
- public Duration getAllowedTimestampSkew() {
- // Default: zero skew, i.e. no backward shift of output timestamps is allowed.
- return Duration.ZERO;
- }
-
- /**
- * Interface for signaling that a {@link DoFn} needs to access the window the
- * element is being processed in, via {@link DoFn.ProcessContext#window}.
- */
- @Experimental
- public interface RequiresWindowAccess {}
-
- public DoFn() {
- // Delegate to the map-taking constructor with a fresh, empty aggregator map.
- this(new HashMap<String, DelegatingAggregator<?, ?>>());
- }
-
- DoFn(Map<String, DelegatingAggregator<?, ?>> aggregators) {
- // The map is stored as-is (no copy); createAggregator() later adds entries to it.
- this.aggregators = aggregators;
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- private final Map<String, DelegatingAggregator<?, ?>> aggregators;
-
- /**
- * Protects aggregators from being created after initialization.
- */
- private boolean aggregatorsAreFinal;
-
- /**
- * Prepares this {@code DoFn} instance for processing a batch of elements.
- *
- * <p>By default, does nothing.
- */
- public void startBundle(Context c) throws Exception {
- // Intentionally empty: the default implementation does nothing.
- }
-
- /**
- * Processes one input element.
- *
- * <p>The current element of the input {@code PCollection} is returned by
- * {@link ProcessContext#element() c.element()}. It should be considered immutable. The Dataflow
- * runtime will not mutate the element, so it is safe to cache, etc. The element should not be
- * mutated by any of the {@link DoFn} methods, because it may be cached elsewhere, retained by the
- * Dataflow runtime, or used in other unspecified ways.
- *
- * <p>A value is added to the main output {@code PCollection} by {@link ProcessContext#output}.
- * Once passed to {@code output} the element should be considered immutable and not be modified in
- * any way. It may be cached elsewhere, retained by the Dataflow runtime, or used in other
- * unspecified ways.
- *
- * @see ProcessContext
- */
- public abstract void processElement(ProcessContext c) throws Exception;
-
- /**
- * Finishes processing this batch of elements.
- *
- * <p>By default, does nothing.
- */
- public void finishBundle(Context c) throws Exception {
- // Intentionally empty: the default implementation does nothing.
- }
-
- /**
- * {@inheritDoc}
- *
- * <p>By default, does not register any display data. Implementors may override this method
- * to provide their own display metadata.
- */
- @Override
- public void populateDisplayData(DisplayData.Builder builder) {
- // Intentionally empty: no display data is registered by default.
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * Returns a {@link TypeDescriptor} capturing what is known statically
- * about the input type of this {@code DoFn} instance's most-derived
- * class.
- *
- * <p>See {@link #getOutputTypeDescriptor} for more discussion.
- */
- protected TypeDescriptor<InputT> getInputTypeDescriptor() {
-   // The anonymous subclass is built from this instance's run-time class.
-   TypeDescriptor<InputT> inputType = new TypeDescriptor<InputT>(getClass()) {};
-   return inputType;
- }
-
- /**
- * Returns a {@link TypeDescriptor} capturing what is known statically
- * about the output type of this {@code DoFn} instance's
- * most-derived class.
- *
- * <p>In the normal case of a concrete {@code DoFn} subclass with
- * no generic type parameters of its own (including anonymous inner
- * classes), this will be a complete non-generic type, which is good
- * for choosing a default output {@code Coder<OutputT>} for the output
- * {@code PCollection<OutputT>}.
- */
- protected TypeDescriptor<OutputT> getOutputTypeDescriptor() {
- return new TypeDescriptor<OutputT>(getClass()) {};
- }
-
- /**
- * Returns an {@link Aggregator} with aggregation logic specified by the
- * {@link CombineFn} argument. The name provided must be unique across
- * {@link Aggregator}s created within the DoFn. Aggregators can only be created
- * during pipeline construction.
- *
- * @param name the name of the aggregator
- * @param combiner the {@link CombineFn} to use in the aggregator
- * @return an aggregator for the provided name and combiner in the scope of
- * this DoFn
- * @throws NullPointerException if the name or combiner is null
- * @throws IllegalArgumentException if the given name collides with another
- * aggregator in this scope
- * @throws IllegalStateException if called during pipeline processing.
- */
- protected final <AggInputT, AggOutputT> Aggregator<AggInputT, AggOutputT>
- createAggregator(String name, CombineFn<? super AggInputT, ?, AggOutputT> combiner) {
- checkNotNull(name, "name cannot be null");
- checkNotNull(combiner, "combiner cannot be null");
- checkArgument(!aggregators.containsKey(name),
- "Cannot create aggregator with name %s."
- + " An Aggregator with that name already exists within this scope.",
- name);
-
- checkState(!aggregatorsAreFinal, "Cannot create an aggregator during DoFn processing."
- + " Aggregators should be registered during pipeline construction.");
-
- DelegatingAggregator<AggInputT, AggOutputT> aggregator =
- new DelegatingAggregator<>(name, combiner);
- aggregators.put(name, aggregator);
- return aggregator;
- }
-
- /**
- * Returns an {@link Aggregator} with the aggregation logic specified by the
- * {@link SerializableFunction} argument. The name provided must be unique
- * across {@link Aggregator}s created within the DoFn. Aggregators can only be
- * created during pipeline construction.
- *
- * @param name the name of the aggregator
- * @param combiner the {@link SerializableFunction} to use in the aggregator
- * @return an aggregator for the provided name and combiner in the scope of
- * this DoFn
- * @throws NullPointerException if the name or combiner is null
- * @throws IllegalArgumentException if the given name collides with another
- * aggregator in this scope
- * @throws IllegalStateException if called during pipeline processing.
- */
- protected final <AggInputT> Aggregator<AggInputT, AggInputT> createAggregator(String name,
- SerializableFunction<Iterable<AggInputT>, AggInputT> combiner) {
- checkNotNull(combiner, "combiner cannot be null.");
- return createAggregator(name, Combine.IterableCombineFn.of(combiner));
- }
-
- /**
- * Returns the {@link Aggregator Aggregators} created by this {@code DoFn}.
- */
- Collection<Aggregator<?, ?>> getAggregators() {
- return Collections.<Aggregator<?, ?>>unmodifiableCollection(aggregators.values());
- }
-
- /**
- * An {@link Aggregator} that delegates calls to addValue to another
- * aggregator.
- *
- * @param <AggInputT> the type of input element
- * @param <AggOutputT> the type of output element
- */
- static class DelegatingAggregator<AggInputT, AggOutputT> implements
- Aggregator<AggInputT, AggOutputT>, Serializable {
- private final UUID id;
-
- private final String name;
-
- private final CombineFn<AggInputT, ?, AggOutputT> combineFn;
-
- private Aggregator<AggInputT, ?> delegate;
-
- public DelegatingAggregator(String name,
- CombineFn<? super AggInputT, ?, AggOutputT> combiner) {
- this.id = UUID.randomUUID();
- this.name = checkNotNull(name, "name cannot be null");
- // Safe contravariant cast
- @SuppressWarnings("unchecked")
- CombineFn<AggInputT, ?, AggOutputT> specificCombiner =
- (CombineFn<AggInputT, ?, AggOutputT>) checkNotNull(combiner, "combineFn cannot be null");
- this.combineFn = specificCombiner;
- }
-
- @Override
- public void addValue(AggInputT value) {
- if (delegate == null) {
- throw new IllegalStateException(
- "addValue cannot be called on Aggregator outside of the execution of a DoFn.");
- } else {
- delegate.addValue(value);
- }
- }
-
- @Override
- public String getName() {
- return name;
- }
-
- @Override
- public CombineFn<AggInputT, ?, AggOutputT> getCombineFn() {
- return combineFn;
- }
-
- /**
- * Sets the current delegate of the Aggregator.
- *
- * @param delegate the delegate to set in this aggregator
- */
- public void setDelegate(Aggregator<AggInputT, ?> delegate) {
- this.delegate = delegate;
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(getClass())
- .add("name", name)
- .add("combineFn", combineFn)
- .toString();
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(id, name, combineFn.getClass());
- }
-
- /**
- * Indicates whether some other object is "equal to" this one.
- *
- * <p>{@code DelegatingAggregator} instances are equal if they have the same name, their
- * CombineFns are the same class, and they have identical IDs.
- */
- @Override
- public boolean equals(Object o) {
- if (o == this) {
- return true;
- }
- if (o == null) {
- return false;
- }
- if (o instanceof DelegatingAggregator) {
- DelegatingAggregator<?, ?> that = (DelegatingAggregator<?, ?>) o;
- return Objects.equals(this.id, that.id)
- && Objects.equals(this.name, that.name)
- && Objects.equals(this.combineFn.getClass(), that.combineFn.getClass());
- }
- return false;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/DoFnReflector.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/DoFnReflector.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/DoFnReflector.java
deleted file mode 100644
index 1c46541..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/DoFnReflector.java
+++ /dev/null
@@ -1,668 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.transforms.DoFnWithContext.ExtraContextFactory;
-import com.google.cloud.dataflow.sdk.transforms.DoFnWithContext.FinishBundle;
-import com.google.cloud.dataflow.sdk.transforms.DoFnWithContext.ProcessElement;
-import com.google.cloud.dataflow.sdk.transforms.DoFnWithContext.StartBundle;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo;
-import com.google.cloud.dataflow.sdk.util.UserCodeException;
-import com.google.cloud.dataflow.sdk.util.WindowingInternals;
-import com.google.cloud.dataflow.sdk.util.common.ReflectHelpers;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Function;
-import com.google.common.base.Throwables;
-import com.google.common.collect.FluentIterable;
-import com.google.common.collect.ImmutableMap;
-import com.google.common.reflect.TypeParameter;
-import com.google.common.reflect.TypeToken;
-
-import org.joda.time.Instant;
-
-import java.io.IOException;
-import java.lang.annotation.Annotation;
-import java.lang.reflect.InvocationTargetException;
-import java.lang.reflect.Method;
-import java.lang.reflect.Modifier;
-import java.lang.reflect.ParameterizedType;
-import java.lang.reflect.Type;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.LinkedHashMap;
-import java.util.LinkedHashSet;
-import java.util.Map;
-
-import javax.annotation.Nullable;
-
-/**
- * Utility implementing the necessary reflection for working with {@link DoFnWithContext}s.
- */
-public abstract class DoFnReflector {
-
- private interface ExtraContextInfo {
- /**
- * Create an instance of the given instance using the instance factory.
- */
- <InputT, OutputT> Object createInstance(
- DoFnWithContext.ExtraContextFactory<InputT, OutputT> factory);
-
- /**
- * Create the type token for the given type, filling in the generics.
- */
- <InputT, OutputT> TypeToken<?> tokenFor(TypeToken<InputT> in, TypeToken<OutputT> out);
- }
-
- private static final Map<Class<?>, ExtraContextInfo> EXTRA_CONTEXTS = Collections.emptyMap();
- private static final Map<Class<?>, ExtraContextInfo> EXTRA_PROCESS_CONTEXTS =
- ImmutableMap.<Class<?>, ExtraContextInfo>builder()
- .putAll(EXTRA_CONTEXTS)
- .put(BoundedWindow.class, new ExtraContextInfo() {
- @Override
- public <InputT, OutputT> Object
- createInstance(ExtraContextFactory<InputT, OutputT> factory) {
- return factory.window();
- }
-
- @Override
- public <InputT, OutputT> TypeToken<?>
- tokenFor(TypeToken<InputT> in, TypeToken<OutputT> out) {
- return TypeToken.of(BoundedWindow.class);
- }
- })
- .put(WindowingInternals.class, new ExtraContextInfo() {
- @Override
- public <InputT, OutputT> Object
- createInstance(ExtraContextFactory<InputT, OutputT> factory) {
- return factory.windowingInternals();
- }
-
- @Override
- public <InputT, OutputT> TypeToken<?>
- tokenFor(TypeToken<InputT> in, TypeToken<OutputT> out) {
- return new TypeToken<WindowingInternals<InputT, OutputT>>() {
- }
- .where(new TypeParameter<InputT>() {}, in)
- .where(new TypeParameter<OutputT>() {}, out);
- }
- })
- .build();
-
- /**
- * @return true if the reflected {@link DoFnWithContext} uses a Single Window.
- */
- public abstract boolean usesSingleWindow();
-
- /**
- * Invoke the reflected {@link ProcessElement} method on the given instance.
- *
- * @param fn an instance of the {@link DoFnWithContext} to invoke {@link ProcessElement} on.
- * @param c the {@link com.google.cloud.dataflow.sdk.transforms.DoFnWithContext.ProcessContext}
- * to pass to {@link ProcessElement}.
- */
- abstract <InputT, OutputT> void invokeProcessElement(
- DoFnWithContext<InputT, OutputT> fn,
- DoFnWithContext<InputT, OutputT>.ProcessContext c,
- ExtraContextFactory<InputT, OutputT> extra);
-
- /**
- * Invoke the reflected {@link StartBundle} method on the given instance.
- *
- * @param fn an instance of the {@link DoFnWithContext} to invoke {@link StartBundle} on.
- * @param c the {@link com.google.cloud.dataflow.sdk.transforms.DoFnWithContext.Context}
- * to pass to {@link StartBundle}.
- */
- <InputT, OutputT> void invokeStartBundle(
- DoFnWithContext<InputT, OutputT> fn,
- DoFnWithContext<InputT, OutputT>.Context c,
- ExtraContextFactory<InputT, OutputT> extra) {
- fn.prepareForProcessing();
- }
-
- /**
- * Invoke the reflected {@link FinishBundle} method on the given instance.
- *
- * @param fn an instance of the {@link DoFnWithContext} to invoke {@link FinishBundle} on.
- * @param c the {@link com.google.cloud.dataflow.sdk.transforms.DoFnWithContext.Context}
- * to pass to {@link FinishBundle}.
- */
- abstract <InputT, OutputT> void invokeFinishBundle(
- DoFnWithContext<InputT, OutputT> fn,
- DoFnWithContext<InputT, OutputT>.Context c,
- ExtraContextFactory<InputT, OutputT> extra);
-
- private static final Map<Class<?>, DoFnReflector> REFLECTOR_CACHE =
- new LinkedHashMap<Class<?>, DoFnReflector>();
-
- /**
- * @return the {@link DoFnReflector} for the given {@link DoFnWithContext}.
- */
- public static DoFnReflector of(
- @SuppressWarnings("rawtypes") Class<? extends DoFnWithContext> fn) {
- DoFnReflector reflector = REFLECTOR_CACHE.get(fn);
- if (reflector != null) {
- return reflector;
- }
-
- reflector = new GenericDoFnReflector(fn);
- REFLECTOR_CACHE.put(fn, reflector);
- return reflector;
- }
-
- /**
- * Create a {@link DoFn} that the {@link DoFnWithContext}.
- */
- public <InputT, OutputT> DoFn<InputT, OutputT> toDoFn(DoFnWithContext<InputT, OutputT> fn) {
- if (usesSingleWindow()) {
- return new WindowDoFnAdapter<InputT, OutputT>(this, fn);
- } else {
- return new SimpleDoFnAdapter<InputT, OutputT>(this, fn);
- }
- }
-
- private static String formatType(TypeToken<?> t) {
- return ReflectHelpers.TYPE_SIMPLE_DESCRIPTION.apply(t.getType());
- }
-
- private static String format(Method m) {
- return ReflectHelpers.CLASS_AND_METHOD_FORMATTER.apply(m);
- }
-
- private static Collection<String> describeSupportedTypes(
- Map<Class<?>, ExtraContextInfo> extraProcessContexts,
- final TypeToken<?> in, final TypeToken<?> out) {
- return FluentIterable
- .from(extraProcessContexts.values())
- .transform(new Function<ExtraContextInfo, String>() {
- @Override
- @Nullable
- public String apply(@Nullable ExtraContextInfo input) {
- if (input == null) {
- return null;
- } else {
- return formatType(input.tokenFor(in, out));
- }
- }
- })
- .toSortedSet(String.CASE_INSENSITIVE_ORDER);
- }
-
- @VisibleForTesting
- static <InputT, OutputT> ExtraContextInfo[] verifyProcessMethodArguments(Method m) {
- return verifyMethodArguments(m,
- EXTRA_PROCESS_CONTEXTS,
- new TypeToken<DoFnWithContext<InputT, OutputT>.ProcessContext>() {
- },
- new TypeParameter<InputT>() {},
- new TypeParameter<OutputT>() {});
- }
-
- @VisibleForTesting
- static <InputT, OutputT> ExtraContextInfo[] verifyBundleMethodArguments(Method m) {
- return verifyMethodArguments(m,
- EXTRA_CONTEXTS,
- new TypeToken<DoFnWithContext<InputT, OutputT>.Context>() {
- },
- new TypeParameter<InputT>() {},
- new TypeParameter<OutputT>() {});
- }
-
- /**
- * Verify the method arguments for a given {@link DoFnWithContext} method.
- *
- * <p>The requirements for a method to be valid, are:
- * <ol>
- * <li>The method has at least one argument.
- * <li>The first argument is of type firstContextArg.
- * <li>The remaining arguments have raw types that appear in {@code contexts}
- * <li>Any generics on the extra context arguments match what is expected. Eg.,
- * {@code WindowingInternals<InputT, OutputT>} either matches the
- * {@code InputT} and {@code OutputT} parameters of the
- * {@code DoFn<InputT, OutputT>.ProcessContext}, or it uses a wildcard, etc.
- * </ol>
- *
- * @param m the method to verify
- * @param contexts mapping from raw classes to the {@link ExtraContextInfo} used
- * to create new instances.
- * @param firstContextArg the expected type of the first context argument
- * @param iParam TypeParameter representing the input type
- * @param oParam TypeParameter representing the output type
- */
- @VisibleForTesting static <InputT, OutputT> ExtraContextInfo[] verifyMethodArguments(Method m,
- Map<Class<?>, ExtraContextInfo> contexts,
- TypeToken<?> firstContextArg, TypeParameter<InputT> iParam, TypeParameter<OutputT> oParam) {
-
- if (!void.class.equals(m.getReturnType())) {
- throw new IllegalStateException(String.format(
- "%s must have a void return type", format(m)));
- }
- if (m.isVarArgs()) {
- throw new IllegalStateException(String.format(
- "%s must not have var args", format(m)));
- }
-
- // The first parameter must be present, and must be the specified type
- Type[] params = m.getGenericParameterTypes();
- TypeToken<?> contextToken = null;
- if (params.length > 0) {
- contextToken = TypeToken.of(params[0]);
- }
- if (contextToken == null
- || !contextToken.getRawType().equals(firstContextArg.getRawType())) {
- throw new IllegalStateException(String.format(
- "%s must take a %s as its first argument",
- format(m), firstContextArg.getRawType().getSimpleName()));
- }
- ExtraContextInfo[] contextInfos = new ExtraContextInfo[params.length - 1];
-
- // Fill in the generics in the allExtraContextArgs interface from the types in the
- // Context or ProcessContext DoFn.
- ParameterizedType pt = (ParameterizedType) contextToken.getType();
- // We actually want the owner, since ProcessContext and Context are owned by DoFnWithContext.
- pt = (ParameterizedType) pt.getOwnerType();
- @SuppressWarnings("unchecked")
- TypeToken<InputT> iActual = (TypeToken<InputT>) TypeToken.of(pt.getActualTypeArguments()[0]);
- @SuppressWarnings("unchecked")
- TypeToken<OutputT> oActual = (TypeToken<OutputT>) TypeToken.of(pt.getActualTypeArguments()[1]);
-
- // All of the remaining parameters must be a super-interface of allExtraContextArgs
- // that is not listed in the EXCLUDED_INTERFACES set.
- for (int i = 1; i < params.length; i++) {
- TypeToken<?> param = TypeToken.of(params[i]);
-
- ExtraContextInfo info = contexts.get(param.getRawType());
- if (info == null) {
- throw new IllegalStateException(String.format(
- "%s is not a valid context parameter for method %s. Should be one of %s",
- formatType(param), format(m),
- describeSupportedTypes(contexts, iActual, oActual)));
- }
-
- // If we get here, the class matches, but maybe the generics don't:
- TypeToken<?> expected = info.tokenFor(iActual, oActual);
- if (!expected.isSubtypeOf(param)) {
- throw new IllegalStateException(String.format(
- "Incompatible generics in context parameter %s for method %s. Should be %s",
- formatType(param), format(m), formatType(info.tokenFor(iActual, oActual))));
- }
-
- // Register the (now validated) context info
- contextInfos[i - 1] = info;
- }
- return contextInfos;
- }
-
- /**
- * Implementation of {@link DoFnReflector} for the arbitrary {@link DoFnWithContext}.
- */
- private static class GenericDoFnReflector extends DoFnReflector {
-
- private Method startBundle;
- private Method processElement;
- private Method finishBundle;
- private ExtraContextInfo[] processElementArgs;
- private ExtraContextInfo[] startBundleArgs;
- private ExtraContextInfo[] finishBundleArgs;
-
- private GenericDoFnReflector(Class<?> fn) {
- // Locate the annotated methods
- this.processElement = findAnnotatedMethod(ProcessElement.class, fn, true);
- this.startBundle = findAnnotatedMethod(StartBundle.class, fn, false);
- this.finishBundle = findAnnotatedMethod(FinishBundle.class, fn, false);
-
- // Verify that their method arguments satisfy our conditions.
- processElementArgs = verifyProcessMethodArguments(processElement);
- if (startBundle != null) {
- startBundleArgs = verifyBundleMethodArguments(startBundle);
- }
- if (finishBundle != null) {
- finishBundleArgs = verifyBundleMethodArguments(finishBundle);
- }
- }
-
- private static Collection<Method> declaredMethodsWithAnnotation(
- Class<? extends Annotation> anno,
- Class<?> startClass, Class<?> stopClass) {
- Collection<Method> matches = new ArrayList<>();
-
- Class<?> clazz = startClass;
- LinkedHashSet<Class<?>> interfaces = new LinkedHashSet<>();
-
- // First, find all declared methods on the startClass and parents (up to stopClass)
- while (clazz != null && !clazz.equals(stopClass)) {
- for (Method method : clazz.getDeclaredMethods()) {
- if (method.isAnnotationPresent(anno)) {
- matches.add(method);
- }
- }
-
- Collections.addAll(interfaces, clazz.getInterfaces());
-
- clazz = clazz.getSuperclass();
- }
-
- // Now, iterate over all the discovered interfaces
- for (Method method : ReflectHelpers.getClosureOfMethodsOnInterfaces(interfaces)) {
- if (method.isAnnotationPresent(anno)) {
- matches.add(method);
- }
- }
- return matches;
- }
-
- private static Method findAnnotatedMethod(
- Class<? extends Annotation> anno, Class<?> fnClazz, boolean required) {
- Collection<Method> matches = declaredMethodsWithAnnotation(
- anno, fnClazz, DoFnWithContext.class);
-
- if (matches.size() == 0) {
- if (required == true) {
- throw new IllegalStateException(String.format(
- "No method annotated with @%s found in %s",
- anno.getSimpleName(), fnClazz.getName()));
- } else {
- return null;
- }
- }
-
- // If we have at least one match, then either it should be the only match
- // or it should be an extension of the other matches (which came from parent
- // classes).
- Method first = matches.iterator().next();
- for (Method other : matches) {
- if (!first.getName().equals(other.getName())
- || !Arrays.equals(first.getParameterTypes(), other.getParameterTypes())) {
- throw new IllegalStateException(String.format(
- "Found multiple methods annotated with @%s. [%s] and [%s]",
- anno.getSimpleName(), format(first), format(other)));
- }
- }
-
- // We need to be able to call it. We require it is public.
- if ((first.getModifiers() & Modifier.PUBLIC) == 0) {
- throw new IllegalStateException(format(first) + " must be public");
- }
-
- // And make sure its not static.
- if ((first.getModifiers() & Modifier.STATIC) != 0) {
- throw new IllegalStateException(format(first) + " must not be static");
- }
-
- first.setAccessible(true);
- return first;
- }
-
- @Override
- public boolean usesSingleWindow() {
- return usesContext(BoundedWindow.class);
- }
-
- private boolean usesContext(Class<?> context) {
- for (Class<?> clazz : processElement.getParameterTypes()) {
- if (clazz.equals(context)) {
- return true;
- }
- }
- return false;
- }
-
- @Override
- <InputT, OutputT> void invokeProcessElement(
- DoFnWithContext<InputT, OutputT> fn,
- DoFnWithContext<InputT, OutputT>.ProcessContext c,
- ExtraContextFactory<InputT, OutputT> extra) {
- invoke(processElement, fn, c, extra, processElementArgs);
- }
-
- @Override
- <InputT, OutputT> void invokeStartBundle(
- DoFnWithContext<InputT, OutputT> fn,
- DoFnWithContext<InputT, OutputT>.Context c,
- ExtraContextFactory<InputT, OutputT> extra) {
- super.invokeStartBundle(fn, c, extra);
- if (startBundle != null) {
- invoke(startBundle, fn, c, extra, startBundleArgs);
- }
- }
-
- @Override
- <InputT, OutputT> void invokeFinishBundle(
- DoFnWithContext<InputT, OutputT> fn,
- DoFnWithContext<InputT, OutputT>.Context c,
- ExtraContextFactory<InputT, OutputT> extra) {
- if (finishBundle != null) {
- invoke(finishBundle, fn, c, extra, finishBundleArgs);
- }
- }
-
- private <InputT, OutputT> void invoke(Method m,
- DoFnWithContext<InputT, OutputT> on,
- DoFnWithContext<InputT, OutputT>.Context contextArg,
- ExtraContextFactory<InputT, OutputT> extraArgFactory,
- ExtraContextInfo[] extraArgs) {
-
- Class<?>[] parameterTypes = m.getParameterTypes();
- Object[] args = new Object[parameterTypes.length];
- args[0] = contextArg;
- for (int i = 1; i < args.length; i++) {
- args[i] = extraArgs[i - 1].createInstance(extraArgFactory);
- }
-
- try {
- m.invoke(on, args);
- } catch (InvocationTargetException e) {
- // Exception in user code.
- throw UserCodeException.wrap(e.getCause());
- } catch (IllegalAccessException | IllegalArgumentException e) {
- // Exception in our code.
- throw Throwables.propagate(e);
- }
- }
- }
-
- private static class ContextAdapter<InputT, OutputT>
- extends DoFnWithContext<InputT, OutputT>.Context
- implements DoFnWithContext.ExtraContextFactory<InputT, OutputT> {
-
- private DoFn<InputT, OutputT>.Context context;
-
- private ContextAdapter(
- DoFnWithContext<InputT, OutputT> fn, DoFn<InputT, OutputT>.Context context) {
- fn.super();
- this.context = context;
- }
-
- @Override
- public PipelineOptions getPipelineOptions() {
- return context.getPipelineOptions();
- }
-
- @Override
- public void output(OutputT output) {
- context.output(output);
- }
-
- @Override
- public void outputWithTimestamp(OutputT output, Instant timestamp) {
- context.outputWithTimestamp(output, timestamp);
- }
-
- @Override
- public <T> void sideOutput(TupleTag<T> tag, T output) {
- context.sideOutput(tag, output);
- }
-
- @Override
- public <T> void sideOutputWithTimestamp(TupleTag<T> tag, T output, Instant timestamp) {
- context.sideOutputWithTimestamp(tag, output, timestamp);
- }
-
- @Override
- public BoundedWindow window() {
- // The DoFnWithContext doesn't allow us to ask for these outside ProcessElements, so this
- // should be unreachable.
- throw new UnsupportedOperationException("Can only get the window in ProcessElements");
- }
-
- @Override
- public WindowingInternals<InputT, OutputT> windowingInternals() {
- // The DoFnWithContext doesn't allow us to ask for these outside ProcessElements, so this
- // should be unreachable.
- throw new UnsupportedOperationException(
- "Can only get the windowingInternals in ProcessElements");
- }
- }
-
- private static class ProcessContextAdapter<InputT, OutputT>
- extends DoFnWithContext<InputT, OutputT>.ProcessContext
- implements DoFnWithContext.ExtraContextFactory<InputT, OutputT> {
-
- private DoFn<InputT, OutputT>.ProcessContext context;
-
- private ProcessContextAdapter(
- DoFnWithContext<InputT, OutputT> fn,
- DoFn<InputT, OutputT>.ProcessContext context) {
- fn.super();
- this.context = context;
- }
-
- @Override
- public PipelineOptions getPipelineOptions() {
- return context.getPipelineOptions();
- }
-
- @Override
- public <T> T sideInput(PCollectionView<T> view) {
- return context.sideInput(view);
- }
-
- @Override
- public void output(OutputT output) {
- context.output(output);
- }
-
- @Override
- public void outputWithTimestamp(OutputT output, Instant timestamp) {
- context.outputWithTimestamp(output, timestamp);
- }
-
- @Override
- public <T> void sideOutput(TupleTag<T> tag, T output) {
- context.sideOutput(tag, output);
- }
-
- @Override
- public <T> void sideOutputWithTimestamp(TupleTag<T> tag, T output, Instant timestamp) {
- context.sideOutputWithTimestamp(tag, output, timestamp);
- }
-
- @Override
- public InputT element() {
- return context.element();
- }
-
- @Override
- public Instant timestamp() {
- return context.timestamp();
- }
-
- @Override
- public PaneInfo pane() {
- return context.pane();
- }
-
- @Override
- public BoundedWindow window() {
- return context.window();
- }
-
- @Override
- public WindowingInternals<InputT, OutputT> windowingInternals() {
- return context.windowingInternals();
- }
- }
-
- public static Class<?> getDoFnClass(DoFn<?, ?> fn) {
- if (fn instanceof SimpleDoFnAdapter) {
- return ((SimpleDoFnAdapter<?, ?>) fn).fn.getClass();
- } else {
- return fn.getClass();
- }
- }
-
- private static class SimpleDoFnAdapter<InputT, OutputT> extends DoFn<InputT, OutputT> {
-
- private transient DoFnReflector reflector;
- private DoFnWithContext<InputT, OutputT> fn;
-
- private SimpleDoFnAdapter(DoFnReflector reflector, DoFnWithContext<InputT, OutputT> fn) {
- super(fn.aggregators);
- this.reflector = reflector;
- this.fn = fn;
- }
-
- @Override
- public void startBundle(DoFn<InputT, OutputT>.Context c) throws Exception {
- ContextAdapter<InputT, OutputT> adapter = new ContextAdapter<>(fn, c);
- reflector.invokeStartBundle(fn, (DoFnWithContext<InputT, OutputT>.Context) adapter, adapter);
- }
-
- @Override
- public void finishBundle(DoFn<InputT, OutputT>.Context c) throws Exception {
- ContextAdapter<InputT, OutputT> adapter = new ContextAdapter<>(fn, c);
- reflector.invokeFinishBundle(fn, (DoFnWithContext<InputT, OutputT>.Context) adapter, adapter);
- }
-
- @Override
- public void processElement(DoFn<InputT, OutputT>.ProcessContext c) throws Exception {
- ProcessContextAdapter<InputT, OutputT> adapter = new ProcessContextAdapter<>(fn, c);
- reflector.invokeProcessElement(
- fn, (DoFnWithContext<InputT, OutputT>.ProcessContext) adapter, adapter);
- }
-
- @Override
- protected TypeDescriptor<InputT> getInputTypeDescriptor() {
- return fn.getInputTypeDescriptor();
- }
-
- @Override
- protected TypeDescriptor<OutputT> getOutputTypeDescriptor() {
- return fn.getOutputTypeDescriptor();
- }
-
- private void readObject(java.io.ObjectInputStream in)
- throws IOException, ClassNotFoundException {
- in.defaultReadObject();
- reflector = DoFnReflector.of(fn.getClass());
- }
- }
-
- private static class WindowDoFnAdapter<InputT, OutputT>
- extends SimpleDoFnAdapter<InputT, OutputT> implements DoFn.RequiresWindowAccess {
-
- private WindowDoFnAdapter(DoFnReflector reflector, DoFnWithContext<InputT, OutputT> fn) {
- super(reflector, fn);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/DoFnTester.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/DoFnTester.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/DoFnTester.java
deleted file mode 100644
index 5447664..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/DoFnTester.java
+++ /dev/null
@@ -1,495 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.util.DirectModeExecutionContext;
-import com.google.cloud.dataflow.sdk.util.DirectSideInputReader;
-import com.google.cloud.dataflow.sdk.util.DoFnRunner;
-import com.google.cloud.dataflow.sdk.util.DoFnRunnerBase;
-import com.google.cloud.dataflow.sdk.util.DoFnRunners;
-import com.google.cloud.dataflow.sdk.util.PTuple;
-import com.google.cloud.dataflow.sdk.util.SerializableUtils;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.cloud.dataflow.sdk.util.common.Counter;
-import com.google.cloud.dataflow.sdk.util.common.CounterSet;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.cloud.dataflow.sdk.values.TupleTagList;
-import com.google.common.base.Function;
-import com.google.common.base.Objects;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-
-import org.joda.time.Instant;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-/**
- * A harness for unit-testing a {@link DoFn}.
- *
- * <p>For example:
- *
- * <pre> {@code
- * DoFn<InputT, OutputT> fn = ...;
- *
- * DoFnTester<InputT, OutputT> fnTester = DoFnTester.of(fn);
- *
- * // Set arguments shared across all batches:
- * fnTester.setSideInputs(...); // If fn takes side inputs.
- * fnTester.setSideOutputTags(...); // If fn writes to side outputs.
- *
- * // Process a batch containing a single input element:
- * Input testInput = ...;
- * List<OutputT> testOutputs = fnTester.processBatch(testInput);
- * Assert.assertThat(testOutputs,
- * JUnitMatchers.hasItems(...));
- *
- * // Process a bigger batch:
- * Assert.assertThat(fnTester.processBatch(i1, i2, ...),
- * JUnitMatchers.hasItems(...));
- * } </pre>
- *
- * @param <InputT> the type of the {@code DoFn}'s (main) input elements
- * @param <OutputT> the type of the {@code DoFn}'s (main) output elements
- */
-public class DoFnTester<InputT, OutputT> {
- /**
- * Returns a {@code DoFnTester} supporting unit-testing of the given
- * {@link DoFn}.
- */
- @SuppressWarnings("unchecked")
- public static <InputT, OutputT> DoFnTester<InputT, OutputT> of(DoFn<InputT, OutputT> fn) {
- return new DoFnTester<InputT, OutputT>(fn);
- }
-
- /**
- * Returns a {@code DoFnTester} supporting unit-testing of the given
- * {@link DoFn}.
- */
- @SuppressWarnings("unchecked")
- public static <InputT, OutputT> DoFnTester<InputT, OutputT>
- of(DoFnWithContext<InputT, OutputT> fn) {
- return new DoFnTester<InputT, OutputT>(DoFnReflector.of(fn.getClass()).toDoFn(fn));
- }
-
- /**
- * Registers the tuple of values of the side input {@link PCollectionView}s to
- * pass to the {@link DoFn} under test.
- *
- * <p>If needed, first creates a fresh instance of the {@link DoFn}
- * under test.
- *
- * <p>If this isn't called, {@code DoFnTester} assumes the
- * {@link DoFn} takes no side inputs.
- */
- public void setSideInputs(Map<PCollectionView<?>, Iterable<WindowedValue<?>>> sideInputs) {
- this.sideInputs = sideInputs;
- resetState();
- }
-
- /**
- * Registers the values of a side input {@link PCollectionView} to
- * pass to the {@link DoFn} under test.
- *
- * <p>If needed, first creates a fresh instance of the {@code DoFn}
- * under test.
- *
- * <p>If this isn't called, {@code DoFnTester} assumes the
- * {@code DoFn} takes no side inputs.
- */
- public void setSideInput(PCollectionView<?> sideInput, Iterable<WindowedValue<?>> value) {
- sideInputs.put(sideInput, value);
- }
-
- /**
- * Registers the values for a side input {@link PCollectionView} to
- * pass to the {@link DoFn} under test. All values are placed
- * in the global window.
- */
- public void setSideInputInGlobalWindow(
- PCollectionView<?> sideInput,
- Iterable<?> value) {
- sideInputs.put(
- sideInput,
- Iterables.transform(value, new Function<Object, WindowedValue<?>>() {
- @Override
- public WindowedValue<?> apply(Object input) {
- return WindowedValue.valueInGlobalWindow(input);
- }
- }));
- }
-
-
- /**
- * Registers the list of {@code TupleTag}s that can be used by the
- * {@code DoFn} under test to output to side output
- * {@code PCollection}s.
- *
- * <p>If needed, first creates a fresh instance of the DoFn under test.
- *
- * <p>If this isn't called, {@code DoFnTester} assumes the
- * {@code DoFn} doesn't emit to any side outputs.
- */
- public void setSideOutputTags(TupleTagList sideOutputTags) {
- this.sideOutputTags = sideOutputTags.getAll();
- resetState();
- }
-
- /**
- * A convenience operation that first calls {@link #startBundle},
- * then calls {@link #processElement} on each of the input elements, then
- * calls {@link #finishBundle}, then returns the result of
- * {@link #takeOutputElements}.
- */
- public List<OutputT> processBatch(Iterable <? extends InputT> inputElements) {
- startBundle();
- for (InputT inputElement : inputElements) {
- processElement(inputElement);
- }
- finishBundle();
- return takeOutputElements();
- }
-
- /**
- * A convenience method for testing {@link DoFn DoFns} with bundles of elements.
- * Logic proceeds as follows:
- *
- * <ol>
- * <li>Calls {@link #startBundle}.</li>
- * <li>Calls {@link #processElement} on each of the arguments.<li>
- * <li>Calls {@link #finishBundle}.</li>
- * <li>Returns the result of {@link #takeOutputElements}.</li>
- * </ol>
- */
- @SafeVarargs
- public final List<OutputT> processBatch(InputT... inputElements) {
- return processBatch(Arrays.asList(inputElements));
- }
-
- /**
- * Calls {@link DoFn#startBundle} on the {@code DoFn} under test.
- *
- * <p>If needed, first creates a fresh instance of the DoFn under test.
- */
- public void startBundle() {
- resetState();
- initializeState();
- fnRunner.startBundle();
- state = State.STARTED;
- }
-
- /**
- * Calls {@link DoFn#processElement} on the {@code DoFn} under test, in a
- * context where {@link DoFn.ProcessContext#element} returns the
- * given element.
- *
- * <p>Will call {@link #startBundle} automatically, if it hasn't
- * already been called.
- *
- * @throws IllegalStateException if the {@code DoFn} under test has already
- * been finished
- */
- public void processElement(InputT element) {
- if (state == State.FINISHED) {
- throw new IllegalStateException("finishBundle() has already been called");
- }
- if (state == State.UNSTARTED) {
- startBundle();
- }
- fnRunner.processElement(WindowedValue.valueInGlobalWindow(element));
- }
-
- /**
- * Calls {@link DoFn#finishBundle} of the {@code DoFn} under test.
- *
- * <p>Will call {@link #startBundle} automatically, if it hasn't
- * already been called.
- *
- * @throws IllegalStateException if the {@code DoFn} under test has already
- * been finished
- */
- public void finishBundle() {
- if (state == State.FINISHED) {
- throw new IllegalStateException("finishBundle() has already been called");
- }
- if (state == State.UNSTARTED) {
- startBundle();
- }
- fnRunner.finishBundle();
- state = State.FINISHED;
- }
-
- /**
- * Returns the elements output so far to the main output. Does not
- * clear them, so subsequent calls will continue to include these
- * elements.
- *
- * @see #takeOutputElements
- * @see #clearOutputElements
- *
- */
- public List<OutputT> peekOutputElements() {
- // TODO: Should we return an unmodifiable list?
- return Lists.transform(
- peekOutputElementsWithTimestamp(),
- new Function<OutputElementWithTimestamp<OutputT>, OutputT>() {
- @Override
- @SuppressWarnings("unchecked")
- public OutputT apply(OutputElementWithTimestamp<OutputT> input) {
- return input.getValue();
- }
- });
- }
-
- /**
- * Returns the elements output so far to the main output with associated timestamps. Does not
- * clear them, so subsequent calls will continue to include these.
- * elements.
- *
- * @see #takeOutputElementsWithTimestamp
- * @see #clearOutputElements
- */
- @Experimental
- public List<OutputElementWithTimestamp<OutputT>> peekOutputElementsWithTimestamp() {
- // TODO: Should we return an unmodifiable list?
- return Lists.transform(
- outputManager.getOutput(mainOutputTag),
- new Function<Object, OutputElementWithTimestamp<OutputT>>() {
- @Override
- @SuppressWarnings("unchecked")
- public OutputElementWithTimestamp<OutputT> apply(Object input) {
- return new OutputElementWithTimestamp<OutputT>(
- ((WindowedValue<OutputT>) input).getValue(),
- ((WindowedValue<OutputT>) input).getTimestamp());
- }
- });
- }
-
- /**
- * Clears the record of the elements output so far to the main output.
- *
- * @see #peekOutputElements
- */
- public void clearOutputElements() {
- peekOutputElements().clear();
- }
-
- /**
- * Returns the elements output so far to the main output.
- * Clears the list so these elements don't appear in future calls.
- *
- * @see #peekOutputElements
- */
- public List<OutputT> takeOutputElements() {
- List<OutputT> resultElems = new ArrayList<>(peekOutputElements());
- clearOutputElements();
- return resultElems;
- }
-
- /**
- * Returns the elements output so far to the main output with associated timestamps.
- * Clears the list so these elements don't appear in future calls.
- *
- * @see #peekOutputElementsWithTimestamp
- * @see #takeOutputElements
- * @see #clearOutputElements
- */
- @Experimental
- public List<OutputElementWithTimestamp<OutputT>> takeOutputElementsWithTimestamp() {
- List<OutputElementWithTimestamp<OutputT>> resultElems =
- new ArrayList<>(peekOutputElementsWithTimestamp());
- clearOutputElements();
- return resultElems;
- }
-
- /**
- * Returns the elements output so far to the side output with the
- * given tag. Does not clear them, so subsequent calls will
- * continue to include these elements.
- *
- * @see #takeSideOutputElements
- * @see #clearSideOutputElements
- */
- public <T> List<T> peekSideOutputElements(TupleTag<T> tag) {
- // TODO: Should we return an unmodifiable list?
- return Lists.transform(
- outputManager.getOutput(tag),
- new Function<WindowedValue<T>, T>() {
- @SuppressWarnings("unchecked")
- @Override
- public T apply(WindowedValue<T> input) {
- return input.getValue();
- }});
- }
-
- /**
- * Clears the record of the elements output so far to the side
- * output with the given tag.
- *
- * @see #peekSideOutputElements
- */
- public <T> void clearSideOutputElements(TupleTag<T> tag) {
- peekSideOutputElements(tag).clear();
- }
-
- /**
- * Returns the elements output so far to the side output with the given tag.
- * Clears the list so these elements don't appear in future calls.
- *
- * @see #peekSideOutputElements
- */
- public <T> List<T> takeSideOutputElements(TupleTag<T> tag) {
- List<T> resultElems = new ArrayList<>(peekSideOutputElements(tag));
- clearSideOutputElements(tag);
- return resultElems;
- }
-
- /**
- * Returns the value of the provided {@link Aggregator}.
- */
- public <AggregateT> AggregateT getAggregatorValue(Aggregator<?, AggregateT> agg) {
- @SuppressWarnings("unchecked")
- Counter<AggregateT> counter =
- (Counter<AggregateT>)
- counterSet.getExistingCounter("user-" + STEP_NAME + "-" + agg.getName());
- return counter.getAggregate();
- }
-
- /**
- * Holder for an OutputElement along with its associated timestamp.
- */
- @Experimental
- public static class OutputElementWithTimestamp<OutputT> {
- private final OutputT value;
- private final Instant timestamp;
-
- OutputElementWithTimestamp(OutputT value, Instant timestamp) {
- this.value = value;
- this.timestamp = timestamp;
- }
-
- OutputT getValue() {
- return value;
- }
-
- Instant getTimestamp() {
- return timestamp;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (!(obj instanceof OutputElementWithTimestamp)) {
- return false;
- }
- OutputElementWithTimestamp<?> other = (OutputElementWithTimestamp<?>) obj;
- return Objects.equal(other.value, value) && Objects.equal(other.timestamp, timestamp);
- }
-
- @Override
- public int hashCode() {
- return Objects.hashCode(value, timestamp);
- }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /** The possible states of processing a DoFn. */
- enum State {
- UNSTARTED,
- STARTED,
- FINISHED
- }
-
- /** The name of the step of a DoFnTester. */
- static final String STEP_NAME = "stepName";
- /** The name of the enclosing DoFn PTransform for a DoFnTester. */
- static final String TRANSFORM_NAME = "transformName";
-
- final PipelineOptions options = PipelineOptionsFactory.create();
-
- /** The original DoFn under test. */
- final DoFn<InputT, OutputT> origFn;
-
- /** The side input values to provide to the DoFn under test. */
- private Map<PCollectionView<?>, Iterable<WindowedValue<?>>> sideInputs =
- new HashMap<>();
-
- /** The output tags used by the DoFn under test. */
- TupleTag<OutputT> mainOutputTag = new TupleTag<>();
- List<TupleTag<?>> sideOutputTags = new ArrayList<>();
-
- /** The original DoFn under test, if started. */
- DoFn<InputT, OutputT> fn;
-
- /** The ListOutputManager to examine the outputs. */
- DoFnRunnerBase.ListOutputManager outputManager;
-
- /** The DoFnRunner if processing is in progress. */
- DoFnRunner<InputT, OutputT> fnRunner;
-
- /** Counters for user-defined Aggregators if processing is in progress. */
- CounterSet counterSet;
-
- /** The state of processing of the DoFn under test. */
- State state;
-
- DoFnTester(DoFn<InputT, OutputT> origFn) {
- this.origFn = origFn;
- resetState();
- }
-
- void resetState() {
- fn = null;
- outputManager = null;
- fnRunner = null;
- counterSet = null;
- state = State.UNSTARTED;
- }
-
- @SuppressWarnings("unchecked")
- void initializeState() {
- fn = (DoFn<InputT, OutputT>)
- SerializableUtils.deserializeFromByteArray(
- SerializableUtils.serializeToByteArray(origFn),
- origFn.toString());
- counterSet = new CounterSet();
- PTuple runnerSideInputs = PTuple.empty();
- for (Map.Entry<PCollectionView<?>, Iterable<WindowedValue<?>>> entry
- : sideInputs.entrySet()) {
- runnerSideInputs = runnerSideInputs.and(entry.getKey().getTagInternal(), entry.getValue());
- }
- outputManager = new DoFnRunnerBase.ListOutputManager();
- fnRunner = DoFnRunners.createDefault(
- options,
- fn,
- DirectSideInputReader.of(runnerSideInputs),
- outputManager,
- mainOutputTag,
- sideOutputTags,
- DirectModeExecutionContext.create().getOrCreateStepContext(STEP_NAME, TRANSFORM_NAME, null),
- counterSet.getAddCounterMutator(),
- WindowingStrategy.globalDefault());
- }
-}
[02/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/test/java/com/google/cloud/dataflow/sdk/PipelineTest.java
----------------------------------------------------------------------
diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/PipelineTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/PipelineTest.java
deleted file mode 100644
index e311252..0000000
--- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/PipelineTest.java
+++ /dev/null
@@ -1,296 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk;
-
-import static org.hamcrest.Matchers.containsString;
-import static org.hamcrest.Matchers.instanceOf;
-import static org.hamcrest.Matchers.isA;
-import static org.hamcrest.Matchers.not;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.fail;
-
-import com.google.cloud.dataflow.sdk.Pipeline.PipelineExecutionException;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions;
-import com.google.cloud.dataflow.sdk.options.PipelineOptions.CheckEnabled;
-import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
-import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
-import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.ExpectedLogs;
-import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.Flatten;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.util.UserCodeException;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionList;
-import com.google.cloud.dataflow.sdk.values.PCollectionTuple;
-import com.google.cloud.dataflow.sdk.values.PInput;
-import com.google.cloud.dataflow.sdk.values.POutput;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.common.collect.ImmutableList;
-
-import org.junit.Assert;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.experimental.categories.Category;
-import org.junit.rules.ExpectedException;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-/**
- * Tests for Pipeline.
- */
-@RunWith(JUnit4.class)
-public class PipelineTest {
-
- @Rule public ExpectedLogs logged = ExpectedLogs.none(Pipeline.class);
- @Rule public ExpectedException thrown = ExpectedException.none();
-
- static class PipelineWrapper extends Pipeline {
- protected PipelineWrapper(PipelineRunner<?> runner) {
- super(runner, PipelineOptionsFactory.create());
- }
- }
-
- // Mock class that throws a user code exception during the call to
- // Pipeline.run().
- static class TestPipelineRunnerThrowingUserException
- extends PipelineRunner<PipelineResult> {
- @Override
- public PipelineResult run(Pipeline pipeline) {
- Throwable t = new IllegalStateException("user code exception");
- throw UserCodeException.wrap(t);
- }
- }
-
- // Mock class that throws an SDK or API client code exception during
- // the call to Pipeline.run().
- static class TestPipelineRunnerThrowingSDKException
- extends PipelineRunner<PipelineResult> {
- @Override
- public PipelineResult run(Pipeline pipeline) {
- throw new IllegalStateException("SDK exception");
- }
- }
-
- @Test
- public void testPipelineUserExceptionHandling() {
- Pipeline p = new PipelineWrapper(
- new TestPipelineRunnerThrowingUserException());
-
- // Check pipeline runner correctly catches user errors.
- thrown.expect(PipelineExecutionException.class);
- thrown.expectCause(isA(IllegalStateException.class));
- thrown.expectMessage("user code exception");
- p.run();
- }
-
- @Test
- public void testPipelineSDKExceptionHandling() {
- Pipeline p = new PipelineWrapper(new TestPipelineRunnerThrowingSDKException());
-
- // Check pipeline runner correctly catches SDK errors.
- try {
- p.run();
- fail("Should have thrown an exception.");
- } catch (RuntimeException exn) {
- // Make sure the exception isn't a UserCodeException.
- Assert.assertThat(exn, not(instanceOf(UserCodeException.class)));
- // Assert that the message is correct.
- Assert.assertThat(exn.getMessage(), containsString("SDK exception"));
- // RuntimeException should be IllegalStateException.
- Assert.assertThat(exn, instanceOf(IllegalStateException.class));
- }
- }
-
- @Test
- @Category(com.google.cloud.dataflow.sdk.testing.RunnableOnService.class)
- public void testMultipleApply() {
- PTransform<PCollection<? extends String>, PCollection<String>> myTransform =
- addSuffix("+");
-
- Pipeline p = TestPipeline.create();
- PCollection<String> input = p.apply(Create.<String>of(ImmutableList.of("a", "b")));
-
- PCollection<String> left = input.apply("Left1", myTransform).apply("Left2", myTransform);
- PCollection<String> right = input.apply("Right", myTransform);
-
- PCollection<String> both = PCollectionList.of(left).and(right)
- .apply(Flatten.<String>pCollections());
-
- DataflowAssert.that(both).containsInAnyOrder("a++", "b++", "a+", "b+");
-
- p.run();
- }
-
- private static PTransform<PCollection<? extends String>, PCollection<String>> addSuffix(
- final String suffix) {
- return ParDo.of(new DoFn<String, String>() {
- @Override
- public void processElement(DoFn<String, String>.ProcessContext c) {
- c.output(c.element() + suffix);
- }
- });
- }
-
- @Test
- public void testToString() {
- PipelineOptions options = PipelineOptionsFactory.as(PipelineOptions.class);
- options.setRunner(DirectPipelineRunner.class);
- Pipeline pipeline = Pipeline.create(options);
- assertEquals("Pipeline#" + pipeline.hashCode(), pipeline.toString());
- }
-
- @Test
- public void testStableUniqueNameOff() {
- Pipeline p = TestPipeline.create();
- p.getOptions().setStableUniqueNames(CheckEnabled.OFF);
-
- p.apply(Create.of(5, 6, 7));
- p.apply(Create.of(5, 6, 7));
-
- logged.verifyNotLogged("does not have a stable unique name.");
- }
-
- @Test
- public void testStableUniqueNameWarning() {
- Pipeline p = TestPipeline.create();
- p.getOptions().setStableUniqueNames(CheckEnabled.WARNING);
-
- p.apply(Create.of(5, 6, 7));
- p.apply(Create.of(5, 6, 7));
-
- logged.verifyWarn("does not have a stable unique name.");
- }
-
- @Test
- public void testStableUniqueNameError() {
- Pipeline p = TestPipeline.create();
- p.getOptions().setStableUniqueNames(CheckEnabled.ERROR);
-
- p.apply(Create.of(5, 6, 7));
-
- thrown.expectMessage("does not have a stable unique name.");
- p.apply(Create.of(5, 6, 7));
- }
-
- /**
- * Tests that Pipeline supports a pass-through identity function.
- */
- @Test
- @Category(RunnableOnService.class)
- public void testIdentityTransform() throws Exception {
- Pipeline pipeline = TestPipeline.create();
-
- PCollection<Integer> output = pipeline
- .apply(Create.<Integer>of(1, 2, 3, 4))
- .apply("IdentityTransform", new IdentityTransform<PCollection<Integer>>());
-
- DataflowAssert.that(output).containsInAnyOrder(1, 2, 3, 4);
- pipeline.run();
- }
-
- private static class IdentityTransform<T extends PInput & POutput>
- extends PTransform<T, T> {
- @Override
- public T apply(T input) {
- return input;
- }
- }
-
- /**
- * Tests that Pipeline supports pulling an element out of a tuple as a transform.
- */
- @Test
- @Category(RunnableOnService.class)
- public void testTupleProjectionTransform() throws Exception {
- Pipeline pipeline = TestPipeline.create();
-
- PCollection<Integer> input = pipeline
- .apply(Create.<Integer>of(1, 2, 3, 4));
-
- TupleTag<Integer> tag = new TupleTag<Integer>();
- PCollectionTuple tuple = PCollectionTuple.of(tag, input);
-
- PCollection<Integer> output = tuple
- .apply("ProjectTag", new TupleProjectionTransform<Integer>(tag));
-
- DataflowAssert.that(output).containsInAnyOrder(1, 2, 3, 4);
- pipeline.run();
- }
-
- private static class TupleProjectionTransform<T>
- extends PTransform<PCollectionTuple, PCollection<T>> {
- private TupleTag<T> tag;
-
- public TupleProjectionTransform(TupleTag<T> tag) {
- this.tag = tag;
- }
-
- @Override
- public PCollection<T> apply(PCollectionTuple input) {
- return input.get(tag);
- }
- }
-
- /**
- * Tests that Pipeline supports putting an element into a tuple as a transform.
- */
- @Test
- @Category(RunnableOnService.class)
- public void testTupleInjectionTransform() throws Exception {
- Pipeline pipeline = TestPipeline.create();
-
- PCollection<Integer> input = pipeline
- .apply(Create.<Integer>of(1, 2, 3, 4));
-
- TupleTag<Integer> tag = new TupleTag<Integer>();
-
- PCollectionTuple output = input
- .apply("ProjectTag", new TupleInjectionTransform<Integer>(tag));
-
- DataflowAssert.that(output.get(tag)).containsInAnyOrder(1, 2, 3, 4);
- pipeline.run();
- }
-
- private static class TupleInjectionTransform<T>
- extends PTransform<PCollection<T>, PCollectionTuple> {
- private TupleTag<T> tag;
-
- public TupleInjectionTransform(TupleTag<T> tag) {
- this.tag = tag;
- }
-
- @Override
- public PCollectionTuple apply(PCollection<T> input) {
- return PCollectionTuple.of(tag, input);
- }
- }
-
- /**
- * Tests that an empty pipeline runs.
- */
- @Test
- public void testEmptyPipeline() throws Exception {
- Pipeline pipeline = TestPipeline.create();
- pipeline.run();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/test/java/com/google/cloud/dataflow/sdk/TestUtils.java
----------------------------------------------------------------------
diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/TestUtils.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/TestUtils.java
deleted file mode 100644
index 257ecbb..0000000
--- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/TestUtils.java
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk;
-
-import static org.junit.Assert.assertThat;
-
-import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
-import com.google.cloud.dataflow.sdk.values.KV;
-
-import org.hamcrest.CoreMatchers;
-import org.hamcrest.Description;
-import org.hamcrest.Matcher;
-import org.hamcrest.TypeSafeMatcher;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-
-/**
- * Utilities for tests.
- */
-public class TestUtils {
- // Do not instantiate.
- private TestUtils() {}
-
- public static final String[] NO_LINES_ARRAY = new String[] { };
-
- public static final List<String> NO_LINES = Arrays.asList(NO_LINES_ARRAY);
-
- public static final String[] LINES_ARRAY = new String[] {
- "To be, or not to be: that is the question: ",
- "Whether 'tis nobler in the mind to suffer ",
- "The slings and arrows of outrageous fortune, ",
- "Or to take arms against a sea of troubles, ",
- "And by opposing end them? To die: to sleep; ",
- "No more; and by a sleep to say we end ",
- "The heart-ache and the thousand natural shocks ",
- "That flesh is heir to, 'tis a consummation ",
- "Devoutly to be wish'd. To die, to sleep; ",
- "To sleep: perchance to dream: ay, there's the rub; ",
- "For in that sleep of death what dreams may come ",
- "When we have shuffled off this mortal coil, ",
- "Must give us pause: there's the respect ",
- "That makes calamity of so long life; ",
- "For who would bear the whips and scorns of time, ",
- "The oppressor's wrong, the proud man's contumely, ",
- "The pangs of despised love, the law's delay, ",
- "The insolence of office and the spurns ",
- "That patient merit of the unworthy takes, ",
- "When he himself might his quietus make ",
- "With a bare bodkin? who would fardels bear, ",
- "To grunt and sweat under a weary life, ",
- "But that the dread of something after death, ",
- "The undiscover'd country from whose bourn ",
- "No traveller returns, puzzles the will ",
- "And makes us rather bear those ills we have ",
- "Than fly to others that we know not of? ",
- "Thus conscience does make cowards of us all; ",
- "And thus the native hue of resolution ",
- "Is sicklied o'er with the pale cast of thought, ",
- "And enterprises of great pith and moment ",
- "With this regard their currents turn awry, ",
- "And lose the name of action.--Soft you now! ",
- "The fair Ophelia! Nymph, in thy orisons ",
- "Be all my sins remember'd." };
-
- public static final List<String> LINES = Arrays.asList(LINES_ARRAY);
-
- public static final String[] LINES2_ARRAY = new String[] {
- "hi", "there", "bob!" };
-
- public static final List<String> LINES2 = Arrays.asList(LINES2_ARRAY);
-
- public static final Integer[] NO_INTS_ARRAY = new Integer[] { };
-
- public static final List<Integer> NO_INTS = Arrays.asList(NO_INTS_ARRAY);
-
- public static final Integer[] INTS_ARRAY = new Integer[] {
- 3, 42, Integer.MAX_VALUE, 0, -1, Integer.MIN_VALUE, 666 };
-
- public static final List<Integer> INTS = Arrays.asList(INTS_ARRAY);
-
- /**
- * Matcher for KVs.
- */
- public static class KvMatcher<K, V>
- extends TypeSafeMatcher<KV<? extends K, ? extends V>> {
- final Matcher<? super K> keyMatcher;
- final Matcher<? super V> valueMatcher;
-
- public static <K, V> KvMatcher<K, V> isKv(Matcher<K> keyMatcher,
- Matcher<V> valueMatcher) {
- return new KvMatcher<>(keyMatcher, valueMatcher);
- }
-
- public KvMatcher(Matcher<? super K> keyMatcher,
- Matcher<? super V> valueMatcher) {
- this.keyMatcher = keyMatcher;
- this.valueMatcher = valueMatcher;
- }
-
- @Override
- public boolean matchesSafely(KV<? extends K, ? extends V> kv) {
- return keyMatcher.matches(kv.getKey())
- && valueMatcher.matches(kv.getValue());
- }
-
- @Override
- public void describeTo(Description description) {
- description
- .appendText("a KV(").appendValue(keyMatcher)
- .appendText(", ").appendValue(valueMatcher)
- .appendText(")");
- }
- }
-
- ////////////////////////////////////////////////////////////////////////////
- // Utilities for testing CombineFns, ensuring they give correct results
- // across various permutations and shardings of the input.
-
- public static <InputT, AccumT, OutputT> void checkCombineFn(
- CombineFn<InputT, AccumT, OutputT> fn, List<InputT> input, final OutputT expected) {
- checkCombineFn(fn, input, CoreMatchers.is(expected));
- }
-
- public static <InputT, AccumT, OutputT> void checkCombineFn(
- CombineFn<InputT, AccumT, OutputT> fn, List<InputT> input, Matcher<? super OutputT> matcher) {
- checkCombineFnInternal(fn, input, matcher);
- Collections.shuffle(input);
- checkCombineFnInternal(fn, input, matcher);
- }
-
- private static <InputT, AccumT, OutputT> void checkCombineFnInternal(
- CombineFn<InputT, AccumT, OutputT> fn, List<InputT> input, Matcher<? super OutputT> matcher) {
- int size = input.size();
- checkCombineFnShards(fn, Collections.singletonList(input), matcher);
- checkCombineFnShards(fn, shardEvenly(input, 2), matcher);
- if (size > 4) {
- checkCombineFnShards(fn, shardEvenly(input, size / 2), matcher);
- checkCombineFnShards(
- fn, shardEvenly(input, (int) (size / Math.sqrt(size))), matcher);
- }
- checkCombineFnShards(fn, shardExponentially(input, 1.4), matcher);
- checkCombineFnShards(fn, shardExponentially(input, 2), matcher);
- checkCombineFnShards(fn, shardExponentially(input, Math.E), matcher);
- }
-
- public static <InputT, AccumT, OutputT> void checkCombineFnShards(
- CombineFn<InputT, AccumT, OutputT> fn,
- List<? extends Iterable<InputT>> shards,
- Matcher<? super OutputT> matcher) {
- checkCombineFnShardsInternal(fn, shards, matcher);
- Collections.shuffle(shards);
- checkCombineFnShardsInternal(fn, shards, matcher);
- }
-
- private static <InputT, AccumT, OutputT> void checkCombineFnShardsInternal(
- CombineFn<InputT, AccumT, OutputT> fn,
- Iterable<? extends Iterable<InputT>> shards,
- Matcher<? super OutputT> matcher) {
- List<AccumT> accumulators = new ArrayList<>();
- int maybeCompact = 0;
- for (Iterable<InputT> shard : shards) {
- AccumT accumulator = fn.createAccumulator();
- for (InputT elem : shard) {
- accumulator = fn.addInput(accumulator, elem);
- }
- if (maybeCompact++ % 2 == 0) {
- accumulator = fn.compact(accumulator);
- }
- accumulators.add(accumulator);
- }
- AccumT merged = fn.mergeAccumulators(accumulators);
- assertThat(fn.extractOutput(merged), matcher);
- }
-
- private static <T> List<List<T>> shardEvenly(List<T> input, int numShards) {
- List<List<T>> shards = new ArrayList<>(numShards);
- for (int i = 0; i < numShards; i++) {
- shards.add(input.subList(i * input.size() / numShards,
- (i + 1) * input.size() / numShards));
- }
- return shards;
- }
-
- private static <T> List<List<T>> shardExponentially(
- List<T> input, double base) {
- assert base > 1.0;
- List<List<T>> shards = new ArrayList<>();
- int end = input.size();
- while (end > 0) {
- int start = (int) (end / base);
- shards.add(input.subList(start, end));
- end = start;
- }
- return shards;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/test/java/com/google/cloud/dataflow/sdk/WindowMatchers.java
----------------------------------------------------------------------
diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/WindowMatchers.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/WindowMatchers.java
deleted file mode 100644
index 9d7cfc8..0000000
--- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/WindowMatchers.java
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk;
-
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.IntervalWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo;
-import com.google.cloud.dataflow.sdk.util.WindowedValue;
-
-import org.hamcrest.Description;
-import org.hamcrest.Matcher;
-import org.hamcrest.Matchers;
-import org.hamcrest.TypeSafeMatcher;
-import org.joda.time.Instant;
-
-import java.util.Collection;
-import java.util.Objects;
-
-/**
- * Matchers that are useful for working with Windowing, Timestamps, etc.
- */
-public class WindowMatchers {
-
- public static <T> Matcher<WindowedValue<? extends T>> isWindowedValue(
- Matcher<? super T> valueMatcher, Matcher<? super Instant> timestampMatcher,
- Matcher<? super Collection<? extends BoundedWindow>> windowsMatcher) {
- return new WindowedValueMatcher<>(valueMatcher, timestampMatcher, windowsMatcher);
- }
-
- public static <T> Matcher<WindowedValue<? extends T>> isWindowedValue(
- Matcher<? super T> valueMatcher, Matcher<? super Instant> timestampMatcher) {
- return new WindowedValueMatcher<>(valueMatcher, timestampMatcher, Matchers.anything());
- }
-
- public static <T> Matcher<WindowedValue<? extends T>> isSingleWindowedValue(
- T value, long timestamp, long windowStart, long windowEnd) {
- return WindowMatchers.<T>isSingleWindowedValue(
- Matchers.equalTo(value), timestamp, windowStart, windowEnd);
- }
-
- public static <T> Matcher<WindowedValue<? extends T>> isSingleWindowedValue(
- Matcher<T> valueMatcher, long timestamp, long windowStart, long windowEnd) {
- IntervalWindow intervalWindow =
- new IntervalWindow(new Instant(windowStart), new Instant(windowEnd));
- return WindowMatchers.<T>isSingleWindowedValue(
- valueMatcher,
- Matchers.describedAs("%0", Matchers.equalTo(new Instant(timestamp)), timestamp),
- Matchers.<BoundedWindow>equalTo(intervalWindow));
- }
-
- public static <T> Matcher<WindowedValue<? extends T>> isSingleWindowedValue(
- Matcher<? super T> valueMatcher, Matcher<? super Instant> timestampMatcher,
- Matcher<? super BoundedWindow> windowMatcher) {
- return new WindowedValueMatcher<T>(
- valueMatcher, timestampMatcher, Matchers.contains(windowMatcher));
- }
-
- public static Matcher<IntervalWindow> intervalWindow(long start, long end) {
- return Matchers.equalTo(new IntervalWindow(new Instant(start), new Instant(end)));
- }
-
- public static <T> Matcher<WindowedValue<? extends T>> valueWithPaneInfo(final PaneInfo paneInfo) {
- return new TypeSafeMatcher<WindowedValue<? extends T>>() {
- @Override
- public void describeTo(Description description) {
- description
- .appendText("WindowedValue(paneInfo = ").appendValue(paneInfo).appendText(")");
- }
-
- @Override
- protected boolean matchesSafely(WindowedValue<? extends T> item) {
- return Objects.equals(item.getPane(), paneInfo);
- }
-
- @Override
- protected void describeMismatchSafely(
- WindowedValue<? extends T> item, Description mismatchDescription) {
- mismatchDescription.appendValue(item.getPane());
- }
- };
- }
-
- @SuppressWarnings({"unchecked", "rawtypes"})
- @SafeVarargs
- public static final <W extends BoundedWindow> Matcher<Iterable<W>> ofWindows(
- Matcher<W>... windows) {
- return (Matcher) Matchers.<W>containsInAnyOrder(windows);
- }
-
- private WindowMatchers() {}
-
- private static class WindowedValueMatcher<T> extends TypeSafeMatcher<WindowedValue<? extends T>> {
-
- private Matcher<? super T> valueMatcher;
- private Matcher<? super Instant> timestampMatcher;
- private Matcher<? super Collection<? extends BoundedWindow>> windowsMatcher;
-
- private WindowedValueMatcher(
- Matcher<? super T> valueMatcher,
- Matcher<? super Instant> timestampMatcher,
- Matcher<? super Collection<? extends BoundedWindow>> windowsMatcher) {
- this.valueMatcher = valueMatcher;
- this.timestampMatcher = timestampMatcher;
- this.windowsMatcher = windowsMatcher;
- }
-
- @Override
- public void describeTo(Description description) {
- description
- .appendText("a WindowedValue(").appendValue(valueMatcher)
- .appendText(", ").appendValue(timestampMatcher)
- .appendText(", ").appendValue(windowsMatcher)
- .appendText(")");
- }
-
- @Override
- protected boolean matchesSafely(WindowedValue<? extends T> windowedValue) {
- return valueMatcher.matches(windowedValue.getValue())
- && timestampMatcher.matches(windowedValue.getTimestamp())
- && windowsMatcher.matches(windowedValue.getWindows());
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/AvroCoderTest.java
----------------------------------------------------------------------
diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/AvroCoderTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/AvroCoderTest.java
deleted file mode 100644
index db6e944..0000000
--- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/AvroCoderTest.java
+++ /dev/null
@@ -1,754 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import static org.hamcrest.Matchers.containsString;
-import static org.hamcrest.Matchers.equalTo;
-import static org.junit.Assert.assertThat;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.coders.Coder.Context;
-import com.google.cloud.dataflow.sdk.coders.Coder.NonDeterministicException;
-import com.google.cloud.dataflow.sdk.testing.CoderProperties;
-import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
-import com.google.cloud.dataflow.sdk.testing.TestPipeline;
-import com.google.cloud.dataflow.sdk.transforms.Create;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.util.CloudObject;
-import com.google.cloud.dataflow.sdk.util.SerializableUtils;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import org.apache.avro.AvroTypeException;
-import org.apache.avro.Schema;
-import org.apache.avro.SchemaBuilder;
-import org.apache.avro.generic.GenericData;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.avro.reflect.AvroName;
-import org.apache.avro.reflect.AvroSchema;
-import org.apache.avro.reflect.Nullable;
-import org.apache.avro.reflect.ReflectData;
-import org.apache.avro.reflect.Stringable;
-import org.apache.avro.reflect.Union;
-import org.apache.avro.specific.SpecificData;
-import org.apache.avro.util.Utf8;
-import org.hamcrest.Description;
-import org.hamcrest.Matcher;
-import org.hamcrest.Matchers;
-import org.hamcrest.TypeSafeMatcher;
-import org.junit.Assert;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Objects;
-import java.util.SortedMap;
-import java.util.SortedSet;
-import java.util.TreeMap;
-import java.util.TreeSet;
-
-/** Tests for {@link AvroCoder}. */
-@RunWith(JUnit4.class)
-public class AvroCoderTest {
-
- @DefaultCoder(AvroCoder.class)
- private static class Pojo {
- public String text;
- public int count;
-
- // Empty constructor required for Avro decoding.
- @SuppressWarnings("unused")
- public Pojo() {
- }
-
- public Pojo(String text, int count) {
- this.text = text;
- this.count = count;
- }
-
- // auto-generated
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- Pojo pojo = (Pojo) o;
-
- if (count != pojo.count) {
- return false;
- }
- if (text != null
- ? !text.equals(pojo.text)
- : pojo.text != null) {
- return false;
- }
-
- return true;
- }
-
- @Override
- public int hashCode() {
- return 0;
- }
-
- @Override
- public String toString() {
- return "Pojo{"
- + "text='" + text + '\''
- + ", count=" + count
- + '}';
- }
- }
-
- private static class GetTextFn extends DoFn<Pojo, String> {
- @Override
- public void processElement(ProcessContext c) {
- c.output(c.element().text);
- }
- }
-
- @Test
- public void testAvroCoderEncoding() throws Exception {
- AvroCoder<Pojo> coder = AvroCoder.of(Pojo.class);
- CloudObject encoding = coder.asCloudObject();
-
- Assert.assertThat(encoding.keySet(),
- Matchers.containsInAnyOrder("@type", "type", "schema", "encoding_id"));
- }
-
- @Test
- public void testPojoEncoding() throws Exception {
- Pojo value = new Pojo("Hello", 42);
- AvroCoder<Pojo> coder = AvroCoder.of(Pojo.class);
-
- CoderProperties.coderDecodeEncodeEqual(coder, value);
- }
-
- @Test
- public void testPojoEncodingId() throws Exception {
- AvroCoder<Pojo> coder = AvroCoder.of(Pojo.class);
- CoderProperties.coderHasEncodingId(coder, Pojo.class.getName());
- }
-
- @Test
- public void testGenericRecordEncoding() throws Exception {
- String schemaString =
- "{\"namespace\": \"example.avro\",\n"
- + " \"type\": \"record\",\n"
- + " \"name\": \"User\",\n"
- + " \"fields\": [\n"
- + " {\"name\": \"name\", \"type\": \"string\"},\n"
- + " {\"name\": \"favorite_number\", \"type\": [\"int\", \"null\"]},\n"
- + " {\"name\": \"favorite_color\", \"type\": [\"string\", \"null\"]}\n"
- + " ]\n"
- + "}";
- Schema schema = (new Schema.Parser()).parse(schemaString);
-
- GenericRecord before = new GenericData.Record(schema);
- before.put("name", "Bob");
- before.put("favorite_number", 256);
- // Leave favorite_color null
-
- AvroCoder<GenericRecord> coder = AvroCoder.of(GenericRecord.class, schema);
-
- CoderProperties.coderDecodeEncodeEqual(coder, before);
- Assert.assertEquals(schema, coder.getSchema());
- }
-
- @Test
- public void testEncodingNotBuffered() throws Exception {
- // This test ensures that the coder doesn't read ahead and buffer data.
- // Reading ahead causes a problem if the stream consists of records of different
- // types.
- Pojo before = new Pojo("Hello", 42);
-
- AvroCoder<Pojo> coder = AvroCoder.of(Pojo.class);
- SerializableCoder<Integer> intCoder = SerializableCoder.of(Integer.class);
-
- ByteArrayOutputStream outStream = new ByteArrayOutputStream();
-
- Context context = Context.NESTED;
- coder.encode(before, outStream, context);
- intCoder.encode(10, outStream, context);
-
- ByteArrayInputStream inStream = new ByteArrayInputStream(outStream.toByteArray());
-
- Pojo after = coder.decode(inStream, context);
- Assert.assertEquals(before, after);
-
- Integer intAfter = intCoder.decode(inStream, context);
- Assert.assertEquals(new Integer(10), intAfter);
- }
-
- @Test
- public void testDefaultCoder() throws Exception {
- Pipeline p = TestPipeline.create();
-
- // Use MyRecord as input and output types without explicitly specifying
- // a coder (this uses the default coders, which may not be AvroCoder).
- PCollection<String> output =
- p.apply(Create.of(new Pojo("hello", 1), new Pojo("world", 2)))
- .apply(ParDo.of(new GetTextFn()));
-
- DataflowAssert.that(output)
- .containsInAnyOrder("hello", "world");
- p.run();
- }
-
- @Test
- public void testAvroCoderIsSerializable() throws Exception {
- AvroCoder<Pojo> coder = AvroCoder.of(Pojo.class);
-
- // Check that the coder is serializable using the regular JSON approach.
- SerializableUtils.ensureSerializable(coder);
- }
-
- private final void assertDeterministic(AvroCoder<?> coder) {
- try {
- coder.verifyDeterministic();
- } catch (NonDeterministicException e) {
- fail("Expected " + coder + " to be deterministic, but got:\n" + e);
- }
- }
-
- private final void assertNonDeterministic(AvroCoder<?> coder,
- Matcher<String> reason1) {
- try {
- coder.verifyDeterministic();
- fail("Expected " + coder + " to be non-deterministic.");
- } catch (NonDeterministicException e) {
- assertThat(e.getReasons(), Matchers.<String>iterableWithSize(1));
- assertThat(e.getReasons(), Matchers.<String>contains(reason1));
- }
- }
-
- @Test
- public void testDeterministicInteger() {
- assertDeterministic(AvroCoder.of(Integer.class));
- }
-
- @Test
- public void testDeterministicInt() {
- assertDeterministic(AvroCoder.of(int.class));
- }
-
- private static class SimpleDeterministicClass {
- @SuppressWarnings("unused")
- private Integer intField;
- @SuppressWarnings("unused")
- private char charField;
- @SuppressWarnings("unused")
- private Integer[] intArray;
- @SuppressWarnings("unused")
- private Utf8 utf8field;
- }
-
- @Test
- public void testDeterministicSimple() {
- assertDeterministic(AvroCoder.of(SimpleDeterministicClass.class));
- }
-
- private static class UnorderedMapClass {
- @SuppressWarnings("unused")
- private Map<String, String> mapField;
- }
-
- private Matcher<String> reason(final String prefix, final String messagePart) {
- return new TypeSafeMatcher<String>(String.class) {
- @Override
- public void describeTo(Description description) {
- description.appendText(String.format("Reason starting with '%s:' containing '%s'",
- prefix, messagePart));
- }
-
- @Override
- protected boolean matchesSafely(String item) {
- return item.startsWith(prefix + ":") && item.contains(messagePart);
- }
- };
- }
-
- private Matcher<String> reasonClass(Class<?> clazz, String message) {
- return reason(clazz.getName(), message);
- }
-
- private Matcher<String> reasonField(
- Class<?> clazz, String field, String message) {
- return reason(clazz.getName() + "#" + field, message);
- }
-
- @Test
- public void testDeterministicUnorderedMap() {
- assertNonDeterministic(AvroCoder.of(UnorderedMapClass.class),
- reasonField(UnorderedMapClass.class, "mapField",
- "java.util.Map<java.lang.String, java.lang.String> "
- + "may not be deterministically ordered"));
- }
-
- private static class NonDeterministicArray {
- @SuppressWarnings("unused")
- private UnorderedMapClass[] arrayField;
- }
- @Test
- public void testDeterministicNonDeterministicArray() {
- assertNonDeterministic(AvroCoder.of(NonDeterministicArray.class),
- reasonField(UnorderedMapClass.class, "mapField",
- "java.util.Map<java.lang.String, java.lang.String>"
- + " may not be deterministically ordered"));
- }
-
- private static class SubclassOfUnorderedMapClass extends UnorderedMapClass {}
-
-
- @Test
- public void testDeterministicNonDeterministicChild() {
- // Super class has non deterministic fields.
- assertNonDeterministic(AvroCoder.of(SubclassOfUnorderedMapClass.class),
- reasonField(UnorderedMapClass.class, "mapField",
- "may not be deterministically ordered"));
- }
-
- private static class SubclassHidingParent extends UnorderedMapClass {
- @SuppressWarnings("unused")
- @AvroName("mapField2") // AvroName is not enough
- private int mapField;
- }
-
- @Test
- public void testAvroProhibitsShadowing() {
- // This test verifies that Avro won't serialize a class with two fields of
- // the same name. This is important for our error reporting, and also how
- // we lookup a field.
- try {
- ReflectData.get().getSchema(SubclassHidingParent.class);
- fail("Expected AvroTypeException");
- } catch (AvroTypeException e) {
- assertThat(e.getMessage(), containsString("mapField"));
- assertThat(e.getMessage(), containsString("two fields named"));
- }
- }
-
- private static class FieldWithAvroName {
- @AvroName("name")
- @SuppressWarnings("unused")
- private int someField;
- }
-
- @Test
- public void testDeterministicWithAvroName() {
- assertDeterministic(AvroCoder.of(FieldWithAvroName.class));
- }
-
- @Test
- public void testDeterminismSortedMap() {
- assertDeterministic(AvroCoder.of(StringSortedMapField.class));
- }
-
- private static class StringSortedMapField {
- @SuppressWarnings("unused")
- SortedMap<String, String> sortedMapField;
- }
-
- @Test
- public void testDeterminismTreeMapValue() {
- // The value is non-deterministic, so we should fail.
- assertNonDeterministic(AvroCoder.of(TreeMapNonDetValue.class),
- reasonField(UnorderedMapClass.class, "mapField",
- "java.util.Map<java.lang.String, java.lang.String> "
- + "may not be deterministically ordered"));
- }
-
- private static class TreeMapNonDetValue {
- @SuppressWarnings("unused")
- TreeMap<String, NonDeterministicArray> nonDeterministicField;
- }
-
- @Test
- public void testDeterminismUnorderedMap() {
- // LinkedHashMap is not deterministically ordered, so we should fail.
- assertNonDeterministic(AvroCoder.of(LinkedHashMapField.class),
- reasonField(LinkedHashMapField.class, "nonDeterministicMap",
- "java.util.LinkedHashMap<java.lang.String, java.lang.String> "
- + "may not be deterministically ordered"));
- }
-
- private static class LinkedHashMapField {
- @SuppressWarnings("unused")
- LinkedHashMap<String, String> nonDeterministicMap;
- }
-
- @Test
- public void testDeterminismCollection() {
- assertNonDeterministic(AvroCoder.of(StringCollection.class),
- reasonField(StringCollection.class, "stringCollection",
- "java.util.Collection<java.lang.String> may not be deterministically ordered"));
- }
-
- private static class StringCollection {
- @SuppressWarnings("unused")
- Collection<String> stringCollection;
- }
-
- @Test
- public void testDeterminismList() {
- assertDeterministic(AvroCoder.of(StringList.class));
- assertDeterministic(AvroCoder.of(StringArrayList.class));
- }
-
- private static class StringList {
- @SuppressWarnings("unused")
- List<String> stringCollection;
- }
-
- private static class StringArrayList {
- @SuppressWarnings("unused")
- ArrayList<String> stringCollection;
- }
-
- @Test
- public void testDeterminismSet() {
- assertDeterministic(AvroCoder.of(StringSortedSet.class));
- assertDeterministic(AvroCoder.of(StringTreeSet.class));
- assertNonDeterministic(AvroCoder.of(StringHashSet.class),
- reasonField(StringHashSet.class, "stringCollection",
- "java.util.HashSet<java.lang.String> may not be deterministically ordered"));
- }
-
- private static class StringSortedSet{
- @SuppressWarnings("unused")
- SortedSet<String> stringCollection;
- }
-
- private static class StringTreeSet {
- @SuppressWarnings("unused")
- TreeSet<String> stringCollection;
- }
-
- private static class StringHashSet {
- @SuppressWarnings("unused")
- HashSet<String> stringCollection;
- }
-
- @Test
- public void testDeterminismCollectionValue() {
- assertNonDeterministic(AvroCoder.of(OrderedSetOfNonDetValues.class),
- reasonField(UnorderedMapClass.class, "mapField",
- "may not be deterministically ordered"));
- assertNonDeterministic(AvroCoder.of(ListOfNonDetValues.class),
- reasonField(UnorderedMapClass.class, "mapField",
- "may not be deterministically ordered"));
- }
-
- private static class OrderedSetOfNonDetValues {
- @SuppressWarnings("unused")
- SortedSet<UnorderedMapClass> set;
- }
-
- private static class ListOfNonDetValues {
- @SuppressWarnings("unused")
- List<UnorderedMapClass> set;
- }
-
- @Test
- public void testDeterminismUnion() {
- assertDeterministic(AvroCoder.of(DeterministicUnionBase.class));
- assertNonDeterministic(AvroCoder.of(NonDeterministicUnionBase.class),
- reasonField(UnionCase3.class, "mapField", "may not be deterministically ordered"));
- }
-
- @Test
- public void testDeterminismStringable() {
- assertDeterministic(AvroCoder.of(String.class));
- assertNonDeterministic(AvroCoder.of(StringableClass.class),
- reasonClass(StringableClass.class, "may not have deterministic #toString()"));
- }
-
- @Stringable
- private static class StringableClass {
- }
-
- @Test
- public void testDeterminismCyclicClass() {
- assertNonDeterministic(AvroCoder.of(Cyclic.class),
- reasonField(Cyclic.class, "cyclicField", "appears recursively"));
- assertNonDeterministic(AvroCoder.of(CyclicField.class),
- reasonField(Cyclic.class, "cyclicField",
- Cyclic.class.getName() + " appears recursively"));
- assertNonDeterministic(AvroCoder.of(IndirectCycle1.class),
- reasonField(IndirectCycle2.class, "field2",
- IndirectCycle1.class.getName() + " appears recursively"));
- }
-
- private static class Cyclic {
- @SuppressWarnings("unused")
- int intField;
- @SuppressWarnings("unused")
- Cyclic cyclicField;
- }
-
- private static class CyclicField {
- @SuppressWarnings("unused")
- Cyclic cyclicField2;
- }
-
- private static class IndirectCycle1 {
- @SuppressWarnings("unused")
- IndirectCycle2 field1;
- }
-
- private static class IndirectCycle2 {
- @SuppressWarnings("unused")
- IndirectCycle1 field2;
- }
-
- @Test
- public void testDeterminismHasGenericRecord() {
- assertDeterministic(AvroCoder.of(HasGenericRecord.class));
- }
-
- private static class HasGenericRecord {
- @AvroSchema("{\"name\": \"bar\", \"type\": \"record\", \"fields\": ["
- + "{\"name\": \"foo\", \"type\": \"int\"}]}")
- GenericRecord genericRecord;
- }
-
- @Test
- public void testDeterminismHasCustomSchema() {
- assertNonDeterministic(AvroCoder.of(HasCustomSchema.class),
- reasonField(HasCustomSchema.class, "withCustomSchema",
- "Custom schemas are only supported for subtypes of IndexedRecord."));
- }
-
- private static class HasCustomSchema {
- @AvroSchema("{\"name\": \"bar\", \"type\": \"record\", \"fields\": ["
- + "{\"name\": \"foo\", \"type\": \"int\"}]}")
- int withCustomSchema;
- }
-
- @Test
- public void testAvroCoderTreeMapDeterminism()
- throws Exception, NonDeterministicException {
- TreeMapField size1 = new TreeMapField();
- TreeMapField size2 = new TreeMapField();
-
- // Different order for entries
- size1.field.put("hello", "world");
- size1.field.put("another", "entry");
-
- size2.field.put("another", "entry");
- size2.field.put("hello", "world");
-
- AvroCoder<TreeMapField> coder = AvroCoder.of(TreeMapField.class);
- coder.verifyDeterministic();
-
- ByteArrayOutputStream outStream1 = new ByteArrayOutputStream();
- ByteArrayOutputStream outStream2 = new ByteArrayOutputStream();
-
- Context context = Context.NESTED;
- coder.encode(size1, outStream1, context);
- coder.encode(size2, outStream2, context);
-
- assertTrue(Arrays.equals(
- outStream1.toByteArray(), outStream2.toByteArray()));
- }
-
- private static class TreeMapField {
- private TreeMap<String, String> field = new TreeMap<>();
- }
-
- @Union({ UnionCase1.class, UnionCase2.class })
- private abstract static class DeterministicUnionBase {}
-
- @Union({ UnionCase1.class, UnionCase2.class, UnionCase3.class })
- private abstract static class NonDeterministicUnionBase {}
- private static class UnionCase1 extends DeterministicUnionBase {}
- private static class UnionCase2 extends DeterministicUnionBase {
- @SuppressWarnings("unused")
- String field;
- }
-
- private static class UnionCase3 extends NonDeterministicUnionBase {
- @SuppressWarnings("unused")
- private Map<String, String> mapField;
- }
-
- @Test
- public void testAvroCoderSimpleSchemaDeterminism() {
- assertDeterministic(AvroCoder.of(SchemaBuilder.record("someRecord").fields()
- .endRecord()));
- assertDeterministic(AvroCoder.of(SchemaBuilder.record("someRecord").fields()
- .name("int").type().intType().noDefault()
- .endRecord()));
- assertDeterministic(AvroCoder.of(SchemaBuilder.record("someRecord").fields()
- .name("string").type().stringType().noDefault()
- .endRecord()));
-
- assertNonDeterministic(AvroCoder.of(SchemaBuilder.record("someRecord").fields()
- .name("map").type().map().values().stringType().noDefault()
- .endRecord()),
- reason("someRecord.map", "HashMap to represent MAPs"));
-
- assertDeterministic(AvroCoder.of(SchemaBuilder.record("someRecord").fields()
- .name("array").type().array().items().stringType().noDefault()
- .endRecord()));
-
- assertDeterministic(AvroCoder.of(SchemaBuilder.record("someRecord").fields()
- .name("enum").type().enumeration("anEnum").symbols("s1", "s2").enumDefault("s1")
- .endRecord()));
-
- assertDeterministic(AvroCoder.of(SchemaBuilder.unionOf()
- .intType().and()
- .record("someRecord").fields().nullableString("someField", "").endRecord()
- .endUnion()));
- }
-
- @Test
- public void testAvroCoderStrings() {
- // Custom Strings in Records
- assertDeterministic(AvroCoder.of(SchemaBuilder.record("someRecord").fields()
- .name("string").prop(SpecificData.CLASS_PROP, "java.lang.String")
- .type().stringType().noDefault()
- .endRecord()));
- assertNonDeterministic(AvroCoder.of(SchemaBuilder.record("someRecord").fields()
- .name("string").prop(SpecificData.CLASS_PROP, "unknownString")
- .type().stringType().noDefault()
- .endRecord()),
- reason("someRecord.string", "unknownString is not known to be deterministic"));
-
- // Custom Strings in Unions
- assertNonDeterministic(AvroCoder.of(SchemaBuilder.unionOf()
- .intType().and()
- .record("someRecord").fields()
- .name("someField").prop(SpecificData.CLASS_PROP, "unknownString")
- .type().stringType().noDefault().endRecord()
- .endUnion()),
- reason("someRecord.someField", "unknownString is not known to be deterministic"));
- }
-
- @Test
- public void testAvroCoderNestedRecords() {
- // Nested Record
- assertDeterministic(AvroCoder.of(SchemaBuilder.record("nestedRecord").fields()
- .name("subRecord").type().record("subRecord").fields()
- .name("innerField").type().stringType().noDefault()
- .endRecord().noDefault()
- .endRecord()));
- }
-
- @Test
- public void testAvroCoderCyclicRecords() {
- // Recursive record
- assertNonDeterministic(AvroCoder.of(SchemaBuilder.record("cyclicRecord").fields()
- .name("cycle").type("cyclicRecord").noDefault()
- .endRecord()),
- reason("cyclicRecord.cycle", "cyclicRecord appears recursively"));
- }
-
- private static class NullableField {
- @SuppressWarnings("unused")
- @Nullable private String nullable;
- }
-
- @Test
- public void testNullableField() {
- assertDeterministic(AvroCoder.of(NullableField.class));
- }
-
- private static class NullableNonDeterministicField {
- @SuppressWarnings("unused")
- @Nullable private NonDeterministicArray nullableNonDetArray;
- }
-
- private static class NullableCyclic {
- @SuppressWarnings("unused")
- @Nullable private NullableCyclic nullableNullableCyclicField;
- }
-
- private static class NullableCyclicField {
- @SuppressWarnings("unused")
- @Nullable private Cyclic nullableCyclicField;
- }
-
- @Test
- public void testNullableNonDeterministicField() {
- assertNonDeterministic(AvroCoder.of(NullableCyclic.class),
- reasonField(NullableCyclic.class, "nullableNullableCyclicField",
- NullableCyclic.class.getName() + " appears recursively"));
- assertNonDeterministic(AvroCoder.of(NullableCyclicField.class),
- reasonField(Cyclic.class, "cyclicField",
- Cyclic.class.getName() + " appears recursively"));
- assertNonDeterministic(AvroCoder.of(NullableNonDeterministicField.class),
- reasonField(UnorderedMapClass.class, "mapField",
- " may not be deterministically ordered"));
- }
-
- /**
- * Tests that a parameterized class can have an automatically generated schema if the generic
- * field is annotated with a union tag.
- */
- @Test
- public void testGenericClassWithUnionAnnotation() throws Exception {
- // Cast is safe as long as the same coder is used for encoding and decoding.
- @SuppressWarnings({"unchecked", "rawtypes"})
- AvroCoder<GenericWithAnnotation<String>> coder =
- (AvroCoder) AvroCoder.of(GenericWithAnnotation.class);
-
- assertThat(coder.getSchema().getField("onlySomeTypesAllowed").schema().getType(),
- equalTo(Schema.Type.UNION));
-
- CoderProperties.coderDecodeEncodeEqual(coder, new GenericWithAnnotation<>("hello"));
- }
-
- private static class GenericWithAnnotation<T> {
- @AvroSchema("[\"string\", \"int\"]")
- private T onlySomeTypesAllowed;
-
- public GenericWithAnnotation(T value) {
- onlySomeTypesAllowed = value;
- }
-
- // For deserialization only
- @SuppressWarnings("unused")
- protected GenericWithAnnotation() { }
-
- @Override
- public boolean equals(Object other) {
- return other instanceof GenericWithAnnotation
- && onlySomeTypesAllowed.equals(((GenericWithAnnotation<?>) other).onlySomeTypesAllowed);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(getClass(), onlySomeTypesAllowed);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/BigEndianIntegerCoderTest.java
----------------------------------------------------------------------
diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/BigEndianIntegerCoderTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/BigEndianIntegerCoderTest.java
deleted file mode 100644
index d96c208..0000000
--- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/BigEndianIntegerCoderTest.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.testing.CoderProperties;
-import com.google.cloud.dataflow.sdk.util.CoderUtils;
-
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.ExpectedException;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * Test case for {@link BigEndianIntegerCoder}.
- */
-@RunWith(JUnit4.class)
-public class BigEndianIntegerCoderTest {
-
- private static final Coder<Integer> TEST_CODER = BigEndianIntegerCoder.of();
-
- private static final List<Integer> TEST_VALUES = Arrays.asList(
- -11, -3, -1, 0, 1, 5, 13, 29,
- Integer.MAX_VALUE,
- Integer.MIN_VALUE);
-
- @Test
- public void testDecodeEncodeEqual() throws Exception {
- for (Integer value : TEST_VALUES) {
- CoderProperties.coderDecodeEncodeEqual(TEST_CODER, value);
- }
- }
-
- // This should never change. The definition of big endian encoding is fixed.
- private static final String EXPECTED_ENCODING_ID = "";
-
- @Test
- public void testEncodingId() throws Exception {
- CoderProperties.coderHasEncodingId(TEST_CODER, EXPECTED_ENCODING_ID);
- }
-
- /**
- * Generated data to check that the wire format has not changed. To regenerate, see
- * {@link com.google.cloud.dataflow.sdk.coders.PrintBase64Encodings}.
- */
- private static final List<String> TEST_ENCODINGS = Arrays.asList(
- "____9Q",
- "_____Q",
- "_____w",
- "AAAAAA",
- "AAAAAQ",
- "AAAABQ",
- "AAAADQ",
- "AAAAHQ",
- "f____w",
- "gAAAAA");
-
- @Test
- public void testWireFormatEncode() throws Exception {
- CoderProperties.coderEncodesBase64(TEST_CODER, TEST_VALUES, TEST_ENCODINGS);
- }
-
- @Rule
- public ExpectedException thrown = ExpectedException.none();
-
- @Test
- public void encodeNullThrowsCoderException() throws Exception {
- thrown.expect(CoderException.class);
- thrown.expectMessage("cannot encode a null Integer");
-
- CoderUtils.encodeToBase64(TEST_CODER, null);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/BigEndianLongCoderTest.java
----------------------------------------------------------------------
diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/BigEndianLongCoderTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/BigEndianLongCoderTest.java
deleted file mode 100644
index ea486c1..0000000
--- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/BigEndianLongCoderTest.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.testing.CoderProperties;
-import com.google.cloud.dataflow.sdk.util.CoderUtils;
-
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.ExpectedException;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * Test case for {@link BigEndianLongCoder}.
- */
-@RunWith(JUnit4.class)
-public class BigEndianLongCoderTest {
-
- private static final Coder<Long> TEST_CODER = BigEndianLongCoder.of();
-
- private static final List<Long> TEST_VALUES = Arrays.asList(
- -11L, -3L, -1L, 0L, 1L, 5L, 13L, 29L,
- Integer.MAX_VALUE + 131L,
- Integer.MIN_VALUE - 29L,
- Long.MAX_VALUE,
- Long.MIN_VALUE);
-
- @Test
- public void testDecodeEncodeEqual() throws Exception {
- for (Long value : TEST_VALUES) {
- CoderProperties.coderDecodeEncodeEqual(TEST_CODER, value);
- }
- }
-
- // This should never change. The definition of big endian is fixed.
- private static final String EXPECTED_ENCODING_ID = "";
-
- @Test
- public void testEncodingId() throws Exception {
- CoderProperties.coderHasEncodingId(TEST_CODER, EXPECTED_ENCODING_ID);
- }
-
- /**
- * Generated data to check that the wire format has not changed. To regenerate, see
- * {@link com.google.cloud.dataflow.sdk.coders.PrintBase64Encodings}.
- */
- private static final List<String> TEST_ENCODINGS = Arrays.asList(
- "__________U",
- "__________0",
- "__________8",
- "AAAAAAAAAAA",
- "AAAAAAAAAAE",
- "AAAAAAAAAAU",
- "AAAAAAAAAA0",
- "AAAAAAAAAB0",
- "AAAAAIAAAII",
- "_____3___-M",
- "f_________8",
- "gAAAAAAAAAA");
-
- @Test
- public void testWireFormatEncode() throws Exception {
- CoderProperties.coderEncodesBase64(TEST_CODER, TEST_VALUES, TEST_ENCODINGS);
- }
-
- @Rule
- public ExpectedException thrown = ExpectedException.none();
-
- @Test
- public void encodeNullThrowsCoderException() throws Exception {
- thrown.expect(CoderException.class);
- thrown.expectMessage("cannot encode a null Long");
-
- CoderUtils.encodeToBase64(TEST_CODER, null);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/ByteArrayCoderTest.java
----------------------------------------------------------------------
diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/ByteArrayCoderTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/ByteArrayCoderTest.java
deleted file mode 100644
index 989bc7f..0000000
--- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/ByteArrayCoderTest.java
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import static org.hamcrest.Matchers.equalTo;
-import static org.hamcrest.Matchers.not;
-import static org.junit.Assert.assertThat;
-
-import com.google.cloud.dataflow.sdk.testing.CoderProperties;
-import com.google.cloud.dataflow.sdk.util.CoderUtils;
-import com.google.cloud.dataflow.sdk.util.common.CounterTestUtils;
-
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.ExpectedException;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * Unit tests for {@link ByteArrayCoder}.
- */
-@RunWith(JUnit4.class)
-public class ByteArrayCoderTest {
-
- private static final ByteArrayCoder TEST_CODER = ByteArrayCoder.of();
-
- private static final List<byte[]> TEST_VALUES = Arrays.asList(
- new byte[]{0xa, 0xb, 0xc},
- new byte[]{0xd, 0x3},
- new byte[]{0xd, 0xe},
- new byte[]{});
-
- @Test
- public void testDecodeEncodeEquals() throws Exception {
- for (byte[] value : TEST_VALUES) {
- CoderProperties.coderDecodeEncodeEqual(TEST_CODER, value);
- }
- }
-
- @Test
- public void testRegisterByteSizeObserver() throws Exception {
- CounterTestUtils.testByteCount(ByteArrayCoder.of(), Coder.Context.OUTER,
- new byte[][]{{ 0xa, 0xb, 0xc }});
-
- CounterTestUtils.testByteCount(ByteArrayCoder.of(), Coder.Context.NESTED,
- new byte[][]{{ 0xa, 0xb, 0xc }, {}, {}, { 0xd, 0xe }, {}});
- }
-
- @Test
- public void testStructuralValueConsistentWithEquals() throws Exception {
- // We know that byte array coders are NOT compatible with equals
- // (aka injective w.r.t. Object.equals)
- for (byte[] value1 : TEST_VALUES) {
- for (byte[] value2 : TEST_VALUES) {
- CoderProperties.structuralValueConsistentWithEquals(TEST_CODER, value1, value2);
- }
- }
- }
-
- @Test
- public void testEncodeThenMutate() throws Exception {
- byte[] input = { 0x7, 0x3, 0xA, 0xf };
- byte[] encoded = CoderUtils.encodeToByteArray(TEST_CODER, input);
- input[1] = 0x9;
- byte[] decoded = CoderUtils.decodeFromByteArray(TEST_CODER, encoded);
-
- // now that I have mutated the input, the output should NOT match
- assertThat(input, not(equalTo(decoded)));
- }
-
- @Test
- public void testEncodeAndOwn() throws Exception {
- for (byte[] value : TEST_VALUES) {
- byte[] encodedSlow = CoderUtils.encodeToByteArray(TEST_CODER, value);
- byte[] encodedFast = encodeToByteArrayAndOwn(TEST_CODER, value);
- assertThat(encodedSlow, equalTo(encodedFast));
- }
- }
-
- private static byte[] encodeToByteArrayAndOwn(ByteArrayCoder coder, byte[] value)
- throws IOException {
- return encodeToByteArrayAndOwn(coder, value, Coder.Context.OUTER);
- }
-
- private static byte[] encodeToByteArrayAndOwn(
- ByteArrayCoder coder, byte[] value, Coder.Context context) throws IOException {
- ByteArrayOutputStream os = new ByteArrayOutputStream();
- coder.encodeAndOwn(value, os, context);
- return os.toByteArray();
- }
-
- // If this changes, it implies the binary format has changed.
- private static final String EXPECTED_ENCODING_ID = "";
-
- @Test
- public void testEncodingId() throws Exception {
- CoderProperties.coderHasEncodingId(TEST_CODER, EXPECTED_ENCODING_ID);
- }
-
- /**
- * Generated data to check that the wire format has not changed. To regenerate, see
- * {@link com.google.cloud.dataflow.sdk.coders.PrintBase64Encodings}.
- */
- private static final List<String> TEST_ENCODINGS = Arrays.asList(
- "CgsM",
- "DQM",
- "DQ4",
- "");
-
- @Test
- public void testWireFormatEncode() throws Exception {
- CoderProperties.coderEncodesBase64(TEST_CODER, TEST_VALUES, TEST_ENCODINGS);
- }
-
- @Rule
- public ExpectedException thrown = ExpectedException.none();
-
- @Test
- public void encodeNullThrowsCoderException() throws Exception {
- thrown.expect(CoderException.class);
- thrown.expectMessage("cannot encode a null byte[]");
-
- CoderUtils.encodeToBase64(TEST_CODER, null);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/ByteCoderTest.java
----------------------------------------------------------------------
diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/ByteCoderTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/ByteCoderTest.java
deleted file mode 100644
index 6cb852e..0000000
--- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/ByteCoderTest.java
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import com.google.cloud.dataflow.sdk.testing.CoderProperties;
-import com.google.cloud.dataflow.sdk.util.CoderUtils;
-
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.ExpectedException;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * Test case for {@link ByteCoder}.
- */
-@RunWith(JUnit4.class)
-public class ByteCoderTest {
-
- private static final Coder<Byte> TEST_CODER = ByteCoder.of();
-
- private static final List<Byte> TEST_VALUES = Arrays.asList(
- (byte) 1,
- (byte) 4,
- (byte) 6,
- (byte) 50,
- (byte) 124,
- Byte.MAX_VALUE,
- Byte.MIN_VALUE);
-
- @Test
- public void testDecodeEncodeEqual() throws Exception {
- for (Byte value : TEST_VALUES) {
- CoderProperties.coderDecodeEncodeEqual(TEST_CODER, value);
- }
- }
-
- // This should never change. The format is fixed by Java.
- private static final String EXPECTED_ENCODING_ID = "";
-
- @Test
- public void testEncodingId() throws Exception {
- CoderProperties.coderHasEncodingId(TEST_CODER, EXPECTED_ENCODING_ID);
- }
-
- /**
- * Generated data to check that the wire format has not changed. To regenerate, see
- * {@link com.google.cloud.dataflow.sdk.coders.PrintBase64Encodings}.
- */
- private static final List<String> TEST_ENCODINGS = Arrays.asList(
- "AQ",
- "BA",
- "Bg",
- "Mg",
- "fA",
- "fw",
- "gA");
-
- @Test
- public void testWireFormatEncode() throws Exception {
- CoderProperties.coderEncodesBase64(TEST_CODER, TEST_VALUES, TEST_ENCODINGS);
- }
-
- @Rule
- public ExpectedException thrown = ExpectedException.none();
-
- @Test
- public void encodeNullThrowsCoderException() throws Exception {
- thrown.expect(CoderException.class);
- thrown.expectMessage("cannot encode a null Byte");
-
- CoderUtils.encodeToBase64(TEST_CODER, null);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/ByteStringCoderTest.java
----------------------------------------------------------------------
diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/ByteStringCoderTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/ByteStringCoderTest.java
deleted file mode 100644
index debae71..0000000
--- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/ByteStringCoderTest.java
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import com.google.cloud.dataflow.sdk.coders.Coder.Context;
-import com.google.cloud.dataflow.sdk.testing.CoderProperties;
-import com.google.cloud.dataflow.sdk.util.CoderUtils;
-import com.google.common.collect.ImmutableList;
-import com.google.protobuf.ByteString;
-
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.ExpectedException;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * Test case for {@link ByteStringCoder}.
- */
-@RunWith(JUnit4.class)
-public class ByteStringCoderTest {
-
- private static final ByteStringCoder TEST_CODER = ByteStringCoder.of();
-
- private static final List<String> TEST_STRING_VALUES = Arrays.asList(
- "", "a", "13", "hello",
- "a longer string with spaces and all that",
- "a string with a \n newline",
- "???????????????");
- private static final ImmutableList<ByteString> TEST_VALUES;
- static {
- ImmutableList.Builder<ByteString> builder = ImmutableList.<ByteString>builder();
- for (String s : TEST_STRING_VALUES) {
- builder.add(ByteString.copyFrom(s.getBytes()));
- }
- TEST_VALUES = builder.build();
- }
-
- /**
- * Generated data to check that the wire format has not changed. To regenerate, see
- * {@link com.google.cloud.dataflow.sdk.coders.PrintBase64Encodings}.
- */
- private static final List<String> TEST_ENCODINGS = Arrays.asList(
- "",
- "YQ",
- "MTM",
- "aGVsbG8",
- "YSBsb25nZXIgc3RyaW5nIHdpdGggc3BhY2VzIGFuZCBhbGwgdGhhdA",
- "YSBzdHJpbmcgd2l0aCBhIAogbmV3bGluZQ",
- "Pz8_Pz8_Pz8_Pz8_Pz8_");
-
- @Rule
- public ExpectedException thrown = ExpectedException.none();
-
- @Test
- public void testDecodeEncodeEqualInAllContexts() throws Exception {
- for (ByteString value : TEST_VALUES) {
- CoderProperties.coderDecodeEncodeEqual(TEST_CODER, value);
- }
- }
-
- @Test
- public void testWireFormatEncode() throws Exception {
- CoderProperties.coderEncodesBase64(TEST_CODER, TEST_VALUES, TEST_ENCODINGS);
- }
-
- @Test
- public void testCoderDeterministic() throws Throwable {
- TEST_CODER.verifyDeterministic();
- }
-
- @Test
- public void testConsistentWithEquals() {
- assertTrue(TEST_CODER.consistentWithEquals());
- }
-
- @Test
- public void testEncodeNullThrowsCoderException() throws Exception {
- thrown.expect(CoderException.class);
- thrown.expectMessage("cannot encode a null ByteString");
-
- CoderUtils.encodeToBase64(TEST_CODER, null);
- }
-
- @Test
- public void testNestedCoding() throws Throwable {
- Coder<List<ByteString>> listCoder = ListCoder.of(TEST_CODER);
- CoderProperties.coderDecodeEncodeContentsEqual(listCoder, TEST_VALUES);
- CoderProperties.coderDecodeEncodeContentsInSameOrder(listCoder, TEST_VALUES);
- }
-
- @Test
- public void testEncodedElementByteSizeInAllContexts() throws Throwable {
- for (Context context : CoderProperties.ALL_CONTEXTS) {
- for (ByteString value : TEST_VALUES) {
- byte[] encoded = CoderUtils.encodeToByteArray(TEST_CODER, value, context);
- assertEquals(encoded.length, TEST_CODER.getEncodedElementByteSize(value, context));
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/CoderFactoriesTest.java
----------------------------------------------------------------------
diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/CoderFactoriesTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/CoderFactoriesTest.java
deleted file mode 100644
index 8d702bf..0000000
--- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/CoderFactoriesTest.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (C) 2014 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import static org.junit.Assert.assertEquals;
-
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.util.Arrays;
-import java.util.Collections;
-
-/**
- * Tests for {@link CoderFactories}.
- */
-@RunWith(JUnit4.class)
-public class CoderFactoriesTest {
-
- /**
- * Ensures that a few of our standard atomic coder classes
- * can each be built into a factory that works as expected.
- * It is presumed that testing a few, not all, suffices to
- * exercise CoderFactoryFromStaticMethods.
- */
- @Test
- public void testAtomicCoderClassFactories() {
- checkAtomicCoderFactory(StringUtf8Coder.class, StringUtf8Coder.of());
- checkAtomicCoderFactory(DoubleCoder.class, DoubleCoder.of());
- checkAtomicCoderFactory(ByteArrayCoder.class, ByteArrayCoder.of());
- }
-
- /**
- * Checks that {@link CoderFactories#fromStaticMethods} successfully
- * builds a working {@link CoderFactory} from {@link KvCoder KvCoder.class}.
- */
- @Test
- public void testKvCoderFactory() {
- CoderFactory kvCoderFactory = CoderFactories.fromStaticMethods(KvCoder.class);
- assertEquals(
- KvCoder.of(DoubleCoder.of(), DoubleCoder.of()),
- kvCoderFactory.create(Arrays.asList(DoubleCoder.of(), DoubleCoder.of())));
- }
-
- /**
- * Checks that {@link CoderFactories#fromStaticMethods} successfully
- * builds a working {@link CoderFactory} from {@link ListCoder ListCoder.class}.
- */
- @Test
- public void testListCoderFactory() {
- CoderFactory listCoderFactory = CoderFactories.fromStaticMethods(ListCoder.class);
-
- assertEquals(
- ListCoder.of(DoubleCoder.of()),
- listCoderFactory.create(Arrays.asList(DoubleCoder.of())));
- }
-
- /**
- * Checks that {@link CoderFactories#fromStaticMethods} successfully
- * builds a working {@link CoderFactory} from {@link IterableCoder IterableCoder.class}.
- */
- @Test
- public void testIterableCoderFactory() {
- CoderFactory iterableCoderFactory = CoderFactories.fromStaticMethods(IterableCoder.class);
-
- assertEquals(
- IterableCoder.of(DoubleCoder.of()),
- iterableCoderFactory.create(Arrays.asList(DoubleCoder.of())));
- }
-
- ///////////////////////////////////////////////////////////////////////
-
- /**
- * Checks that an atomic coder class can be converted into
- * a factory that then yields a coder equal to the example
- * provided.
- */
- private <T> void checkAtomicCoderFactory(
- Class<? extends Coder<T>> coderClazz,
- Coder<T> expectedCoder) {
- CoderFactory factory = CoderFactories.fromStaticMethods(coderClazz);
- @SuppressWarnings("unchecked")
- Coder<T> actualCoder = (Coder<T>) factory.create(Collections.<Coder<?>>emptyList());
- assertEquals(expectedCoder, actualCoder);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/CoderProvidersTest.java
----------------------------------------------------------------------
diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/CoderProvidersTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/CoderProvidersTest.java
deleted file mode 100644
index 1c0a89e..0000000
--- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/CoderProvidersTest.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (C) 2014 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.coders;
-
-import static org.hamcrest.Matchers.instanceOf;
-import static org.junit.Assert.assertThat;
-
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.ExpectedException;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
-import java.util.Map;
-
-/**
- * Tests for {@link CoderProviders}.
- */
-@RunWith(JUnit4.class)
-public class CoderProvidersTest {
-
- @Rule
- public ExpectedException thrown = ExpectedException.none();
-
- @Test
- public void testAvroThenSerializableStringMap() throws Exception {
- CoderProvider provider = CoderProviders.firstOf(AvroCoder.PROVIDER, SerializableCoder.PROVIDER);
- Coder<Map<String, String>> coder =
- provider.getCoder(new TypeDescriptor<Map<String, String>>(){});
- assertThat(coder, instanceOf(AvroCoder.class));
- }
-
- @Test
- public void testThrowingThenSerializable() throws Exception {
- CoderProvider provider =
- CoderProviders.firstOf(new ThrowingCoderProvider(), SerializableCoder.PROVIDER);
- Coder<Integer> coder = provider.getCoder(new TypeDescriptor<Integer>(){});
- assertThat(coder, instanceOf(SerializableCoder.class));
- }
-
- @Test
- public void testNullThrows() throws Exception {
- CoderProvider provider = CoderProviders.firstOf(new ThrowingCoderProvider());
- thrown.expect(CannotProvideCoderException.class);
- thrown.expectMessage("ThrowingCoderProvider");
- provider.getCoder(new TypeDescriptor<Integer>(){});
- }
-
- private static class ThrowingCoderProvider implements CoderProvider {
- @Override
- public <T> Coder<T> getCoder(TypeDescriptor<T> type) throws CannotProvideCoderException {
- throw new CannotProvideCoderException("ThrowingCoderProvider cannot ever provide a Coder");
- }
- }
-}
[38/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowPipelineWorkerPoolOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowPipelineWorkerPoolOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowPipelineWorkerPoolOptions.java
deleted file mode 100644
index dd3d83a..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowPipelineWorkerPoolOptions.java
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
-
-import com.fasterxml.jackson.annotation.JsonIgnore;
-
-import java.util.List;
-
-/**
- * Options that are used to configure the Dataflow pipeline worker pool.
- */
-@Description("Options that are used to configure the Dataflow pipeline worker pool.")
-public interface DataflowPipelineWorkerPoolOptions extends PipelineOptions {
- /**
- * Number of workers to use when executing the Dataflow job. Note that selection of an autoscaling
- * algorithm other than {@code NONE} will affect the size of the worker pool. If left unspecified,
- * the Dataflow service will determine the number of workers.
- */
- @Description("Number of workers to use when executing the Dataflow job. Note that "
- + "selection of an autoscaling algorithm other then \"NONE\" will affect the "
- + "size of the worker pool. If left unspecified, the Dataflow service will "
- + "determine the number of workers.")
- int getNumWorkers();
- void setNumWorkers(int value);
-
- /**
- * Type of autoscaling algorithm to use.
- */
- @Experimental(Experimental.Kind.AUTOSCALING)
- public enum AutoscalingAlgorithmType {
- /** Use numWorkers machines. Do not autoscale the worker pool. */
- NONE("AUTOSCALING_ALGORITHM_NONE"),
-
- @Deprecated
- BASIC("AUTOSCALING_ALGORITHM_BASIC"),
-
- /** Autoscale the workerpool based on throughput (up to maxNumWorkers). */
- THROUGHPUT_BASED("AUTOSCALING_ALGORITHM_BASIC");
-
- private final String algorithm;
-
- private AutoscalingAlgorithmType(String algorithm) {
- this.algorithm = algorithm;
- }
-
- /** Returns the string representation of this type. */
- public String getAlgorithm() {
- return this.algorithm;
- }
- }
-
- /**
- * [Experimental] The autoscaling algorithm to use for the workerpool.
- *
- * <ul>
- * <li>NONE: does not change the size of the worker pool.</li>
- * <li>BASIC: autoscale the worker pool size up to maxNumWorkers until the job completes.</li>
- * <li>THROUGHPUT_BASED: autoscale the workerpool based on throughput (up to maxNumWorkers).
- * </li>
- * </ul>
- */
- @Description("[Experimental] The autoscaling algorithm to use for the workerpool. "
- + "NONE: does not change the size of the worker pool. "
- + "BASIC (deprecated): autoscale the worker pool size up to maxNumWorkers until the job "
- + "completes. "
- + "THROUGHPUT_BASED: autoscale the workerpool based on throughput (up to maxNumWorkers).")
- @Experimental(Experimental.Kind.AUTOSCALING)
- AutoscalingAlgorithmType getAutoscalingAlgorithm();
- void setAutoscalingAlgorithm(AutoscalingAlgorithmType value);
-
- /**
- * The maximum number of workers to use for the workerpool. This option limits the size of the
- * workerpool for the lifetime of the job, including
- * <a href="https://cloud.google.com/dataflow/pipelines/updating-a-pipeline">pipeline updates</a>.
- * If left unspecified, the Dataflow service will compute a ceiling.
- */
- @Description("The maximum number of workers to use for the workerpool. This options limits the "
- + "size of the workerpool for the lifetime of the job, including pipeline updates. "
- + "If left unspecified, the Dataflow service will compute a ceiling.")
- int getMaxNumWorkers();
- void setMaxNumWorkers(int value);
-
- /**
- * Remote worker disk size, in gigabytes, or 0 to use the default size.
- */
- @Description("Remote worker disk size, in gigabytes, or 0 to use the default size.")
- int getDiskSizeGb();
- void setDiskSizeGb(int value);
-
- /**
- * Docker container image that executes Dataflow worker harness, residing in Google Container
- * Registry.
- */
- @Default.InstanceFactory(WorkerHarnessContainerImageFactory.class)
- @Description("Docker container image that executes Dataflow worker harness, residing in Google "
- + " Container Registry.")
- @Hidden
- String getWorkerHarnessContainerImage();
- void setWorkerHarnessContainerImage(String value);
-
- /**
- * Returns the default Docker container image that executes Dataflow worker harness, residing in
- * Google Container Registry.
- */
- public static class WorkerHarnessContainerImageFactory
- implements DefaultValueFactory<String> {
- @Override
- public String create(PipelineOptions options) {
- DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
- if (dataflowOptions.isStreaming()) {
- return DataflowPipelineRunner.STREAMING_WORKER_HARNESS_CONTAINER_IMAGE;
- } else {
- return DataflowPipelineRunner.BATCH_WORKER_HARNESS_CONTAINER_IMAGE;
- }
- }
- }
-
- /**
- * GCE <a href="https://cloud.google.com/compute/docs/networking">network</a> for launching
- * workers.
- *
- * <p>Default is up to the Dataflow service.
- */
- @Description("GCE network for launching workers. For more information, see the reference "
- + "documentation https://cloud.google.com/compute/docs/networking. "
- + "Default is up to the Dataflow service.")
- String getNetwork();
- void setNetwork(String value);
-
- /**
- * GCE <a href="https://cloud.google.com/compute/docs/networking">subnetwork</a> for launching
- * workers.
- *
- * <p>Default is up to the Dataflow service. Expected format is zones/ZONE/subnetworks/SUBNETWORK.
- */
- @Description("GCE subnetwork for launching workers. For more information, see the reference "
- + "documentation https://cloud.google.com/compute/docs/networking. "
- + "Default is up to the Dataflow service.")
- String getSubnetwork();
- void setSubnetwork(String value);
-
- /**
- * GCE <a href="https://developers.google.com/compute/docs/zones"
- * >availability zone</a> for launching workers.
- *
- * <p>Default is up to the Dataflow service.
- */
- @Description("GCE availability zone for launching workers. See "
- + "https://developers.google.com/compute/docs/zones for a list of valid options. "
- + "Default is up to the Dataflow service.")
- String getZone();
- void setZone(String value);
-
- /**
- * Machine type to create Dataflow worker VMs as.
- *
- * <p>See <a href="https://cloud.google.com/compute/docs/machine-types">GCE machine types</a>
- * for a list of valid options.
- *
- * <p>If unset, the Dataflow service will choose a reasonable default.
- */
- @Description("Machine type to create Dataflow worker VMs as. See "
- + "https://cloud.google.com/compute/docs/machine-types for a list of valid options. "
- + "If unset, the Dataflow service will choose a reasonable default.")
- String getWorkerMachineType();
- void setWorkerMachineType(String value);
-
- /**
- * The policy for tearing down the workers spun up by the service.
- */
- public enum TeardownPolicy {
- /**
- * All VMs created for a Dataflow job are deleted when the job finishes, regardless of whether
- * it fails or succeeds.
- */
- TEARDOWN_ALWAYS("TEARDOWN_ALWAYS"),
- /**
- * All VMs created for a Dataflow job are left running when the job finishes, regardless of
- * whether it fails or succeeds.
- */
- TEARDOWN_NEVER("TEARDOWN_NEVER"),
- /**
- * All VMs created for a Dataflow job are deleted when the job succeeds, but are left running
- * when it fails. (This is typically used for debugging failing jobs by SSHing into the
- * workers.)
- */
- TEARDOWN_ON_SUCCESS("TEARDOWN_ON_SUCCESS");
-
- private final String teardownPolicy;
-
- private TeardownPolicy(String teardownPolicy) {
- this.teardownPolicy = teardownPolicy;
- }
-
- public String getTeardownPolicyName() {
- return this.teardownPolicy;
- }
- }
-
- /**
- * The teardown policy for the VMs.
- *
- * <p>If unset, the Dataflow service will choose a reasonable default.
- */
- @Description("The teardown policy for the VMs. If unset, the Dataflow service will "
- + "choose a reasonable default.")
- TeardownPolicy getTeardownPolicy();
- void setTeardownPolicy(TeardownPolicy value);
-
- /**
- * List of local files to make available to workers.
- *
- * <p>Files are placed on the worker's classpath.
- *
- * <p>The default value is the list of jars from the main program's classpath.
- */
- @Description("Files to stage on GCS and make available to workers. "
- + "Files are placed on the worker's classpath. "
- + "The default value is all files from the classpath.")
- @JsonIgnore
- List<String> getFilesToStage();
- void setFilesToStage(List<String> value);
-
- /**
- * Specifies what type of persistent disk should be used. The value should be a full or partial
- * URL of a disk type resource, e.g., zones/us-central1-f/disks/pd-standard. For
- * more information, see the
- * <a href="https://cloud.google.com/compute/docs/reference/latest/diskTypes">API reference
- * documentation for DiskTypes</a>.
- */
- @Description("Specifies what type of persistent disk should be used. The value should be a full "
- + "or partial URL of a disk type resource, e.g., zones/us-central1-f/disks/pd-standard. For "
- + "more information, see the API reference documentation for DiskTypes: "
- + "https://cloud.google.com/compute/docs/reference/latest/diskTypes")
- String getWorkerDiskType();
- void setWorkerDiskType(String value);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowProfilingOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowProfilingOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowProfilingOptions.java
deleted file mode 100644
index 8ad2ba2..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowProfilingOptions.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.options;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-
-import java.util.HashMap;
-
-/**
- * Options for controlling profiling of pipeline execution.
- */
-@Description("[Experimental] Used to configure profiling of the Dataflow pipeline")
-@Experimental
-@Hidden
-public interface DataflowProfilingOptions {
-
- @Description("Whether to periodically dump profiling information to local disk.\n"
- + "WARNING: Enabling this option may fill local disk with profiling information.")
- boolean getEnableProfilingAgent();
- void setEnableProfilingAgent(boolean enabled);
-
- @Description(
- "[INTERNAL] Additional configuration for the profiling agent. Not typically necessary.")
- @Hidden
- DataflowProfilingAgentConfiguration getProfilingAgentConfiguration();
- void setProfilingAgentConfiguration(DataflowProfilingAgentConfiguration configuration);
-
- /**
- * Configuration for the profiling agent.
- */
- public static class DataflowProfilingAgentConfiguration extends HashMap<String, Object> {
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowWorkerHarnessOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowWorkerHarnessOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowWorkerHarnessOptions.java
deleted file mode 100644
index e4b1d72..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowWorkerHarnessOptions.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-/**
- * Options that are used exclusively within the Dataflow worker harness.
- * These options have no effect at pipeline creation time.
- */
-@Description("[Internal] Options that are used exclusively within the Dataflow worker harness. "
- + "These options have no effect at pipeline creation time.")
-@Hidden
-public interface DataflowWorkerHarnessOptions extends DataflowPipelineOptions {
- /**
- * The identity of the worker running this pipeline.
- */
- @Description("The identity of the worker running this pipeline.")
- String getWorkerId();
- void setWorkerId(String value);
-
- /**
- * The identity of the Dataflow job.
- */
- @Description("The identity of the Dataflow job.")
- String getJobId();
- void setJobId(String value);
-
- /**
- * The size of the worker's in-memory cache, in megabytes.
- *
- * <p>Currently, this cache is used for storing read values of side inputs.
- */
- @Description("The size of the worker's in-memory cache, in megabytes.")
- @Default.Integer(100)
- Integer getWorkerCacheMb();
- void setWorkerCacheMb(Integer value);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowWorkerLoggingOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowWorkerLoggingOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowWorkerLoggingOptions.java
deleted file mode 100644
index 2328873..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DataflowWorkerLoggingOptions.java
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.options;
-
-import com.google.common.base.Preconditions;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * Options that are used to control logging configuration on the Dataflow worker.
- */
-@Description("Options that are used to control logging configuration on the Dataflow worker.")
-public interface DataflowWorkerLoggingOptions extends PipelineOptions {
- /**
- * The set of log levels that can be used on the Dataflow worker.
- */
- public enum Level {
- DEBUG, ERROR, INFO, TRACE, WARN
- }
-
- /**
- * This option controls the default log level of all loggers without a log level override.
- */
- @Description("Controls the default log level of all loggers without a log level override.")
- @Default.Enum("INFO")
- Level getDefaultWorkerLogLevel();
- void setDefaultWorkerLogLevel(Level level);
-
- /**
- * This option controls the log levels for specifically named loggers.
- *
- * <p>Later options with equivalent names override earlier options.
- *
- * <p>See {@link WorkerLogLevelOverrides} for more information on how to configure logging
- * on a per {@link Class}, {@link Package}, or name basis. If used from the command line,
- * the expected format is {"Name":"Level",...}; for further details, see
- * {@link WorkerLogLevelOverrides#from}.
- */
- @Description("This option controls the log levels for specifically named loggers. "
- + "The expected format is {\"Name\":\"Level\",...}. The Dataflow worker uses "
- + "java.util.logging, which supports a logging hierarchy based off of names that are '.' "
- + "separated. For example, by specifying the value {\"a.b.c.Foo\":\"DEBUG\"}, the logger "
- + "for the class 'a.b.c.Foo' will be configured to output logs at the DEBUG level. "
- + "Similarly, by specifying the value {\"a.b.c\":\"WARN\"}, all loggers underneath the "
- + "'a.b.c' package will be configured to output logs at the WARN level. Also, note that "
- + "when multiple overrides are specified, the exact name followed by the closest parent "
- + "takes precedence.")
- WorkerLogLevelOverrides getWorkerLogLevelOverrides();
- void setWorkerLogLevelOverrides(WorkerLogLevelOverrides value);
-
- /**
- * Defines a log level override for a specific class, package, or name.
- *
- * <p>{@code java.util.logging} is used on the Dataflow worker harness and supports
- * a logging hierarchy based off of names that are "." separated. It is a common
- * pattern to have the logger for a given class share the same name as the class itself.
- * Given the classes {@code a.b.c.Foo}, {@code a.b.c.Xyz}, and {@code a.b.Bar}, with
- * loggers named {@code "a.b.c.Foo"}, {@code "a.b.c.Xyz"}, and {@code "a.b.Bar"} respectively,
- * we can override the log levels:
- * <ul>
- * <li>for {@code Foo} by specifying the name {@code "a.b.c.Foo"} or the {@link Class}
- * representing {@code a.b.c.Foo}.
- * <li>for {@code Foo}, {@code Xyz}, and {@code Bar} by specifying the name {@code "a.b"} or
- * the {@link Package} representing {@code a.b}.
- * <li>for {@code Foo} and {@code Bar} by specifying both of their names or classes.
- * </ul>
- * Note that by specifying multiple overrides, the exact name followed by the closest parent
- * takes precedence.
- */
- public static class WorkerLogLevelOverrides extends HashMap<String, Level> {
- /**
- * Overrides the default log level for the passed in class.
- *
- * <p>This is equivalent to calling
- * {@link #addOverrideForName(String, DataflowWorkerLoggingOptions.Level)}
- * and passing in the {@link Class#getName() class name}.
- */
- public WorkerLogLevelOverrides addOverrideForClass(Class<?> klass, Level level) {
- Preconditions.checkNotNull(klass, "Expected class to be not null.");
- addOverrideForName(klass.getName(), level);
- return this;
- }
-
- /**
- * Overrides the default log level for the passed in package.
- *
- * <p>This is equivalent to calling
- * {@link #addOverrideForName(String, DataflowWorkerLoggingOptions.Level)}
- * and passing in the {@link Package#getName() package name}.
- */
- public WorkerLogLevelOverrides addOverrideForPackage(Package pkg, Level level) {
- Preconditions.checkNotNull(pkg, "Expected package to be not null.");
- addOverrideForName(pkg.getName(), level);
- return this;
- }
-
- /**
- * Overrides the default log level for the passed in name.
- *
- * <p>Note that because of the hierarchical nature of logger names, this will
- * override the log level of all loggers that have the passed in name or
- * a parent logger that has the passed in name.
- */
- public WorkerLogLevelOverrides addOverrideForName(String name, Level level) {
- Preconditions.checkNotNull(name, "Expected name to be not null.");
- Preconditions.checkNotNull(level,
- "Expected level to be one of %s.", Arrays.toString(Level.values()));
- put(name, level);
- return this;
- }
-
- /**
- * Expects a map keyed by logger {@code Name}s with values representing {@code Level}s.
- * The {@code Name} generally represents the fully qualified Java
- * {@link Class#getName() class name}, or fully qualified Java
- * {@link Package#getName() package name}, or custom logger name. The {@code Level}
- * represents the log level and must be one of {@link Level}.
- */
- @JsonCreator
- public static WorkerLogLevelOverrides from(Map<String, String> values) {
- Preconditions.checkNotNull(values, "Expected values to be not null.");
- WorkerLogLevelOverrides overrides = new WorkerLogLevelOverrides();
- for (Map.Entry<String, String> entry : values.entrySet()) {
- try {
- overrides.addOverrideForName(entry.getKey(), Level.valueOf(entry.getValue()));
- } catch (IllegalArgumentException e) {
- throw new IllegalArgumentException(String.format(
- "Unsupported log level '%s' requested for %s. Must be one of %s.",
- entry.getValue(), entry.getKey(), Arrays.toString(Level.values())));
- }
-
- }
- return overrides;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/Default.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/Default.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/Default.java
deleted file mode 100644
index 46ff682..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/Default.java
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import java.lang.annotation.Documented;
-import java.lang.annotation.ElementType;
-import java.lang.annotation.Retention;
-import java.lang.annotation.RetentionPolicy;
-import java.lang.annotation.Target;
-
-/**
- * {@link Default} represents a set of annotations that can be used to annotate getter properties
- * on {@link PipelineOptions} with information representing the default value to be returned
- * if no value is specified.
- */
-public @interface Default {
- /**
- * This represents that the default of the option is the specified {@link java.lang.Class} value.
- */
- @Target(ElementType.METHOD)
- @Retention(RetentionPolicy.RUNTIME)
- @Documented
- public @interface Class {
- java.lang.Class<?> value();
- }
-
- /**
- * This represents that the default of the option is the specified {@link java.lang.String}
- * value.
- */
- @Target(ElementType.METHOD)
- @Retention(RetentionPolicy.RUNTIME)
- @Documented
- public @interface String {
- java.lang.String value();
- }
-
- /**
- * This represents that the default of the option is the specified boolean primitive value.
- */
- @Target(ElementType.METHOD)
- @Retention(RetentionPolicy.RUNTIME)
- @Documented
- public @interface Boolean {
- boolean value();
- }
-
- /**
- * This represents that the default of the option is the specified char primitive value.
- */
- @Target(ElementType.METHOD)
- @Retention(RetentionPolicy.RUNTIME)
- @Documented
- public @interface Character {
- char value();
- }
-
- /**
- * This represents that the default of the option is the specified byte primitive value.
- */
- @Target(ElementType.METHOD)
- @Retention(RetentionPolicy.RUNTIME)
- @Documented
- public @interface Byte {
- byte value();
- }
- /**
- * This represents that the default of the option is the specified short primitive value.
- */
- @Target(ElementType.METHOD)
- @Retention(RetentionPolicy.RUNTIME)
- @Documented
- public @interface Short {
- short value();
- }
- /**
- * This represents that the default of the option is the specified int primitive value.
- */
- @Target(ElementType.METHOD)
- @Retention(RetentionPolicy.RUNTIME)
- @Documented
- public @interface Integer {
- int value();
- }
-
- /**
- * This represents that the default of the option is the specified long primitive value.
- */
- @Target(ElementType.METHOD)
- @Retention(RetentionPolicy.RUNTIME)
- @Documented
- public @interface Long {
- long value();
- }
-
- /**
- * This represents that the default of the option is the specified float primitive value.
- */
- @Target(ElementType.METHOD)
- @Retention(RetentionPolicy.RUNTIME)
- @Documented
- public @interface Float {
- float value();
- }
-
- /**
- * This represents that the default of the option is the specified double primitive value.
- */
- @Target(ElementType.METHOD)
- @Retention(RetentionPolicy.RUNTIME)
- @Documented
- public @interface Double {
- double value();
- }
-
- /**
- * This represents that the default of the option is the specified enum.
- * The value should equal the enum's {@link java.lang.Enum#name() name}.
- */
- @Target(ElementType.METHOD)
- @Retention(RetentionPolicy.RUNTIME)
- @Documented
- public @interface Enum {
- java.lang.String value();
- }
-
- /**
- * Value must be of type {@link DefaultValueFactory} and have a default constructor.
- * Value is instantiated and then used as a factory to generate the default.
- *
- * <p>See {@link DefaultValueFactory} for more details.
- */
- @Target(ElementType.METHOD)
- @Retention(RetentionPolicy.RUNTIME)
- @Documented
- public @interface InstanceFactory {
- java.lang.Class<? extends DefaultValueFactory<?>> value();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DefaultValueFactory.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DefaultValueFactory.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DefaultValueFactory.java
deleted file mode 100644
index 1faedb7..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DefaultValueFactory.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-/**
- * An interface used with the {@link Default.InstanceFactory} annotation to specify the class that
- * will be an instance factory to produce default values for a given getter on
- * {@link PipelineOptions}. When a property on a {@link PipelineOptions} is fetched, and is
- * currently unset, the default value factory will be instantiated and invoked.
- *
- * <p>Care must be taken to not produce an infinite loop when accessing other fields on the
- * {@link PipelineOptions} object.
- *
- * @param <T> The type of object this factory produces.
- */
-public interface DefaultValueFactory<T> {
- /**
- * Creates a default value for a getter marked with {@link Default.InstanceFactory}.
- *
- * @param options The current pipeline options.
- * @return The default value to be used for the annotated getter.
- */
- T create(PipelineOptions options);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/Description.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/Description.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/Description.java
deleted file mode 100644
index 9ceaf58..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/Description.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import java.lang.annotation.ElementType;
-import java.lang.annotation.Retention;
-import java.lang.annotation.RetentionPolicy;
-import java.lang.annotation.Target;
-
-/**
- * Descriptions are used to generate human readable output when the {@code --help}
- * command is specified. Description annotations placed on interfaces that extend
- * {@link PipelineOptions} will describe groups of related options. Description annotations
- * placed on getter methods will be used to provide human readable information
- * for the specific option.
- */
-@Target({ElementType.METHOD, ElementType.TYPE})
-@Retention(RetentionPolicy.RUNTIME)
-public @interface Description {
- String value();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DirectPipelineOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DirectPipelineOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DirectPipelineOptions.java
deleted file mode 100644
index 0867740..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/DirectPipelineOptions.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.runners.DirectPipeline;
-import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-
-import com.fasterxml.jackson.annotation.JsonIgnore;
-
-/**
- * Options that can be used to configure the {@link DirectPipeline}.
- */
-public interface DirectPipelineOptions extends
- ApplicationNameOptions, BigQueryOptions, GcsOptions, GcpOptions,
- PipelineOptions, StreamingOptions {
-
- /**
- * The random seed to use for pseudorandom behaviors in the {@link DirectPipelineRunner}.
- * If not explicitly specified, a random seed will be generated.
- */
- @JsonIgnore
- @Description("The random seed to use for pseudorandom behaviors in the DirectPipelineRunner."
- + " If not explicitly specified, a random seed will be generated.")
- Long getDirectPipelineRunnerRandomSeed();
- void setDirectPipelineRunnerRandomSeed(Long value);
-
- /**
- * Controls whether the runner should ensure that all of the elements of
- * the pipeline, such as DoFns, can be serialized.
- */
- @JsonIgnore
- @Description("Controls whether the runner should ensure that all of the elements of the "
- + "pipeline, such as DoFns, can be serialized.")
- @Default.Boolean(true)
- boolean isTestSerializability();
- void setTestSerializability(boolean testSerializability);
-
- /**
- * Controls whether the runner should ensure that all of the elements of
- * every {@link PCollection} can be encoded using the appropriate
- * {@link Coder}.
- */
- @JsonIgnore
- @Description("Controls whether the runner should ensure that all of the elements of every "
- + "PCollection can be encoded using the appropriate Coder.")
- @Default.Boolean(true)
- boolean isTestEncodability();
- void setTestEncodability(boolean testEncodability);
-
- /**
- * Controls whether the runner should randomize the order of each
- * {@link PCollection}.
- */
- @JsonIgnore
- @Description("Controls whether the runner should randomize the order of each PCollection.")
- @Default.Boolean(true)
- boolean isTestUnorderedness();
- void setTestUnorderedness(boolean testUnorderedness);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/GcpOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/GcpOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/GcpOptions.java
deleted file mode 100644
index 7b70f4c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/GcpOptions.java
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import com.google.api.client.auth.oauth2.Credential;
-import com.google.api.client.googleapis.auth.oauth2.GoogleOAuthConstants;
-import com.google.cloud.dataflow.sdk.util.CredentialFactory;
-import com.google.cloud.dataflow.sdk.util.GcpCredentialFactory;
-import com.google.cloud.dataflow.sdk.util.InstanceBuilder;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.io.Files;
-
-import com.fasterxml.jackson.annotation.JsonIgnore;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.charset.StandardCharsets;
-import java.security.GeneralSecurityException;
-import java.util.Locale;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * Options used to configure Google Cloud Platform project and credentials.
- *
- * <p>These options configure which of the following three different mechanisms for obtaining a
- * credential are used:
- * <ol>
- * <li>
- * It can fetch the
- * <a href="https://developers.google.com/accounts/docs/application-default-credentials">
- * application default credentials</a>.
- * </li>
- * <li>
- * The user can specify a client secrets file and go through the OAuth2
- * webflow. The credential will then be cached in the user's home
- * directory for reuse.
- * </li>
- * <li>
- * The user can specify a file containing a service account private key along
- * with the service account name.
- * </li>
- * </ol>
- *
- * <p>The default mechanism is to use the
- * <a href="https://developers.google.com/accounts/docs/application-default-credentials">
- * application default credentials</a>. The other options can be
- * used by setting the corresponding properties.
- */
-@Description("Options used to configure Google Cloud Platform project and credentials.")
-public interface GcpOptions extends GoogleApiDebugOptions, PipelineOptions {
- /**
- * Project id to use when launching jobs.
- */
- @Description("Project id. Required when running a Dataflow in the cloud. "
- + "See https://cloud.google.com/storage/docs/projects for further details.")
- @Default.InstanceFactory(DefaultProjectFactory.class)
- String getProject();
- void setProject(String value);
-
- /**
- * This option controls which file to use when attempting to create the credentials using the
- * service account method.
- *
- * <p>This option, if specified, needs to be combined with the
- * {@link GcpOptions#getServiceAccountName() serviceAccountName}.
- */
- @JsonIgnore
- @Description("Controls which file to use when attempting to create the credentials "
- + "using the service account method. This option if specified, needs to be combined with "
- + "the serviceAccountName option.")
- String getServiceAccountKeyfile();
- void setServiceAccountKeyfile(String value);
-
- /**
- * This option controls which service account to use when attempting to create the credentials
- * using the service account method.
- *
- * <p>This option, if specified, needs to be combined with the
- * {@link GcpOptions#getServiceAccountKeyfile() serviceAccountKeyfile}.
- */
- @JsonIgnore
- @Description("Controls which service account to use when attempting to create the credentials "
- + "using the service account method. This option if specified, needs to be combined with "
- + "the serviceAccountKeyfile option.")
- String getServiceAccountName();
- void setServiceAccountName(String value);
-
- /**
- * This option controls which file to use when attempting to create the credentials
- * using the OAuth 2 webflow. After the OAuth2 webflow, the credentials will be stored
- * within credentialDir.
- */
- @JsonIgnore
- @Description("This option controls which file to use when attempting to create the credentials "
- + "using the OAuth 2 webflow. After the OAuth2 webflow, the credentials will be stored "
- + "within credentialDir.")
- String getSecretsFile();
- void setSecretsFile(String value);
-
- /**
- * This option controls which credential store to use when creating the credentials
- * using the OAuth 2 webflow.
- */
- @Description("This option controls which credential store to use when creating the credentials "
- + "using the OAuth 2 webflow.")
- @Default.String("cloud_dataflow")
- String getCredentialId();
- void setCredentialId(String value);
-
- /**
- * Directory for storing dataflow credentials after execution of the OAuth 2 webflow. Defaults
- * to using the $HOME/.store/data-flow directory.
- */
- @Description("Directory for storing dataflow credentials after execution of the OAuth 2 webflow. "
- + "Defaults to using the $HOME/.store/data-flow directory.")
- @Default.InstanceFactory(CredentialDirFactory.class)
- String getCredentialDir();
- void setCredentialDir(String value);
-
- /**
- * Returns the default credential directory of ${user.home}/.store/data-flow.
- */
- public static class CredentialDirFactory implements DefaultValueFactory<String> {
- @Override
- public String create(PipelineOptions options) {
- File home = new File(System.getProperty("user.home"));
- File store = new File(home, ".store");
- File dataflow = new File(store, "data-flow");
- return dataflow.getPath();
- }
- }
-
- /**
- * The class of the credential factory that should be created and used to create
- * credentials. If gcpCredential has not been set explicitly, an instance of this class will
- * be constructed and used as a credential factory.
- */
- @Description("The class of the credential factory that should be created and used to create "
- + "credentials. If gcpCredential has not been set explicitly, an instance of this class will "
- + "be constructed and used as a credential factory.")
- @Default.Class(GcpCredentialFactory.class)
- Class<? extends CredentialFactory> getCredentialFactoryClass();
- void setCredentialFactoryClass(
- Class<? extends CredentialFactory> credentialFactoryClass);
-
- /**
- * The credential instance that should be used to authenticate against GCP services.
- * If no credential has been set explicitly, the default is to use the instance factory
- * that constructs a credential based upon the currently set credentialFactoryClass.
- */
- @JsonIgnore
- @Description("The credential instance that should be used to authenticate against GCP services. "
- + "If no credential has been set explicitly, the default is to use the instance factory "
- + "that constructs a credential based upon the currently set credentialFactoryClass.")
- @Default.InstanceFactory(GcpUserCredentialsFactory.class)
- @Hidden
- Credential getGcpCredential();
- void setGcpCredential(Credential value);
-
- /**
- * Attempts to infer the default project based upon the environment this application
- * is executing within. Currently this only supports getting the default project from gcloud.
- */
- public static class DefaultProjectFactory implements DefaultValueFactory<String> {
- private static final Logger LOG = LoggerFactory.getLogger(DefaultProjectFactory.class);
-
- @Override
- public String create(PipelineOptions options) {
- try {
- File configFile;
- if (getEnvironment().containsKey("CLOUDSDK_CONFIG")) {
- configFile = new File(getEnvironment().get("CLOUDSDK_CONFIG"), "properties");
- } else if (isWindows() && getEnvironment().containsKey("APPDATA")) {
- configFile = new File(getEnvironment().get("APPDATA"), "gcloud/properties");
- } else {
- // New versions of gcloud use this file
- configFile = new File(
- System.getProperty("user.home"),
- ".config/gcloud/configurations/config_default");
- if (!configFile.exists()) {
- // Old versions of gcloud use this file
- configFile = new File(System.getProperty("user.home"), ".config/gcloud/properties");
- }
- }
- String section = null;
- Pattern projectPattern = Pattern.compile("^project\\s*=\\s*(.*)$");
- Pattern sectionPattern = Pattern.compile("^\\[(.*)\\]$");
- for (String line : Files.readLines(configFile, StandardCharsets.UTF_8)) {
- line = line.trim();
- if (line.isEmpty() || line.startsWith(";")) {
- continue;
- }
- Matcher matcher = sectionPattern.matcher(line);
- if (matcher.matches()) {
- section = matcher.group(1);
- } else if (section == null || section.equals("core")) {
- matcher = projectPattern.matcher(line);
- if (matcher.matches()) {
- String project = matcher.group(1).trim();
- LOG.info("Inferred default GCP project '{}' from gcloud. If this is the incorrect "
- + "project, please cancel this Pipeline and specify the command-line "
- + "argument --project.", project);
- return project;
- }
- }
- }
- } catch (IOException expected) {
- LOG.debug("Failed to find default project.", expected);
- }
- // return null if can't determine
- return null;
- }
-
- /**
- * Returns true if running on the Windows OS.
- */
- private static boolean isWindows() {
- return System.getProperty("os.name").toLowerCase(Locale.ENGLISH).contains("windows");
- }
-
- /**
- * Used to mock out getting environment variables.
- */
- @VisibleForTesting
- Map<String, String> getEnvironment() {
- return System.getenv();
- }
- }
-
- /**
- * Attempts to load the GCP credentials. See
- * {@link CredentialFactory#getCredential()} for more details.
- */
- public static class GcpUserCredentialsFactory implements DefaultValueFactory<Credential> {
- @Override
- public Credential create(PipelineOptions options) {
- GcpOptions gcpOptions = options.as(GcpOptions.class);
- try {
- CredentialFactory factory = InstanceBuilder.ofType(CredentialFactory.class)
- .fromClass(gcpOptions.getCredentialFactoryClass())
- .fromFactoryMethod("fromOptions")
- .withArg(PipelineOptions.class, options)
- .build();
- return factory.getCredential();
- } catch (IOException | GeneralSecurityException e) {
- throw new RuntimeException("Unable to obtain credential", e);
- }
- }
- }
-
- /**
- * The token server URL to use for OAuth 2 authentication. Normally, the default is sufficient,
- * but some specialized use cases may want to override this value.
- */
- @Description("The token server URL to use for OAuth 2 authentication. Normally, the default "
- + "is sufficient, but some specialized use cases may want to override this value.")
- @Default.String(GoogleOAuthConstants.TOKEN_SERVER_URL)
- @Hidden
- String getTokenServerUrl();
- void setTokenServerUrl(String value);
-
- /**
- * The authorization server URL to use for OAuth 2 authentication. Normally, the default is
- * sufficient, but some specialized use cases may want to override this value.
- */
- @Description("The authorization server URL to use for OAuth 2 authentication. Normally, the "
- + "default is sufficient, but some specialized use cases may want to override this value.")
- @Default.String(GoogleOAuthConstants.AUTHORIZATION_SERVER_URL)
- @Hidden
- String getAuthorizationServerEncodedUrl();
- void setAuthorizationServerEncodedUrl(String value);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/GcsOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/GcsOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/GcsOptions.java
deleted file mode 100644
index d221807..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/GcsOptions.java
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import com.google.cloud.dataflow.sdk.util.AppEngineEnvironment;
-import com.google.cloud.dataflow.sdk.util.GcsUtil;
-import com.google.cloud.hadoop.util.AbstractGoogleAsyncWriteChannel;
-import com.google.common.util.concurrent.MoreExecutors;
-import com.google.common.util.concurrent.ThreadFactoryBuilder;
-
-import com.fasterxml.jackson.annotation.JsonIgnore;
-
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.SynchronousQueue;
-import java.util.concurrent.ThreadPoolExecutor;
-import java.util.concurrent.TimeUnit;
-
-/**
- * Options used to configure Google Cloud Storage.
- */
-public interface GcsOptions extends
- ApplicationNameOptions, GcpOptions, PipelineOptions {
- /**
- * The GcsUtil instance that should be used to communicate with Google Cloud Storage.
- */
- @JsonIgnore
- @Description("The GcsUtil instance that should be used to communicate with Google Cloud Storage.")
- @Default.InstanceFactory(GcsUtil.GcsUtilFactory.class)
- @Hidden
- GcsUtil getGcsUtil();
- void setGcsUtil(GcsUtil value);
-
- /**
- * The ExecutorService instance to use to create threads, can be overridden to specify an
- * ExecutorService that is compatible with the users environment. If unset, the
- * default is to create an ExecutorService with an unbounded number of threads; this
- * is compatible with Google AppEngine.
- */
- @JsonIgnore
- @Description("The ExecutorService instance to use to create multiple threads. Can be overridden "
- + "to specify an ExecutorService that is compatible with the users environment. If unset, "
- + "the default is to create an ExecutorService with an unbounded number of threads; this "
- + "is compatible with Google AppEngine.")
- @Default.InstanceFactory(ExecutorServiceFactory.class)
- @Hidden
- ExecutorService getExecutorService();
- void setExecutorService(ExecutorService value);
-
- /**
- * GCS endpoint to use. If unspecified, uses the default endpoint.
- */
- @JsonIgnore
- @Hidden
- @Description("The URL for the GCS API.")
- String getGcsEndpoint();
- void setGcsEndpoint(String value);
-
- /**
- * The buffer size (in bytes) to use when uploading files to GCS. Please see the documentation for
- * {@link AbstractGoogleAsyncWriteChannel#setUploadBufferSize} for more information on the
- * restrictions and performance implications of this value.
- */
- @Description("The buffer size (in bytes) to use when uploading files to GCS. Please see the "
- + "documentation for AbstractGoogleAsyncWriteChannel.setUploadBufferSize for more "
- + "information on the restrictions and performance implications of this value.\n\n"
- + "https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/util/src/main/java/"
- + "com/google/cloud/hadoop/util/AbstractGoogleAsyncWriteChannel.java")
- Integer getGcsUploadBufferSizeBytes();
- void setGcsUploadBufferSizeBytes(Integer bytes);
-
- /**
- * Returns the default {@link ExecutorService} to use within the Dataflow SDK. The
- * {@link ExecutorService} is compatible with AppEngine.
- */
- public static class ExecutorServiceFactory implements DefaultValueFactory<ExecutorService> {
- @SuppressWarnings("deprecation") // IS_APP_ENGINE is deprecated for internal use only.
- @Override
- public ExecutorService create(PipelineOptions options) {
- ThreadFactoryBuilder threadFactoryBuilder = new ThreadFactoryBuilder();
- threadFactoryBuilder.setThreadFactory(MoreExecutors.platformThreadFactory());
- if (!AppEngineEnvironment.IS_APP_ENGINE) {
- // AppEngine doesn't allow modification of threads to be daemon threads.
- threadFactoryBuilder.setDaemon(true);
- }
- /* The SDK requires an unbounded thread pool because a step may create X writers
- * each requiring their own thread to perform the writes otherwise a writer may
- * block causing deadlock for the step because the writers buffer is full.
- * Also, the MapTaskExecutor launches the steps in reverse order and completes
- * them in forward order thus requiring enough threads so that each step's writers
- * can be active.
- */
- return new ThreadPoolExecutor(
- 0, Integer.MAX_VALUE, // Allow an unlimited number of re-usable threads.
- Long.MAX_VALUE, TimeUnit.NANOSECONDS, // Keep non-core threads alive forever.
- new SynchronousQueue<Runnable>(),
- threadFactoryBuilder.build());
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/GoogleApiDebugOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/GoogleApiDebugOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/GoogleApiDebugOptions.java
deleted file mode 100644
index eff679b..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/GoogleApiDebugOptions.java
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import com.google.api.client.googleapis.services.AbstractGoogleClient;
-import com.google.api.client.googleapis.services.AbstractGoogleClientRequest;
-import com.google.api.client.googleapis.services.GoogleClientRequestInitializer;
-
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * These options configure debug settings for Google API clients created within the Dataflow SDK.
- */
-public interface GoogleApiDebugOptions extends PipelineOptions {
- /**
- * This option enables tracing of API calls to Google services used within the
- * Dataflow SDK. Values are expected in JSON format <code>{"ApiName":"TraceDestination",...}
- * </code> where the {@code ApiName} represents the request classes canonical name. The
- * {@code TraceDestination} is a logical trace consumer to whom the trace will be reported.
- * Typically, "producer" is the right destination to use: this makes API traces available to the
- * team offering the API. Note that by enabling this option, the contents of the requests to and
- * from Google Cloud services will be made available to Google. For example, by specifying
- * <code>{"Dataflow":"producer"}</code>, all calls to the Dataflow service will be made available
- * to Google, specifically to the Google Cloud Dataflow team.
- */
- @Description("This option enables tracing of API calls to Google services used within the "
- + "Dataflow SDK. Values are expected in JSON format {\"ApiName\":\"TraceDestination\",...} "
- + "where the ApiName represents the request classes canonical name. The TraceDestination is "
- + "a logical trace consumer to whom the trace will be reported. Typically, \"producer\" is "
- + "the right destination to use: this makes API traces available to the team offering the "
- + "API. Note that by enabling this option, the contents of the requests to and from "
- + "Google Cloud services will be made available to Google. For example, by specifying "
- + "{\"Dataflow\":\"producer\"}, all calls to the Dataflow service will be made available to "
- + "Google, specifically to the Google Cloud Dataflow team.")
- GoogleApiTracer getGoogleApiTrace();
- void setGoogleApiTrace(GoogleApiTracer commands);
-
- /**
- * A {@link GoogleClientRequestInitializer} that adds the trace destination to Google API calls.
- */
- public static class GoogleApiTracer extends HashMap<String, String>
- implements GoogleClientRequestInitializer {
- /**
- * Creates a {@link GoogleApiTracer} that sets the trace destination on all
- * calls that match the given client type.
- */
- public GoogleApiTracer addTraceFor(AbstractGoogleClient client, String traceDestination) {
- put(client.getClass().getCanonicalName(), traceDestination);
- return this;
- }
-
- /**
- * Creates a {@link GoogleApiTracer} that sets the trace {@code traceDestination} on all
- * calls that match for the given request type.
- */
- public GoogleApiTracer addTraceFor(
- AbstractGoogleClientRequest<?> request, String traceDestination) {
- put(request.getClass().getCanonicalName(), traceDestination);
- return this;
- }
-
- @Override
- public void initialize(AbstractGoogleClientRequest<?> request) throws IOException {
- for (Map.Entry<String, String> entry : this.entrySet()) {
- if (request.getClass().getCanonicalName().contains(entry.getKey())) {
- request.set("$trace", entry.getValue());
- }
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/Hidden.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/Hidden.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/Hidden.java
deleted file mode 100644
index 6a487eb..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/Hidden.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import java.lang.annotation.Documented;
-import java.lang.annotation.ElementType;
-import java.lang.annotation.Retention;
-import java.lang.annotation.RetentionPolicy;
-import java.lang.annotation.Target;
-
-/**
- * Methods and/or interfaces annotated with {@code @Hidden} will be suppressed from
- * being output when {@code --help} is specified on the command-line.
- */
-@Target({ElementType.METHOD, ElementType.TYPE})
-@Retention(RetentionPolicy.RUNTIME)
-@Documented
-public @interface Hidden {
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/PipelineOptions.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/PipelineOptions.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/PipelineOptions.java
deleted file mode 100644
index 8ff1fa9..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/options/PipelineOptions.java
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.options;
-
-import com.google.auto.service.AutoService;
-import com.google.cloud.dataflow.sdk.Pipeline;
-import com.google.cloud.dataflow.sdk.options.GoogleApiDebugOptions.GoogleApiTracer;
-import com.google.cloud.dataflow.sdk.options.ProxyInvocationHandler.Deserializer;
-import com.google.cloud.dataflow.sdk.options.ProxyInvocationHandler.Serializer;
-import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
-import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.DoFn.Context;
-
-import com.fasterxml.jackson.annotation.JsonIgnore;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
-import com.fasterxml.jackson.databind.annotation.JsonSerialize;
-
-import java.lang.reflect.Proxy;
-import java.util.ServiceLoader;
-
-import javax.annotation.concurrent.ThreadSafe;
-
-/**
- * PipelineOptions are used to configure Pipelines. You can extend {@link PipelineOptions}
- * to create custom configuration options specific to your {@link Pipeline},
- * for both local execution and execution via a {@link PipelineRunner}.
- *
- * <p>{@link PipelineOptions} and their subinterfaces represent a collection of properties
- * which can be manipulated in a type safe manner. {@link PipelineOptions} is backed by a
- * dynamic {@link Proxy} which allows for type safe manipulation of properties in an extensible
- * fashion through plain old Java interfaces.
- *
- * <p>{@link PipelineOptions} can be created with {@link PipelineOptionsFactory#create()}
- * and {@link PipelineOptionsFactory#as(Class)}. They can be created
- * from command-line arguments with {@link PipelineOptionsFactory#fromArgs(String[])}.
- * They can be converted to another type by invoking {@link PipelineOptions#as(Class)} and
- * can be accessed from within a {@link DoFn} by invoking
- * {@link Context#getPipelineOptions()}.
- *
- * <p>For example:
- * <pre>{@code
- * // The most common way to construct PipelineOptions is via command-line argument parsing:
- * public static void main(String[] args) {
- * // Will parse the arguments passed into the application and construct a PipelineOptions
- * // Note that --help will print registered options, and --help=PipelineOptionsClassName
- * // will print out usage for the specific class.
- * PipelineOptions options =
- * PipelineOptionsFactory.fromArgs(args).create();
- *
- * Pipeline p = Pipeline.create(options);
- * ...
- * p.run();
- * }
- *
- * // To create options for the DirectPipeline:
- * DirectPipelineOptions directPipelineOptions =
- * PipelineOptionsFactory.as(DirectPipelineOptions.class);
- * directPipelineOptions.setStreaming(true);
- *
- * // To cast from one type to another using the as(Class) method:
- * DataflowPipelineOptions dataflowPipelineOptions =
- * directPipelineOptions.as(DataflowPipelineOptions.class);
- *
- * // Options for the same property are shared between types
- * // The statement below will print out "true"
- * System.out.println(dataflowPipelineOptions.isStreaming());
- *
- * // Prints out registered options.
- * PipelineOptionsFactory.printHelp(System.out);
- *
- * // Prints out options which are available to be set on DataflowPipelineOptions
- * PipelineOptionsFactory.printHelp(System.out, DataflowPipelineOptions.class);
- * }</pre>
- *
- * <h2>Defining Your Own PipelineOptions</h2>
- *
- * Defining your own {@link PipelineOptions} is the way for you to make configuration
- * options available for both local execution and execution via a {@link PipelineRunner}.
- * By having PipelineOptionsFactory as your command-line interpreter, you will provide
- * a standardized way for users to interact with your application via the command-line.
- *
- * <p>To define your own {@link PipelineOptions}, you create an interface which
- * extends {@link PipelineOptions} and define getter/setter pairs. These
- * getter/setter pairs define a collection of
- * <a href="https://docs.oracle.com/javase/tutorial/javabeans/writing/properties.html">
- * JavaBean properties</a>.
- *
- * <p>For example:
- * <pre>{@code
- * // Creates a user defined property called "myProperty"
- * public interface MyOptions extends PipelineOptions {
- * String getMyProperty();
- * void setMyProperty(String value);
- * }
- * }</pre>
- *
- * <p>Note: Please see the section on Registration below when using custom property types.
- *
- * <h3>Restrictions</h3>
- *
- * Since PipelineOptions can be "cast" to multiple types dynamically using
- * {@link PipelineOptions#as(Class)}, a property must conform to the following set of restrictions:
- * <ul>
- * <li>Any property with the same name must have the same return type for all derived
- * interfaces of {@link PipelineOptions}.
- * <li>Every bean property of any interface derived from {@link PipelineOptions} must have a
- * getter and setter method.
- * <li>Every method must conform to being a getter or setter for a JavaBean.
- * <li>The derived interface of {@link PipelineOptions} must be composable with every interface
- * part registered with the PipelineOptionsFactory.
- * <li>Only getters may be annotated with {@link JsonIgnore @JsonIgnore}.
- * <li>If any getter is annotated with {@link JsonIgnore @JsonIgnore}, then all getters for
- * this property must be annotated with {@link JsonIgnore @JsonIgnore}.
- * </ul>
- *
- * <h3>Annotations For PipelineOptions</h3>
- *
- * {@link Description @Description} can be used to annotate an interface or a getter
- * with useful information which is output when {@code --help}
- * is invoked via {@link PipelineOptionsFactory#fromArgs(String[])}.
- *
- * <p>{@link Default @Default} represents a set of annotations that can be used to annotate getter
- * properties on {@link PipelineOptions} with information representing the default value to be
- * returned if no value is specified. Any default implementation (using the {@code default} keyword)
- * is ignored.
- *
- * <p>{@link Hidden @Hidden} hides an option from being listed when {@code --help}
- * is invoked via {@link PipelineOptionsFactory#fromArgs(String[])}.
- *
- * <p>{@link Validation @Validation} represents a set of annotations that can be used to annotate
- * getter properties on {@link PipelineOptions} with information representing the validation
- * criteria to be used when validating with the {@link PipelineOptionsValidator}. Validation
- * will be performed if during construction of the {@link PipelineOptions},
- * {@link PipelineOptionsFactory#withValidation()} is invoked.
- *
- * <p>{@link JsonIgnore @JsonIgnore} is used to prevent a property from being serialized and
- * available during execution of {@link DoFn}. See the Serialization section below for more
- * details.
- *
- * <h2>Registration Of PipelineOptions</h2>
- *
- * Registration of {@link PipelineOptions} by an application guarantees that the
- * {@link PipelineOptions} is composable during execution of their {@link Pipeline} and
- * meets the restrictions listed above or will fail during registration. Registration
- * also lists the registered {@link PipelineOptions} when {@code --help}
- * is invoked via {@link PipelineOptionsFactory#fromArgs(String[])}.
- *
- * <p>Registration can be performed by invoking {@link PipelineOptionsFactory#register} within
- * a users application or via automatic registration by creating a {@link ServiceLoader} entry
- * and a concrete implementation of the {@link PipelineOptionsRegistrar} interface.
- *
- * <p>It is optional but recommended to use one of the many build time tools such as
- * {@link AutoService} to generate the necessary META-INF files automatically.
- *
- * <p>A list of registered options can be fetched from
- * {@link PipelineOptionsFactory#getRegisteredOptions()}.
- *
- * <h2>Serialization Of PipelineOptions</h2>
- *
- * {@link PipelineRunner}s require support for options to be serialized. Each property
- * within {@link PipelineOptions} must be able to be serialized using Jackson's
- * {@link ObjectMapper} or the getter method for the property annotated with
- * {@link JsonIgnore @JsonIgnore}.
- *
- * <p>Jackson supports serialization of many types and supports a useful set of
- * <a href="https://github.com/FasterXML/jackson-annotations">annotations</a> to aid in
- * serialization of custom types. We point you to the public
- * <a href="https://github.com/FasterXML/jackson">Jackson documentation</a> when attempting
- * to add serialization support for your custom types. See {@link GoogleApiTracer} for an
- * example using the Jackson annotations to serialize and deserialize a custom type.
- *
- * <p>Note: It is an error to have the same property available in multiple interfaces with only
- * some of them being annotated with {@link JsonIgnore @JsonIgnore}. It is also an error to mark a
- * setter for a property with {@link JsonIgnore @JsonIgnore}.
- */
-@JsonSerialize(using = Serializer.class)
-@JsonDeserialize(using = Deserializer.class)
-@ThreadSafe
-public interface PipelineOptions {
- /**
- * Transforms this object into an object of type {@code <T>} saving each property
- * that has been manipulated. {@code <T>} must extend {@link PipelineOptions}.
- *
- * <p>If {@code <T>} is not registered with the {@link PipelineOptionsFactory}, then we
- * attempt to verify that {@code <T>} is composable with every interface that this
- * instance of the {@code PipelineOptions} has seen.
- *
- * @param kls The class of the type to transform to.
- * @return An object of type kls.
- */
- <T extends PipelineOptions> T as(Class<T> kls);
-
- /**
- * Makes a deep clone of this object, and transforms the cloned object into the specified
- * type {@code kls}. See {@link #as} for more information about the conversion.
- *
- * <p>Properties that are marked with {@code @JsonIgnore} will not be cloned.
- */
- <T extends PipelineOptions> T cloneAs(Class<T> kls);
-
- /**
- * The pipeline runner that will be used to execute the pipeline.
- * For registered runners, the class name can be specified, otherwise the fully
- * qualified name needs to be specified.
- */
- @Validation.Required
- @Description("The pipeline runner that will be used to execute the pipeline. "
- + "For registered runners, the class name can be specified, otherwise the fully "
- + "qualified name needs to be specified.")
- @Default.Class(DirectPipelineRunner.class)
- Class<? extends PipelineRunner<?>> getRunner();
- void setRunner(Class<? extends PipelineRunner<?>> kls);
-
- /**
- * Enumeration of the possible states for a given check.
- */
- public static enum CheckEnabled {
- OFF,
- WARNING,
- ERROR;
- }
-
- /**
- * Whether to check for stable unique names on each transform. This is necessary to
- * support updating of pipelines.
- */
- @Validation.Required
- @Description("Whether to check for stable unique names on each transform. This is necessary to "
- + "support updating of pipelines.")
- @Default.Enum("WARNING")
- CheckEnabled getStableUniqueNames();
- void setStableUniqueNames(CheckEnabled enabled);
-}
[65/67] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java
new file mode 100644
index 0000000..525de69
--- /dev/null
+++ b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package ${package}.common;
+
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.options.Default;
+import com.google.cloud.dataflow.sdk.options.DefaultValueFactory;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+
+/**
+ * Options that can be used to configure Pub/Sub topic in Dataflow examples.
+ */
+public interface ExamplePubsubTopicOptions extends DataflowPipelineOptions {
+ @Description("Pub/Sub topic")
+ @Default.InstanceFactory(PubsubTopicFactory.class)
+ String getPubsubTopic();
+ void setPubsubTopic(String topic);
+
+ @Description("Number of workers to use when executing the injector pipeline")
+ @Default.Integer(1)
+ int getInjectorNumWorkers();
+ void setInjectorNumWorkers(int numWorkers);
+
+ /**
+ * Returns a default Pub/Sub topic based on the project and the job names.
+ */
+ static class PubsubTopicFactory implements DefaultValueFactory<String> {
+ @Override
+ public String create(PipelineOptions options) {
+ DataflowPipelineOptions dataflowPipelineOptions =
+ options.as(DataflowPipelineOptions.class);
+ return "projects/" + dataflowPipelineOptions.getProject()
+ + "/topics/" + dataflowPipelineOptions.getJobName();
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/PubsubFileInjector.java
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/PubsubFileInjector.java b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/PubsubFileInjector.java
new file mode 100644
index 0000000..f6f80ae
--- /dev/null
+++ b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/PubsubFileInjector.java
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package ${package}.common;
+
+import com.google.api.services.pubsub.Pubsub;
+import com.google.api.services.pubsub.model.PublishRequest;
+import com.google.api.services.pubsub.model.PubsubMessage;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.io.TextIO;
+import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
+import com.google.cloud.dataflow.sdk.options.Description;
+import com.google.cloud.dataflow.sdk.options.PipelineOptions;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.options.Validation;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.IntraBundleParallelization;
+import com.google.cloud.dataflow.sdk.util.Transport;
+import com.google.common.collect.ImmutableMap;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * A batch Dataflow pipeline for injecting a set of GCS files into
+ * a PubSub topic line by line. Empty lines are skipped.
+ *
+ * <p>This is useful for testing streaming
+ * pipelines. Note that since batch pipelines might retry chunks, this
+ * does _not_ guarantee exactly-once injection of file data. Some lines may
+ * be published multiple times.
+ * </p>
+ */
+public class PubsubFileInjector {
+
+ /**
+ * An incomplete {@code PubsubFileInjector} transform with unbound output topic.
+ */
+ public static class Unbound {
+ private final String timestampLabelKey;
+
+ Unbound() {
+ this.timestampLabelKey = null;
+ }
+
+ Unbound(String timestampLabelKey) {
+ this.timestampLabelKey = timestampLabelKey;
+ }
+
+ Unbound withTimestampLabelKey(String timestampLabelKey) {
+ return new Unbound(timestampLabelKey);
+ }
+
+ public Bound publish(String outputTopic) {
+ return new Bound(outputTopic, timestampLabelKey);
+ }
+ }
+
+ /** A DoFn that publishes non-empty lines to Google Cloud PubSub. */
+ public static class Bound extends DoFn<String, Void> {
+ private final String outputTopic;
+ private final String timestampLabelKey;
+ public transient Pubsub pubsub;
+
+ public Bound(String outputTopic, String timestampLabelKey) {
+ this.outputTopic = outputTopic;
+ this.timestampLabelKey = timestampLabelKey;
+ }
+
+ @Override
+ public void startBundle(Context context) {
+ this.pubsub =
+ Transport.newPubsubClient(context.getPipelineOptions().as(DataflowPipelineOptions.class))
+ .build();
+ }
+
+ @Override
+ public void processElement(ProcessContext c) throws IOException {
+ if (c.element().isEmpty()) {
+ return;
+ }
+ PubsubMessage pubsubMessage = new PubsubMessage();
+ pubsubMessage.encodeData(c.element().getBytes());
+ if (timestampLabelKey != null) {
+ pubsubMessage.setAttributes(
+ ImmutableMap.of(timestampLabelKey, Long.toString(c.timestamp().getMillis())));
+ }
+ PublishRequest publishRequest = new PublishRequest();
+ publishRequest.setMessages(Arrays.asList(pubsubMessage));
+ this.pubsub.projects().topics().publish(outputTopic, publishRequest).execute();
+ }
+ }
+
+ /**
+ * Creates a {@code PubsubFileInjector} transform with the given timestamp label key.
+ */
+ public static Unbound withTimestampLabelKey(String timestampLabelKey) {
+ return new Unbound(timestampLabelKey);
+ }
+
+ /**
+ * Creates a {@code PubsubFileInjector} transform that publishes to the given output topic.
+ */
+ public static Bound publish(String outputTopic) {
+ return new Unbound().publish(outputTopic);
+ }
+
+ /**
+ * Command line parameter options.
+ */
+ private interface PubsubFileInjectorOptions extends PipelineOptions {
+ @Description("GCS location of files.")
+ @Validation.Required
+ String getInput();
+ void setInput(String value);
+
+ @Description("Topic to publish on.")
+ @Validation.Required
+ String getOutputTopic();
+ void setOutputTopic(String value);
+ }
+
+ /**
+ * Sets up and starts streaming pipeline.
+ */
+ public static void main(String[] args) {
+ PubsubFileInjectorOptions options = PipelineOptionsFactory.fromArgs(args)
+ .withValidation()
+ .as(PubsubFileInjectorOptions.class);
+
+ Pipeline pipeline = Pipeline.create(options);
+
+ pipeline
+ .apply(TextIO.Read.from(options.getInput()))
+ .apply(IntraBundleParallelization.of(PubsubFileInjector.publish(options.getOutputTopic()))
+ .withMaxParallelism(20));
+
+ pipeline.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java
new file mode 100644
index 0000000..7a9aa4c
--- /dev/null
+++ b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package ${package};
+
+import com.google.common.io.Files;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.io.File;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Tests for {@link DebuggingWordCount}.
+ */
+@RunWith(JUnit4.class)
+public class DebuggingWordCountTest {
+ @Rule public TemporaryFolder tmpFolder = new TemporaryFolder();
+
+ @Test
+ public void testDebuggingWordCount() throws Exception {
+ File file = tmpFolder.newFile();
+ Files.write("stomach secret Flourish message Flourish here Flourish", file,
+ StandardCharsets.UTF_8);
+ DebuggingWordCount.main(new String[]{"--inputFile=" + file.getAbsolutePath()});
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java
new file mode 100644
index 0000000..45555ce
--- /dev/null
+++ b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package ${package};
+
+import ${package}.WordCount.CountWords;
+import ${package}.WordCount.ExtractWordsFn;
+import ${package}.WordCount.FormatAsTextFn;
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
+import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
+import com.google.cloud.dataflow.sdk.testing.RunnableOnService;
+import com.google.cloud.dataflow.sdk.testing.TestPipeline;
+import com.google.cloud.dataflow.sdk.transforms.Create;
+import com.google.cloud.dataflow.sdk.transforms.DoFnTester;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+import com.google.cloud.dataflow.sdk.values.PCollection;
+
+import org.hamcrest.CoreMatchers;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Tests of WordCount.
+ */
+@RunWith(JUnit4.class)
+public class WordCountTest {
+
+ /** Example test that tests a specific DoFn. */
+ @Test
+ public void testExtractWordsFn() {
+ DoFnTester<String, String> extractWordsFn =
+ DoFnTester.of(new ExtractWordsFn());
+
+ Assert.assertThat(extractWordsFn.processBatch(" some input words "),
+ CoreMatchers.hasItems("some", "input", "words"));
+ Assert.assertThat(extractWordsFn.processBatch(" "),
+ CoreMatchers.<String>hasItems());
+ Assert.assertThat(extractWordsFn.processBatch(" some ", " input", " words"),
+ CoreMatchers.hasItems("some", "input", "words"));
+ }
+
+ static final String[] WORDS_ARRAY = new String[] {
+ "hi there", "hi", "hi sue bob",
+ "hi sue", "", "bob hi"};
+
+ static final List<String> WORDS = Arrays.asList(WORDS_ARRAY);
+
+ static final String[] COUNTS_ARRAY = new String[] {
+ "hi: 5", "there: 1", "sue: 2", "bob: 2"};
+
+ /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */
+ @Test
+ @Category(RunnableOnService.class)
+ public void testCountWords() throws Exception {
+ Pipeline p = TestPipeline.create();
+
+ PCollection<String> input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));
+
+ PCollection<String> output = input.apply(new CountWords())
+ .apply(ParDo.of(new FormatAsTextFn()));
+
+ DataflowAssert.that(output).containsInAnyOrder(COUNTS_ARRAY);
+ p.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties b/sdks/java/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties
new file mode 100644
index 0000000..c59e77a
--- /dev/null
+++ b/sdks/java/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties
@@ -0,0 +1,5 @@
+package=it.pkg
+version=0.1-SNAPSHOT
+groupId=archetype.it
+artifactId=basic
+targetPlatform=1.7
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/examples/src/test/resources/projects/basic/goal.txt
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/examples/src/test/resources/projects/basic/goal.txt b/sdks/java/maven-archetypes/examples/src/test/resources/projects/basic/goal.txt
new file mode 100644
index 0000000..0b59873
--- /dev/null
+++ b/sdks/java/maven-archetypes/examples/src/test/resources/projects/basic/goal.txt
@@ -0,0 +1 @@
+verify
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/pom.xml
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/pom.xml b/sdks/java/maven-archetypes/pom.xml
new file mode 100644
index 0000000..59efe50
--- /dev/null
+++ b/sdks/java/maven-archetypes/pom.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.beam</groupId>
+ <artifactId>parent</artifactId>
+ <version>0.1.0-incubating-SNAPSHOT</version>
+ <relativePath>../../../pom.xml</relativePath>
+ </parent>
+
+ <artifactId>maven-archetypes-parent</artifactId>
+ <packaging>pom</packaging>
+
+ <name>Apache Beam :: Maven Archetypes</name>
+
+ <modules>
+ <module>starter</module>
+ <module>examples</module>
+ </modules>
+
+</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/starter/pom.xml
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/starter/pom.xml b/sdks/java/maven-archetypes/starter/pom.xml
new file mode 100644
index 0000000..933e8b1
--- /dev/null
+++ b/sdks/java/maven-archetypes/starter/pom.xml
@@ -0,0 +1,57 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.beam</groupId>
+ <artifactId>maven-archetypes-parent</artifactId>
+ <version>0.1.0-incubating-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+
+ <groupId>org.apache.beam</groupId>
+ <artifactId>maven-archetypes-starter</artifactId>
+ <name>Apache Beam :: Maven Archetypes :: Starter</name>
+ <description>A Maven archetype to create a simple starter pipeline to
+ get started using the Apache Beam Java SDK. </description>
+
+ <packaging>maven-archetype</packaging>
+
+ <build>
+ <extensions>
+ <extension>
+ <groupId>org.apache.maven.archetype</groupId>
+ <artifactId>archetype-packaging</artifactId>
+ <version>2.4</version>
+ </extension>
+ </extensions>
+
+ <pluginManagement>
+ <plugins>
+ <plugin>
+ <artifactId>maven-archetype-plugin</artifactId>
+ <version>2.4</version>
+ </plugin>
+ </plugins>
+ </pluginManagement>
+ </build>
+</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/starter/src/main/resources/META-INF/maven/archetype-metadata.xml
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/starter/src/main/resources/META-INF/maven/archetype-metadata.xml b/sdks/java/maven-archetypes/starter/src/main/resources/META-INF/maven/archetype-metadata.xml
new file mode 100644
index 0000000..bf75798
--- /dev/null
+++ b/sdks/java/maven-archetypes/starter/src/main/resources/META-INF/maven/archetype-metadata.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<archetype-descriptor
+ xsi:schemaLocation="http://maven.apache.org/plugins/maven-archetype-plugin/archetype-descriptor/1.0.0 http://maven.apache.org/xsd/archetype-descriptor-1.0.0.xsd"
+ name="Google Cloud Dataflow Starter Pipeline Archetype"
+ xmlns="http://maven.apache.org/plugins/maven-archetype-plugin/archetype-descriptor/1.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <requiredProperties>
+ <requiredProperty key="targetPlatform">
+ <defaultValue>1.7</defaultValue>
+ </requiredProperty>
+ </requiredProperties>
+
+ <fileSets>
+ <fileSet filtered="true" packaged="true" encoding="UTF-8">
+ <directory>src/main/java</directory>
+ <includes>
+ <include>**/*.java</include>
+ </includes>
+ </fileSet>
+ </fileSets>
+</archetype-descriptor>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml b/sdks/java/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml
new file mode 100644
index 0000000..19e7d2d
--- /dev/null
+++ b/sdks/java/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml
@@ -0,0 +1,43 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>${groupId}</groupId>
+ <artifactId>${artifactId}</artifactId>
+ <version>${version}</version>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>3.3</version>
+ <configuration>
+ <source>${targetPlatform}</source>
+ <target>${targetPlatform}</target>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.beam</groupId>
+ <artifactId>java-sdk-all</artifactId>
+ <version>[0-incubating, 1-incubating)</version>
+ </dependency>
+
+ <!-- slf4j API frontend binding with JUL backend -->
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>1.7.7</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-jdk14</artifactId>
+ <version>1.7.7</version>
+ </dependency>
+ </dependencies>
+</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/starter/src/main/resources/archetype-resources/src/main/java/StarterPipeline.java
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/starter/src/main/resources/archetype-resources/src/main/java/StarterPipeline.java b/sdks/java/maven-archetypes/starter/src/main/resources/archetype-resources/src/main/java/StarterPipeline.java
new file mode 100644
index 0000000..ffabbc0
--- /dev/null
+++ b/sdks/java/maven-archetypes/starter/src/main/resources/archetype-resources/src/main/java/StarterPipeline.java
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package ${package};
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.transforms.Create;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A starter example for writing Google Cloud Dataflow programs.
+ *
+ * <p>The example takes two strings, converts them to their upper-case
+ * representation and logs them.
+ *
+ * <p>To run this starter example locally using DirectPipelineRunner, just
+ * execute it without any additional parameters from your favorite development
+ * environment.
+ *
+ * <p>To run this starter example using managed resource in Google Cloud
+ * Platform, you should specify the following command-line options:
+ * --project=<YOUR_PROJECT_ID>
+ * --stagingLocation=<STAGING_LOCATION_IN_CLOUD_STORAGE>
+ * --runner=BlockingDataflowPipelineRunner
+ */
+public class StarterPipeline {
+ private static final Logger LOG = LoggerFactory.getLogger(StarterPipeline.class);
+
+ public static void main(String[] args) {
+ Pipeline p = Pipeline.create(
+ PipelineOptionsFactory.fromArgs(args).withValidation().create());
+
+ p.apply(Create.of("Hello", "World"))
+ .apply(ParDo.of(new DoFn<String, String>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ c.output(c.element().toUpperCase());
+ }
+ }))
+ .apply(ParDo.of(new DoFn<String, Void>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ LOG.info(c.element());
+ }
+ }));
+
+ p.run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties b/sdks/java/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties
new file mode 100644
index 0000000..c59e77a
--- /dev/null
+++ b/sdks/java/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties
@@ -0,0 +1,5 @@
+package=it.pkg
+version=0.1-SNAPSHOT
+groupId=archetype.it
+artifactId=basic
+targetPlatform=1.7
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/starter/src/test/resources/projects/basic/goal.txt
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/starter/src/test/resources/projects/basic/goal.txt b/sdks/java/maven-archetypes/starter/src/test/resources/projects/basic/goal.txt
new file mode 100644
index 0000000..0b59873
--- /dev/null
+++ b/sdks/java/maven-archetypes/starter/src/test/resources/projects/basic/goal.txt
@@ -0,0 +1 @@
+verify
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml b/sdks/java/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml
new file mode 100644
index 0000000..d29424a
--- /dev/null
+++ b/sdks/java/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml
@@ -0,0 +1,43 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>archetype.it</groupId>
+ <artifactId>basic</artifactId>
+ <version>0.1-SNAPSHOT</version>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>3.3</version>
+ <configuration>
+ <source>1.7</source>
+ <target>1.7</target>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.beam</groupId>
+ <artifactId>java-sdk-all</artifactId>
+ <version>[0-incubating, 1-incubating)</version>
+ </dependency>
+
+ <!-- slf4j API frontend binding with JUL backend -->
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>1.7.7</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-jdk14</artifactId>
+ <version>1.7.7</version>
+ </dependency>
+ </dependencies>
+</project>
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/257a7a6b/sdks/java/maven-archetypes/starter/src/test/resources/projects/basic/reference/src/main/java/it/pkg/StarterPipeline.java
----------------------------------------------------------------------
diff --git a/sdks/java/maven-archetypes/starter/src/test/resources/projects/basic/reference/src/main/java/it/pkg/StarterPipeline.java b/sdks/java/maven-archetypes/starter/src/test/resources/projects/basic/reference/src/main/java/it/pkg/StarterPipeline.java
new file mode 100644
index 0000000..2e7c4e1
--- /dev/null
+++ b/sdks/java/maven-archetypes/starter/src/test/resources/projects/basic/reference/src/main/java/it/pkg/StarterPipeline.java
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2015 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package it.pkg;
+
+import com.google.cloud.dataflow.sdk.Pipeline;
+import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
+import com.google.cloud.dataflow.sdk.transforms.Create;
+import com.google.cloud.dataflow.sdk.transforms.DoFn;
+import com.google.cloud.dataflow.sdk.transforms.ParDo;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A starter example for writing Google Cloud Dataflow programs.
+ *
+ * <p>The example takes two strings, converts them to their upper-case
+ * representation and logs them.
+ *
+ * <p>To run this starter example locally using DirectPipelineRunner, just
+ * execute it without any additional parameters from your favorite development
+ * environment.
+ *
+ * <p>To run this starter example using managed resource in Google Cloud
+ * Platform, you should specify the following command-line options:
+ * --project=<YOUR_PROJECT_ID>
+ * --stagingLocation=<STAGING_LOCATION_IN_CLOUD_STORAGE>
+ * --runner=BlockingDataflowPipelineRunner
+ */
+public class StarterPipeline {
+ private static final Logger LOG = LoggerFactory.getLogger(StarterPipeline.class);
+
+ public static void main(String[] args) {
+ Pipeline p = Pipeline.create(
+ PipelineOptionsFactory.fromArgs(args).withValidation().create());
+
+ p.apply(Create.of("Hello", "World"))
+ .apply(ParDo.of(new DoFn<String, String>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ c.output(c.element().toUpperCase());
+ }
+ }))
+ .apply(ParDo.of(new DoFn<String, Void>() {
+ @Override
+ public void processElement(ProcessContext c) {
+ LOG.info(c.element());
+ }
+ }));
+
+ p.run();
+ }
+}
[25/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Combine.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Combine.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Combine.java
deleted file mode 100644
index b8d20e3..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Combine.java
+++ /dev/null
@@ -1,2240 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.transforms;
-
-import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
-import com.google.cloud.dataflow.sdk.coders.CustomCoder;
-import com.google.cloud.dataflow.sdk.coders.DelegateCoder;
-import com.google.cloud.dataflow.sdk.coders.IterableCoder;
-import com.google.cloud.dataflow.sdk.coders.KvCoder;
-import com.google.cloud.dataflow.sdk.coders.StandardCoder;
-import com.google.cloud.dataflow.sdk.coders.VarIntCoder;
-import com.google.cloud.dataflow.sdk.coders.VoidCoder;
-import com.google.cloud.dataflow.sdk.transforms.CombineFnBase.AbstractGlobalCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineFnBase.AbstractPerKeyCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineFnBase.GlobalCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineFnBase.PerKeyCombineFn;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.CombineFnWithContext;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.Context;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.KeyedCombineFnWithContext;
-import com.google.cloud.dataflow.sdk.transforms.CombineWithContext.RequiresContextInternal;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
-import com.google.cloud.dataflow.sdk.util.AppliedCombineFn;
-import com.google.cloud.dataflow.sdk.util.PerKeyCombineFnRunner;
-import com.google.cloud.dataflow.sdk.util.PerKeyCombineFnRunners;
-import com.google.cloud.dataflow.sdk.util.PropertyNames;
-import com.google.cloud.dataflow.sdk.util.SerializableUtils;
-import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
-import com.google.cloud.dataflow.sdk.util.common.Counter;
-import com.google.cloud.dataflow.sdk.values.KV;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.cloud.dataflow.sdk.values.PCollectionList;
-import com.google.cloud.dataflow.sdk.values.PCollectionTuple;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-import com.google.cloud.dataflow.sdk.values.TupleTagList;
-import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Iterables;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
-import java.util.concurrent.ThreadLocalRandom;
-
-/**
- * {@code PTransform}s for combining {@code PCollection} elements
- * globally and per-key.
- *
- * <p>See the <a href="https://cloud.google.com/dataflow/model/combine">documentation</a>
- * for how to use the operations in this class.
- */
-public class Combine {
- private Combine() {
- // do not instantiate
- }
-
- /**
- * Returns a {@link Globally Combine.Globally} {@code PTransform}
- * that uses the given {@code SerializableFunction} to combine all
- * the elements in each window of the input {@code PCollection} into a
- * single value in the output {@code PCollection}. The types of the input
- * elements and the output elements must be the same.
- *
- * <p>If the input {@code PCollection} is windowed into {@link GlobalWindows},
- * a default value in the {@link GlobalWindow} will be output if the input
- * {@code PCollection} is empty. To use this with inputs with other windowing,
- * either {@link Globally#withoutDefaults} or {@link Globally#asSingletonView}
- * must be called.
- *
- * <p>See {@link Globally Combine.Globally} for more information.
- */
- public static <V> Globally<V, V> globally(
- SerializableFunction<Iterable<V>, V> combiner) {
- return globally(IterableCombineFn.of(combiner));
- }
-
- /**
- * Returns a {@link Globally Combine.Globally} {@code PTransform}
- * that uses the given {@code GloballyCombineFn} to combine all
- * the elements in each window of the input {@code PCollection} into a
- * single value in the output {@code PCollection}. The types of the input
- * elements and the output elements can differ.
- *
- * <p>If the input {@code PCollection} is windowed into {@link GlobalWindows},
- * a default value in the {@link GlobalWindow} will be output if the input
- * {@code PCollection} is empty. To use this with inputs with other windowing,
- * either {@link Globally#withoutDefaults} or {@link Globally#asSingletonView}
- * must be called.
- *
- * <p>See {@link Globally Combine.Globally} for more information.
- */
- public static <InputT, OutputT> Globally<InputT, OutputT> globally(
- GlobalCombineFn<? super InputT, ?, OutputT> fn) {
- return new Globally<>(fn, true, 0);
- }
-
- /**
- * Returns a {@link PerKey Combine.PerKey} {@code PTransform} that
- * first groups its input {@code PCollection} of {@code KV}s by keys and
- * windows, then invokes the given function on each of the values lists to
- * produce a combined value, and then returns a {@code PCollection}
- * of {@code KV}s mapping each distinct key to its combined value for each
- * window.
- *
- * <p>Each output element is in the window by which its corresponding input
- * was grouped, and has the timestamp of the end of that window. The output
- * {@code PCollection} has the same
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * as the input.
- *
- * <p>See {@link PerKey Combine.PerKey} for more information.
- */
- public static <K, V> PerKey<K, V, V> perKey(
- SerializableFunction<Iterable<V>, V> fn) {
- return perKey(Combine.IterableCombineFn.of(fn));
- }
-
- /**
- * Returns a {@link PerKey Combine.PerKey} {@code PTransform} that
- * first groups its input {@code PCollection} of {@code KV}s by keys and
- * windows, then invokes the given function on each of the values lists to
- * produce a combined value, and then returns a {@code PCollection}
- * of {@code KV}s mapping each distinct key to its combined value for each
- * window.
- *
- * <p>Each output element is in the window by which its corresponding input
- * was grouped, and has the timestamp of the end of that window. The output
- * {@code PCollection} has the same
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * as the input.
- *
- * <p>See {@link PerKey Combine.PerKey} for more information.
- */
- public static <K, InputT, OutputT> PerKey<K, InputT, OutputT> perKey(
- GlobalCombineFn<? super InputT, ?, OutputT> fn) {
- return perKey(fn.<K>asKeyedFn());
- }
-
- /**
- * Returns a {@link PerKey Combine.PerKey} {@code PTransform} that
- * first groups its input {@code PCollection} of {@code KV}s by keys and
- * windows, then invokes the given function on each of the key/values-lists
- * pairs to produce a combined value, and then returns a
- * {@code PCollection} of {@code KV}s mapping each distinct key to
- * its combined value for each window.
- *
- * <p>Each output element is in the window by which its corresponding input
- * was grouped, and has the timestamp of the end of that window. The output
- * {@code PCollection} has the same
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * as the input.
- *
- * <p>See {@link PerKey Combine.PerKey} for more information.
- */
- public static <K, InputT, OutputT> PerKey<K, InputT, OutputT> perKey(
- PerKeyCombineFn<? super K, ? super InputT, ?, OutputT> fn) {
- return new PerKey<>(fn, false /*fewKeys*/);
- }
-
- /**
- * Returns a {@link PerKey Combine.PerKey}, and set fewKeys
- * in {@link GroupByKey}.
- */
- private static <K, InputT, OutputT> PerKey<K, InputT, OutputT> fewKeys(
- PerKeyCombineFn<? super K, ? super InputT, ?, OutputT> fn) {
- return new PerKey<>(fn, true /*fewKeys*/);
- }
-
- /**
- * Returns a {@link GroupedValues Combine.GroupedValues}
- * {@code PTransform} that takes a {@code PCollection} of
- * {@code KV}s where a key maps to an {@code Iterable} of values, e.g.,
- * the result of a {@code GroupByKey}, then uses the given
- * {@code SerializableFunction} to combine all the values associated
- * with a key, ignoring the key. The type of the input and
- * output values must be the same.
- *
- * <p>Each output element has the same timestamp and is in the same window
- * as its corresponding input element, and the output
- * {@code PCollection} has the same
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * associated with it as the input.
- *
- * <p>See {@link GroupedValues Combine.GroupedValues} for more information.
- *
- * <p>Note that {@link #perKey(SerializableFunction)} is typically
- * more convenient to use than {@link GroupByKey} followed by
- * {@code groupedValues(...)}.
- */
- public static <K, V> GroupedValues<K, V, V> groupedValues(
- SerializableFunction<Iterable<V>, V> fn) {
- // Wrap the plain function with IterableCombineFn.of and pass the wrapper to another
- // groupedValues overload.
- return groupedValues(IterableCombineFn.of(fn));
- }
-
- /**
- * Returns a {@link GroupedValues Combine.GroupedValues}
- * {@code PTransform} that takes a {@code PCollection} of
- * {@code KV}s where a key maps to an {@code Iterable} of values, e.g.,
- * the result of a {@code GroupByKey}, then uses the given
- * {@code CombineFn} to combine all the values associated with a
- * key, ignoring the key. The types of the input and output values
- * can differ.
- *
- * <p>Each output element has the same timestamp and is in the same window
- * as its corresponding input element, and the output
- * {@code PCollection} has the same
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * associated with it as the input.
- *
- * <p>See {@link GroupedValues Combine.GroupedValues} for more information.
- *
- * <p>Note that {@link #perKey(CombineFnBase.GlobalCombineFn)} is typically
- * more convenient to use than {@link GroupByKey} followed by
- * {@code groupedValues(...)}.
- */
- public static <K, InputT, OutputT> GroupedValues<K, InputT, OutputT> groupedValues(
- GlobalCombineFn<? super InputT, ?, OutputT> fn) {
- // Adapt the key-agnostic function with asKeyedFn (K supplied as an explicit type
- // argument) and pass the adapted function to another groupedValues overload.
- return groupedValues(fn.<K>asKeyedFn());
- }
-
- /**
- * Returns a {@link GroupedValues Combine.GroupedValues}
- * {@code PTransform} that takes a {@code PCollection} of
- * {@code KV}s where a key maps to an {@code Iterable} of values, e.g.,
- * the result of a {@code GroupByKey}, then uses the given
- * {@code KeyedCombineFn} to combine all the values associated with
- * each key. The combining function is provided the key. The types
- * of the input and output values can differ.
- *
- * <p>Each output element has the same timestamp and is in the same window
- * as its corresponding input element, and the output
- * {@code PCollection} has the same
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * associated with it as the input.
- *
- * <p>See {@link GroupedValues Combine.GroupedValues} for more information.
- *
- * <p>Note that {@link #perKey(CombineFnBase.PerKeyCombineFn)} is typically
- * more convenient to use than {@link GroupByKey} followed by
- * {@code groupedValues(...)}.
- */
- public static <K, InputT, OutputT> GroupedValues<K, InputT, OutputT> groupedValues(
-     PerKeyCombineFn<? super K, ? super InputT, ?, OutputT> fn) {
-   // The keyed function is handed to the transform unchanged.
-   GroupedValues<K, InputT, OutputT> transform = new GroupedValues<>(fn);
-   return transform;
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A {@code CombineFn<InputT, AccumT, OutputT>} specifies how to combine a
- * collection of input values of type {@code InputT} into a single
- * output value of type {@code OutputT}. It does this via one or more
- * intermediate mutable accumulator values of type {@code AccumT}.
- *
- * <p>The overall process to combine a collection of input
- * {@code InputT} values into a single output {@code OutputT} value is as
- * follows:
- *
- * <ol>
- *
- * <li> The input {@code InputT} values are partitioned into one or more
- * batches.
- *
- * <li> For each batch, the {@link #createAccumulator} operation is
- * invoked to create a fresh mutable accumulator value of type
- * {@code AccumT}, initialized to represent the combination of zero
- * values.
- *
- * <li> For each input {@code InputT} value in a batch, the
- * {@link #addInput} operation is invoked to add the value to that
- * batch's accumulator {@code AccumT} value. The accumulator may just
- * record the new value (e.g., if {@code AccumT == List<InputT>}), or may do
- * work to represent the combination more compactly.
- *
- * <li> The {@link #mergeAccumulators} operation is invoked to
- * combine a collection of accumulator {@code AccumT} values into a
- * single combined output accumulator {@code AccumT} value, once the
- * merging accumulators have had all the input values in their
- * batches added to them. This operation is invoked repeatedly,
- * until there is only one accumulator value left.
- *
- * <li> The {@link #extractOutput} operation is invoked on the final
- * accumulator {@code AccumT} value to get the output {@code OutputT} value.
- *
- * </ol>
- *
- * <p>For example:
- * <pre> {@code
- * public class AverageFn extends CombineFn<Integer, AverageFn.Accum, Double> {
- * public static class Accum {
- * int sum = 0;
- * int count = 0;
- * }
- * public Accum createAccumulator() {
- * return new Accum();
- * }
- * public Accum addInput(Accum accum, Integer input) {
- * accum.sum += input;
- * accum.count++;
- * return accum;
- * }
- * public Accum mergeAccumulators(Iterable<Accum> accums) {
- * Accum merged = createAccumulator();
- * for (Accum accum : accums) {
- * merged.sum += accum.sum;
- * merged.count += accum.count;
- * }
- * return merged;
- * }
- * public Double extractOutput(Accum accum) {
- * return ((double) accum.sum) / accum.count;
- * }
- * }
- * PCollection<Integer> pc = ...;
- * PCollection<Double> average = pc.apply(Combine.globally(new AverageFn()));
- * } </pre>
- *
- * <p>Combining functions used by {@link Combine.Globally},
- * {@link Combine.PerKey}, {@link Combine.GroupedValues}, and
- * {@code PTransforms} derived from them should be
- * <i>associative</i> and <i>commutative</i>. Associativity is
- * required because input values are first broken up into subgroups
- * before being combined, and their intermediate results further
- * combined, in an arbitrary tree structure. Commutativity is
- * required because any order of the input values is ignored when
- * breaking up input values into groups.
- *
- * @param <InputT> type of input values
- * @param <AccumT> type of mutable accumulator values
- * @param <OutputT> type of output values
- */
- public abstract static class CombineFn<InputT, AccumT, OutputT>
- extends AbstractGlobalCombineFn<InputT, AccumT, OutputT> {
-
- /**
- * Returns a new, mutable accumulator value, representing the accumulation of zero input values.
- */
- public abstract AccumT createAccumulator();
-
- /**
- * Adds the given input value to the given accumulator, returning the
- * new accumulator value.
- *
- * <p>For efficiency, the input accumulator may be modified and returned.
- */
- public abstract AccumT addInput(AccumT accumulator, InputT input);
-
- /**
- * Returns an accumulator representing the accumulation of all the
- * input values accumulated in the merging accumulators.
- *
- * <p>May modify any of the argument accumulators. May return a
- * fresh accumulator, or may return one of the (modified) argument
- * accumulators.
- */
- public abstract AccumT mergeAccumulators(Iterable<AccumT> accumulators);
-
- /**
- * Returns the output value that is the result of combining all
- * the input values represented by the given accumulator.
- */
- public abstract OutputT extractOutput(AccumT accumulator);
-
- /**
- * Returns an accumulator that represents the same logical value as the
- * input accumulator, but may have a more compact representation.
- *
- * <p>For most CombineFns this would be a no-op, but should be overridden
- * by CombineFns that (for example) buffer up elements and combine
- * them in batches.
- *
- * <p>For efficiency, the input accumulator may be modified and returned.
- *
- * <p>By default returns the original accumulator.
- */
- public AccumT compact(AccumT accumulator) {
- return accumulator;
- }
-
- /**
- * Applies this {@code CombineFn} to a collection of input values
- * to produce a combined output value.
- *
- * <p>All inputs are added, in iteration order, to a single accumulator
- * obtained from {@link #createAccumulator}, and the result of
- * {@link #extractOutput} on that accumulator is returned.
- *
- * <p>Useful when using a {@code CombineFn} separately from a
- * {@code Combine} transform. Does not invoke the
- * {@link #mergeAccumulators} operation.
- */
- public OutputT apply(Iterable<? extends InputT> inputs) {
- AccumT accum = createAccumulator();
- for (InputT input : inputs) {
- // addInput may return a different accumulator instance, so keep its result.
- accum = addInput(accum, input);
- }
- return extractOutput(accum);
- }
-
- /**
- * {@inheritDoc}
- *
- * <p>By default returns the extract output of an empty accumulator.
- */
- @Override
- public OutputT defaultValue() {
- return extractOutput(createAccumulator());
- }
-
- /**
- * Returns a {@link TypeDescriptor} capturing what is known statically
- * about the output type of this {@code CombineFn} instance's
- * most-derived class.
- *
- * <p>In the normal case of a concrete {@code CombineFn} subclass with
- * no generic type parameters of its own, this will be a complete
- * non-generic type.
- */
- public TypeDescriptor<OutputT> getOutputType() {
- // NOTE(review): the anonymous subclass ({}) is presumably what lets TypeDescriptor
- // resolve OutputT against getClass() -- confirm against TypeDescriptor.
- return new TypeDescriptor<OutputT>(getClass()) {};
- }
-
- /**
- * Returns a {@link KeyedCombineFn} view of this {@code CombineFn}.
- *
- * <p>Each keyed operation of the returned function ignores its key argument
- * (and, for the coder lookups, its key coder) and delegates to the
- * corresponding operation of this instance; {@code forKey} returns this
- * instance itself.
- */
- @SuppressWarnings({"unchecked", "rawtypes"})
- @Override
- public <K> KeyedCombineFn<K, InputT, AccumT, OutputT> asKeyedFn() {
- // The key, an object, is never even looked at.
- return new KeyedCombineFn<K, InputT, AccumT, OutputT>() {
- @Override
- public AccumT createAccumulator(K key) {
- return CombineFn.this.createAccumulator();
- }
-
- @Override
- public AccumT addInput(K key, AccumT accumulator, InputT input) {
- return CombineFn.this.addInput(accumulator, input);
- }
-
- @Override
- public AccumT mergeAccumulators(K key, Iterable<AccumT> accumulators) {
- return CombineFn.this.mergeAccumulators(accumulators);
- }
-
- @Override
- public OutputT extractOutput(K key, AccumT accumulator) {
- return CombineFn.this.extractOutput(accumulator);
- }
-
- @Override
- public AccumT compact(K key, AccumT accumulator) {
- return CombineFn.this.compact(accumulator);
- }
-
- @Override
- public Coder<AccumT> getAccumulatorCoder(
- CoderRegistry registry, Coder<K> keyCoder, Coder<InputT> inputCoder)
- throws CannotProvideCoderException {
- return CombineFn.this.getAccumulatorCoder(registry, inputCoder);
- }
-
- @Override
- public Coder<OutputT> getDefaultOutputCoder(
- CoderRegistry registry, Coder<K> keyCoder, Coder<InputT> inputCoder)
- throws CannotProvideCoderException {
- return CombineFn.this.getDefaultOutputCoder(registry, inputCoder);
- }
-
- @Override
- public CombineFn<InputT, AccumT, OutputT> forKey(K key, Coder<K> keyCoder) {
- // Unwrapping the adapter for any key yields the original function.
- return CombineFn.this;
- }
- };
- }
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
-  * Base class for {@link CombineFn}s that are most naturally written as a single binary
-  * operation over values of type {@code V}.
-  *
-  * <p>The accumulator is a {@link Holder} that stays empty until the first value arrives; an
-  * accumulator that is still empty at extraction time yields {@link #identity}.
-  */
- public abstract static class BinaryCombineFn<V> extends
-     CombineFn<V, Holder<V>, V> {
-
-   /**
-    * Applies the binary operation to the two operands, returning the result.
-    */
-   public abstract V apply(V left, V right);
-
-   /**
-    * Returns the value that should be used for the combine of the empty set.
-    */
-   public V identity() {
-     return null;
-   }
-
-   @Override
-   public Holder<V> createAccumulator() {
-     return new Holder<>();
-   }
-
-   @Override
-   public Holder<V> addInput(Holder<V> accumulator, V input) {
-     // The first input is stored as-is; later inputs are combined with the stored value.
-     V updated = accumulator.present ? apply(accumulator.value, input) : input;
-     accumulator.set(updated);
-     return accumulator;
-   }
-
-   @Override
-   public Holder<V> mergeAccumulators(Iterable<Holder<V>> accumulators) {
-     Iterator<Holder<V>> remaining = accumulators.iterator();
-     if (!remaining.hasNext()) {
-       return createAccumulator();
-     }
-     // The first accumulator is reused (and mutated) as the merge target.
-     Holder<V> merged = remaining.next();
-     while (remaining.hasNext()) {
-       Holder<V> next = remaining.next();
-       if (!next.present) {
-         continue;
-       }
-       if (merged.present) {
-         merged.set(apply(merged.value, next.value));
-       } else {
-         merged.set(next.value);
-       }
-     }
-     return merged;
-   }
-
-   @Override
-   public V extractOutput(Holder<V> accumulator) {
-     return accumulator.present ? accumulator.value : identity();
-   }
-
-   @Override
-   public Coder<Holder<V>> getAccumulatorCoder(CoderRegistry registry, Coder<V> inputCoder) {
-     return new HolderCoder<>(inputCoder);
-   }
-
-   @Override
-   public Coder<V> getDefaultOutputCoder(CoderRegistry registry, Coder<V> inputCoder) {
-     return inputCoder;
-   }
-
- }
-
- /**
-  * Holds a single value of type {@code V} which may or may not be present.
-  *
-  * <p>Used only as a private accumulator class.
-  */
- public static class Holder<V> {
-   private V value;
-   private boolean present;
-
-   /** Creates a holder with no value present. */
-   private Holder() { }
-
-   /** Creates a holder in which {@code value} is present. */
-   private Holder(V value) {
-     this.value = value;
-     this.present = true;
-   }
-
-   /** Stores {@code value} and marks this holder as present. */
-   private void set(V value) {
-     this.value = value;
-     this.present = true;
-   }
- }
-
- /**
-  * A {@link Coder} for a {@link Holder}.
-  *
-  * <p>Encoding: a single marker byte, {@code 1} when a value is present and {@code 0}
-  * otherwise; when present, the value follows, encoded with the wrapped value coder.
-  */
- private static class HolderCoder<V> extends CustomCoder<Holder<V>> {
-
-   private final Coder<V> valueCoder;
-
-   public HolderCoder(Coder<V> valueCoder) {
-     this.valueCoder = valueCoder;
-   }
-
-   @Override
-   public List<Coder<?>> getCoderArguments() {
-     return Arrays.<Coder<?>>asList(valueCoder);
-   }
-
-   /** Writes the presence marker and, for a present holder, the encoded value. */
-   @Override
-   public void encode(Holder<V> accumulator, OutputStream outStream, Context context)
-       throws CoderException, IOException {
-     if (accumulator.present) {
-       outStream.write(1);
-       valueCoder.encode(accumulator.value, outStream, context);
-     } else {
-       outStream.write(0);
-     }
-   }
-
-   /**
-    * Reads a holder written by {@link #encode}.
-    *
-    * @throws CoderException if the marker byte is neither {@code 0} nor {@code 1}, which
-    *     includes reaching the end of the stream before a marker could be read
-    */
-   @Override
-   public Holder<V> decode(InputStream inStream, Context context)
-       throws CoderException, IOException {
-     int marker = inStream.read();
-     if (marker == 1) {
-       return new Holder<>(valueCoder.decode(inStream, context));
-     }
-     if (marker == 0) {
-       return new Holder<>();
-     }
-     // Anything else (including -1 for end of stream) means truncated or corrupt data;
-     // report it rather than silently decoding it as an empty holder.
-     throw new CoderException(
-         "Invalid Holder presence marker: " + marker
-             + " (expected 0 or 1; -1 indicates end of stream)");
-   }
-
-   /** The encoding is deterministic exactly when the value coder's encoding is. */
-   @Override
-   public void verifyDeterministic() throws NonDeterministicException {
-     valueCoder.verifyDeterministic();
-   }
- }
-
- /**
-  * An abstract subclass of {@link CombineFn} for implementing combiners that are more
-  * easily and efficiently expressed as binary operations on <code>int</code>s.
-  *
-  * <p>It uses a single-element {@code int[]} as the mutable accumulator.
-  */
- public abstract static class BinaryCombineIntegerFn extends CombineFn<Integer, int[], Integer> {
-
-   /**
-    * Applies the binary operation to the two operands, returning the result.
-    */
-   public abstract int apply(int left, int right);
-
-   /**
-    * Returns the identity element of this operation, i.e. an element {@code e}
-    * such that {@code apply(e, x) == apply(x, e) == x} for all values of {@code x}.
-    */
-   public abstract int identity();
-
-   /** Returns a fresh one-element accumulator holding {@link #identity}. */
-   @Override
-   public int[] createAccumulator() {
-     return wrap(identity());
-   }
-
-   /** Combines {@code input} into the accumulator's single slot, in place. */
-   @Override
-   public int[] addInput(int[] accumulator, Integer input) {
-     accumulator[0] = apply(accumulator[0], input);
-     return accumulator;
-   }
-
-   /**
-    * Folds every subsequent accumulator into the first one, which is modified and
-    * returned. An empty iterable yields a fresh accumulator.
-    */
-   @Override
-   public int[] mergeAccumulators(Iterable<int[]> accumulators) {
-     Iterator<int[]> iter = accumulators.iterator();
-     if (!iter.hasNext()) {
-       return createAccumulator();
-     } else {
-       int[] running = iter.next();
-       while (iter.hasNext()) {
-         running[0] = apply(running[0], iter.next()[0]);
-       }
-       return running;
-     }
-   }
-
-   @Override
-   public Integer extractOutput(int[] accumulator) {
-     return accumulator[0];
-   }
-
-   /**
-    * Encodes the accumulator by delegating to {@code inputCoder}, converting between the
-    * one-element array and its boxed value.
-    */
-   @Override
-   public Coder<int[]> getAccumulatorCoder(CoderRegistry registry, Coder<Integer> inputCoder) {
-     return DelegateCoder.of(
-         inputCoder,
-         new DelegateCoder.CodingFunction<int[], Integer>() {
-           @Override
-           public Integer apply(int[] accumulator) {
-             return accumulator[0];
-           }
-         },
-         new DelegateCoder.CodingFunction<Integer, int[]>() {
-           @Override
-           public int[] apply(Integer value) {
-             return wrap(value);
-           }
-         });
-   }
-
-   @Override
-   public Coder<Integer> getDefaultOutputCoder(CoderRegistry registry,
-       Coder<Integer> inputCoder) {
-     return inputCoder;
-   }
-
-   private int[] wrap(int value) {
-     return new int[] { value };
-   }
-
-   /**
-    * Not supported by this class.
-    *
-    * @throws UnsupportedOperationException always
-    */
-   public Counter<Integer> getCounter(String name) {
-     // The message previously named BinaryCombineDoubleFn, which misattributed the failure.
-     throw new UnsupportedOperationException("BinaryCombineIntegerFn does not support getCounter");
-   }
- }
-
- /**
-  * An abstract subclass of {@link CombineFn} for implementing combiners that are more
-  * easily and efficiently expressed as binary operations on <code>long</code>s.
-  *
-  * <p>It uses a single-element {@code long[]} as the mutable accumulator.
-  */
- public abstract static class BinaryCombineLongFn extends CombineFn<Long, long[], Long> {
-   /**
-    * Applies the binary operation to the two operands, returning the result.
-    */
-   public abstract long apply(long left, long right);
-
-   /**
-    * Returns the identity element of this operation, i.e. an element {@code e}
-    * such that {@code apply(e, x) == apply(x, e) == x} for all values of {@code x}.
-    */
-   public abstract long identity();
-
-   /** Returns a fresh one-element accumulator holding {@link #identity}. */
-   @Override
-   public long[] createAccumulator() {
-     return wrap(identity());
-   }
-
-   /** Combines {@code input} into the accumulator's single slot, in place. */
-   @Override
-   public long[] addInput(long[] accumulator, Long input) {
-     accumulator[0] = apply(accumulator[0], input);
-     return accumulator;
-   }
-
-   /**
-    * Folds every subsequent accumulator into the first one, which is modified and
-    * returned. An empty iterable yields a fresh accumulator.
-    */
-   @Override
-   public long[] mergeAccumulators(Iterable<long[]> accumulators) {
-     Iterator<long[]> iter = accumulators.iterator();
-     if (!iter.hasNext()) {
-       return createAccumulator();
-     } else {
-       long[] running = iter.next();
-       while (iter.hasNext()) {
-         running[0] = apply(running[0], iter.next()[0]);
-       }
-       return running;
-     }
-   }
-
-   @Override
-   public Long extractOutput(long[] accumulator) {
-     return accumulator[0];
-   }
-
-   /**
-    * Encodes the accumulator by delegating to {@code inputCoder}, converting between the
-    * one-element array and its boxed value.
-    */
-   @Override
-   public Coder<long[]> getAccumulatorCoder(CoderRegistry registry, Coder<Long> inputCoder) {
-     return DelegateCoder.of(
-         inputCoder,
-         new DelegateCoder.CodingFunction<long[], Long>() {
-           @Override
-           public Long apply(long[] accumulator) {
-             return accumulator[0];
-           }
-         },
-         new DelegateCoder.CodingFunction<Long, long[]>() {
-           @Override
-           public long[] apply(Long value) {
-             return wrap(value);
-           }
-         });
-   }
-
-   @Override
-   public Coder<Long> getDefaultOutputCoder(CoderRegistry registry, Coder<Long> inputCoder) {
-     return inputCoder;
-   }
-
-   private long[] wrap(long value) {
-     return new long[] { value };
-   }
-
-   /**
-    * Not supported by this class.
-    *
-    * @throws UnsupportedOperationException always
-    */
-   public Counter<Long> getCounter(String name) {
-     // The message previously named BinaryCombineDoubleFn, which misattributed the failure.
-     throw new UnsupportedOperationException("BinaryCombineLongFn does not support getCounter");
-   }
- }
-
- /**
-  * Base class for {@link CombineFn}s over <code>double</code>s that are easiest and most
-  * efficient to express as a single binary operation.
-  *
-  * <p>The mutable accumulator is a one-element {@code double[]}.
-  */
- public abstract static class BinaryCombineDoubleFn extends CombineFn<Double, double[], Double> {
-
-   /**
-    * Applies the binary operation to the two operands, returning the result.
-    */
-   public abstract double apply(double left, double right);
-
-   /**
-    * Returns the identity element of this operation, i.e. an element {@code e}
-    * such that {@code apply(e, x) == apply(x, e) == x} for all values of {@code x}.
-    */
-   public abstract double identity();
-
-   @Override
-   public double[] createAccumulator() {
-     return wrap(identity());
-   }
-
-   @Override
-   public double[] addInput(double[] accumulator, Double input) {
-     double combined = apply(accumulator[0], input);
-     accumulator[0] = combined;
-     return accumulator;
-   }
-
-   @Override
-   public double[] mergeAccumulators(Iterable<double[]> accumulators) {
-     Iterator<double[]> rest = accumulators.iterator();
-     if (!rest.hasNext()) {
-       return createAccumulator();
-     }
-     // The first accumulator is reused (and mutated) as the merge target.
-     double[] merged = rest.next();
-     while (rest.hasNext()) {
-       merged[0] = apply(merged[0], rest.next()[0]);
-     }
-     return merged;
-   }
-
-   @Override
-   public Double extractOutput(double[] accumulator) {
-     return accumulator[0];
-   }
-
-   @Override
-   public Coder<double[]> getAccumulatorCoder(CoderRegistry registry, Coder<Double> inputCoder) {
-     // Accumulator -> boxed value, used when encoding.
-     DelegateCoder.CodingFunction<double[], Double> unwrap =
-         new DelegateCoder.CodingFunction<double[], Double>() {
-           @Override
-           public Double apply(double[] accumulator) {
-             return accumulator[0];
-           }
-         };
-     // Boxed value -> fresh accumulator, used when decoding.
-     DelegateCoder.CodingFunction<Double, double[]> rewrap =
-         new DelegateCoder.CodingFunction<Double, double[]>() {
-           @Override
-           public double[] apply(Double value) {
-             return wrap(value);
-           }
-         };
-     return DelegateCoder.of(inputCoder, unwrap, rewrap);
-   }
-
-   @Override
-   public Coder<Double> getDefaultOutputCoder(CoderRegistry registry, Coder<Double> inputCoder) {
-     return inputCoder;
-   }
-
-   private double[] wrap(double value) {
-     return new double[] { value };
-   }
-
-   /**
-    * Not supported by this class.
-    *
-    * @throws UnsupportedOperationException always
-    */
-   public Counter<Double> getCounter(String name) {
-     throw new UnsupportedOperationException("BinaryCombineDoubleFn does not support getCounter");
-   }
- }
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * A {@code CombineFn} that uses a subclass of
- * {@link AccumulatingCombineFn.Accumulator} as its accumulator
- * type. By defining the operations of the {@code Accumulator}
- * helper class, the operations of the enclosing {@code CombineFn}
- * are automatically provided. This can reduce the code required to
- * implement a {@code CombineFn}.
- *
- * <p>For example, the example from {@link CombineFn} above can be
- * expressed using {@code AccumulatingCombineFn} more concisely as
- * follows:
- *
- * <pre> {@code
- * public class AverageFn
- * extends AccumulatingCombineFn<Integer, AverageFn.Accum, Double> {
- * public Accum createAccumulator() {
- * return new Accum();
- * }
- * public class Accum
- * implements AccumulatingCombineFn.Accumulator<Integer, Accum, Double> {
- * private int sum = 0;
- * private int count = 0;
- * public void addInput(Integer input) {
- * sum += input;
- * count++;
- * }
- * public void mergeAccumulator(Accum other) {
- * sum += other.sum;
- * count += other.count;
- * }
- * public Double extractOutput() {
- * return ((double) sum) / count;
- * }
- * }
- * }
- * PCollection<Integer> pc = ...;
- * PCollection<Double> average = pc.apply(Combine.globally(new AverageFn()));
- * } </pre>
- *
- * @param <InputT> type of input values
- * @param <AccumT> type of mutable accumulator values
- * @param <OutputT> type of output values
- */
- public abstract static class AccumulatingCombineFn<
-     InputT,
-     AccumT extends AccumulatingCombineFn.Accumulator<InputT, AccumT, OutputT>,
-     OutputT>
-     extends CombineFn<InputT, AccumT, OutputT> {
-
-   /**
-    * Contract for the mutable accumulator values used by an
-    * {@code AccumulatingCombineFn}; the enclosing function forwards its own
-    * operations to these methods.
-    */
-   public interface Accumulator<InputT, AccumT, OutputT> {
-     /**
-      * Folds the given input value into this accumulator, mutating it.
-      */
-     void addInput(InputT input);
-
-     /**
-      * Folds the input values represented by {@code other} into this accumulator.
-      */
-     void mergeAccumulator(AccumT other);
-
-     /**
-      * Returns the output value obtained by combining all the input values
-      * this accumulator represents.
-      */
-     OutputT extractOutput();
-   }
-
-   /** Forwards to {@link Accumulator#addInput} and returns the same accumulator. */
-   @Override
-   public final AccumT addInput(AccumT accumulator, InputT input) {
-     accumulator.addInput(input);
-     return accumulator;
-   }
-
-   /** Merges every given accumulator into a freshly created one, which is returned. */
-   @Override
-   public final AccumT mergeAccumulators(Iterable<AccumT> accumulators) {
-     AccumT merged = createAccumulator();
-     Iterator<AccumT> parts = accumulators.iterator();
-     while (parts.hasNext()) {
-       merged.mergeAccumulator(parts.next());
-     }
-     return merged;
-   }
-
-   /** Forwards to {@link Accumulator#extractOutput}. */
-   @Override
-   public final OutputT extractOutput(AccumT accumulator) {
-     return accumulator.extractOutput();
-   }
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
-
- /**
- * A {@code KeyedCombineFn<K, InputT, AccumT, OutputT>} specifies how to combine
- * a collection of input values of type {@code InputT}, associated with
- * a key of type {@code K}, into a single output value of type
- * {@code OutputT}. It does this via one or more intermediate mutable
- * accumulator values of type {@code AccumT}.
- *
- * <p>The overall process to combine a collection of input
- * {@code InputT} values associated with an input {@code K} key into a
- * single output {@code OutputT} value is as follows:
- *
- * <ol>
- *
- * <li> The input {@code InputT} values are partitioned into one or more
- * batches.
- *
- * <li> For each batch, the {@link #createAccumulator} operation is
- * invoked to create a fresh mutable accumulator value of type
- * {@code AccumT}, initialized to represent the combination of zero
- * values.
- *
- * <li> For each input {@code InputT} value in a batch, the
- * {@link #addInput} operation is invoked to add the value to that
- * batch's accumulator {@code AccumT} value. The accumulator may just
- * record the new value (e.g., if {@code AccumT == List<InputT>}), or may do
- * work to represent the combination more compactly.
- *
- * <li> The {@link #mergeAccumulators} operation is invoked to
- * combine a collection of accumulator {@code AccumT} values into a
- * single combined output accumulator {@code AccumT} value, once the
- * merging accumulators have had all the input values in their
- * batches added to them. This operation is invoked repeatedly,
- * until there is only one accumulator value left.
- *
- * <li> The {@link #extractOutput} operation is invoked on the final
- * accumulator {@code AccumT} value to get the output {@code OutputT} value.
- *
- * </ol>
- *
- * <p>All of these operations are passed the {@code K} key that the
- * values being combined are associated with.
- *
- * <p>For example:
- * <pre> {@code
- * public class ConcatFn
- * extends KeyedCombineFn<String, Integer, ConcatFn.Accum, String> {
- * public static class Accum {
- * String s = "";
- * }
- * public Accum createAccumulator(String key) {
- * return new Accum();
- * }
- * public Accum addInput(String key, Accum accum, Integer input) {
- * accum.s += "+" + input;
- * return accum;
- * }
- * public Accum mergeAccumulators(String key, Iterable<Accum> accums) {
- * Accum merged = new Accum();
- * for (Accum accum : accums) {
- * merged.s += accum.s;
- * }
- * return merged;
- * }
- * public String extractOutput(String key, Accum accum) {
- * return key + accum.s;
- * }
- * }
- * PCollection<KV<String, Integer>> pc = ...;
- * PCollection<KV<String, String>> pc2 = pc.apply(
- * Combine.perKey(new ConcatFn()));
- * } </pre>
- *
- * <p>Keyed combining functions used by {@link Combine.PerKey},
- * {@link Combine.GroupedValues}, and {@code PTransforms} derived
- * from them should be <i>associative</i> and <i>commutative</i>.
- * Associativity is required because input values are first broken
- * up into subgroups before being combined, and their intermediate
- * results further combined, in an arbitrary tree structure.
- * Commutativity is required because any order of the input values
- * is ignored when breaking up input values into groups.
- *
- * @param <K> type of keys
- * @param <InputT> type of input values
- * @param <AccumT> type of mutable accumulator values
- * @param <OutputT> type of output values
- */
- public abstract static class KeyedCombineFn<K, InputT, AccumT, OutputT>
- extends AbstractPerKeyCombineFn<K, InputT, AccumT, OutputT> {
- /**
- * Returns a new, mutable accumulator value representing the accumulation of zero input values.
- *
- * @param key the key that all the accumulated values using the
- * accumulator are associated with
- */
- public abstract AccumT createAccumulator(K key);
-
- /**
- * Adds the given input value to the given accumulator, returning the new accumulator value.
- *
- * <p>For efficiency, the input accumulator may be modified and returned.
- *
- * @param key the key that all the accumulated values using the
- * accumulator are associated with
- */
- public abstract AccumT addInput(K key, AccumT accumulator, InputT value);
-
- /**
- * Returns an accumulator representing the accumulation of all the
- * input values accumulated in the merging accumulators.
- *
- * <p>May modify any of the argument accumulators. May return a
- * fresh accumulator, or may return one of the (modified) argument
- * accumulators.
- *
- * @param key the key that all the accumulators are associated
- * with
- */
- public abstract AccumT mergeAccumulators(K key, Iterable<AccumT> accumulators);
-
- /**
- * Returns the output value that is the result of combining all
- * the input values represented by the given accumulator.
- *
- * @param key the key that all the accumulated values using the
- * accumulator are associated with
- */
- public abstract OutputT extractOutput(K key, AccumT accumulator);
-
- /**
- * Returns an accumulator that represents the same logical value as the
- * input accumulator, but may have a more compact representation.
- *
- * <p>For most CombineFns this would be a no-op, but should be overridden
- * by CombineFns that (for example) buffer up elements and combine
- * them in batches.
- *
- * <p>For efficiency, the input accumulator may be modified and returned.
- *
- * <p>By default returns the original accumulator.
- */
- public AccumT compact(K key, AccumT accumulator) {
- return accumulator;
- }
-
- /**
- * Returns a {@link CombineFn} that binds this {@code KeyedCombineFn} to the
- * given key.
- *
- * <p>Each operation of the returned function delegates to the keyed
- * operation of this instance, passing {@code key}; the coder lookups
- * additionally pass {@code keyCoder}.
- *
- * @param key the key supplied to every delegated operation
- * @param keyCoder the key coder supplied to the delegated coder lookups
- */
- @Override
- public CombineFn<InputT, AccumT, OutputT> forKey(final K key, final Coder<K> keyCoder) {
- return new CombineFn<InputT, AccumT, OutputT>() {
-
- @Override
- public AccumT createAccumulator() {
- return KeyedCombineFn.this.createAccumulator(key);
- }
-
- @Override
- public AccumT addInput(AccumT accumulator, InputT input) {
- return KeyedCombineFn.this.addInput(key, accumulator, input);
- }
-
- @Override
- public AccumT mergeAccumulators(Iterable<AccumT> accumulators) {
- return KeyedCombineFn.this.mergeAccumulators(key, accumulators);
- }
-
- @Override
- public OutputT extractOutput(AccumT accumulator) {
- return KeyedCombineFn.this.extractOutput(key, accumulator);
- }
-
- @Override
- public AccumT compact(AccumT accumulator) {
- return KeyedCombineFn.this.compact(key, accumulator);
- }
-
- @Override
- public Coder<AccumT> getAccumulatorCoder(CoderRegistry registry, Coder<InputT> inputCoder)
- throws CannotProvideCoderException {
- return KeyedCombineFn.this.getAccumulatorCoder(registry, keyCoder, inputCoder);
- }
-
- @Override
- public Coder<OutputT> getDefaultOutputCoder(
- CoderRegistry registry, Coder<InputT> inputCoder) throws CannotProvideCoderException {
- return KeyedCombineFn.this.getDefaultOutputCoder(registry, keyCoder, inputCoder);
- }
- };
- }
-
- /**
- * Applies this {@code KeyedCombineFn} to a key and a collection
- * of input values to produce a combined output value.
- *
- * <p>All inputs are added, in iteration order, to a single accumulator
- * created for {@code key}; {@link #mergeAccumulators} is not invoked.
- *
- * <p>Useful when testing the behavior of a {@code KeyedCombineFn}
- * separately from a {@code Combine} transform.
- */
- public OutputT apply(K key, Iterable<? extends InputT> inputs) {
- AccumT accum = createAccumulator(key);
- for (InputT input : inputs) {
- // addInput may return a different accumulator instance, so keep its result.
- accum = addInput(key, accum, input);
- }
- return extractOutput(key, accum);
- }
- }
-
- ////////////////////////////////////////////////////////////////////////////
-
- /**
- * {@code Combine.Globally<InputT, OutputT>} takes a {@code PCollection<InputT>}
- * and returns a {@code PCollection<OutputT>} whose elements are the result of
- * combining all the elements in each window of the input {@code PCollection},
- * using a specified {@link CombineFn CombineFn<InputT, AccumT, OutputT>}.
- * It is common for {@code InputT == OutputT}, but not required. Common combining
- * functions include sums, mins, maxes, and averages of numbers,
- * conjunctions and disjunctions of booleans, statistical
- * aggregations, etc.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<Integer> pc = ...;
- * PCollection<Integer> sum = pc.apply(
- * Combine.globally(new Sum.SumIntegerFn()));
- * } </pre>
- *
- * <p>Combining can happen in parallel, with different subsets of the
- * input {@code PCollection} being combined separately, and their
- * intermediate results combined further, in an arbitrary tree
- * reduction pattern, until a single result value is produced.
- *
- * <p>If the input {@code PCollection} is windowed into {@link GlobalWindows},
- * a default value in the {@link GlobalWindow} will be output if the input
- * {@code PCollection} is empty. To use this with inputs with other windowing,
- * either {@link #withoutDefaults} or {@link #asSingletonView} must be called,
- * as the default value cannot be automatically assigned to any single window.
- *
- * <p>By default, the {@code Coder} of the output {@code PValue<OutputT>}
- * is inferred from the concrete type of the
- * {@code CombineFn<InputT, AccumT, OutputT>}'s output type {@code OutputT}.
- *
- * <p>See also {@link #perKey}/{@link PerKey Combine.PerKey} and
- * {@link #groupedValues}/{@link GroupedValues Combine.GroupedValues}, which
- * are useful for combining values associated with each key in
- * a {@code PCollection} of {@code KV}s.
- *
- * @param <InputT> type of input values
- * @param <OutputT> type of output values
- */
- public static class Globally<InputT, OutputT>
-     extends PTransform<PCollection<InputT>, PCollection<OutputT>> {
-
-   /** Combining function applied to the elements of each window. */
-   private final GlobalCombineFn<? super InputT, ?, OutputT> fn;
-
-   /** When true, {@link #apply} adds {@code fn.defaultValue()} if the combined output is empty. */
-   private final boolean insertDefault;
-
-   /** Hot-key fanout for the underlying per-key combine; only used when it is at least 2. */
-   private final int fanout;
-
-   /** Side inputs handed to the underlying per-key combine; empty unless set explicitly. */
-   private final List<PCollectionView<?>> sideInputs;
-
-   private Globally(GlobalCombineFn<? super InputT, ?, OutputT> fn,
-       boolean insertDefault, int fanout) {
-     this.fn = fn;
-     this.insertDefault = insertDefault;
-     this.fanout = fanout;
-     this.sideInputs = ImmutableList.<PCollectionView<?>>of();
-   }
-
-   private Globally(String name, GlobalCombineFn<? super InputT, ?, OutputT> fn,
-       boolean insertDefault, int fanout) {
-     super(name);
-     this.fn = fn;
-     this.insertDefault = insertDefault;
-     this.fanout = fanout;
-     this.sideInputs = ImmutableList.<PCollectionView<?>>of();
-   }
-
-   private Globally(String name, GlobalCombineFn<? super InputT, ?, OutputT> fn,
-       boolean insertDefault, int fanout, List<PCollectionView<?>> sideInputs) {
-     super(name);
-     this.fn = fn;
-     this.insertDefault = insertDefault;
-     this.fanout = fanout;
-     this.sideInputs = sideInputs;
-   }
-
-   /**
-    * Return a new {@code Globally} transform that's like this transform but with the
-    * specified name. Does not modify this transform.
-    *
-    * <p>Side inputs configured through {@link #withSideInputs} are carried over.
-    */
-   public Globally<InputT, OutputT> named(String name) {
-     return new Globally<InputT, OutputT>(name, fn, insertDefault, fanout, sideInputs);
-   }
-
-   /**
-    * Returns a {@link PTransform} that produces a {@code PCollectionView}
-    * whose elements are the result of combining elements per-window in
-    * the input {@code PCollection}. If a value is requested from the view
-    * for a window that is not present, the result of applying the {@code CombineFn}
-    * to an empty input set will be returned.
-    */
-   public GloballyAsSingletonView<InputT, OutputT> asSingletonView() {
-     return new GloballyAsSingletonView<>(fn, insertDefault, fanout);
-   }
-
-   /**
-    * Returns a {@link PTransform} identical to this, but that does not attempt to
-    * provide a default value in the case of empty input. Required when the input
-    * is not globally windowed and the output is not being used as a side input.
-    *
-    * <p>Side inputs configured through {@link #withSideInputs} are carried over.
-    */
-   public Globally<InputT, OutputT> withoutDefaults() {
-     return new Globally<InputT, OutputT>(name, fn, false, fanout, sideInputs);
-   }
-
-   /**
-    * Returns a {@link PTransform} identical to this, but that uses an intermediate node
-    * to combine parts of the data to reduce load on the final global combine step.
-    *
-    * <p>The {@code fanout} parameter determines the number of intermediate keys
-    * that will be used.
-    *
-    * <p>Side inputs configured through {@link #withSideInputs} are carried over.
-    */
-   public Globally<InputT, OutputT> withFanout(int fanout) {
-     return new Globally<InputT, OutputT>(name, fn, insertDefault, fanout, sideInputs);
-   }
-
-   /**
-    * Returns a {@link PTransform} identical to this, but with the specified side inputs to use
-    * in {@link CombineFnWithContext}.
-    *
-    * @throws IllegalStateException if the combining function is not a
-    *     {@code RequiresContextInternal}
-    */
-   public Globally<InputT, OutputT> withSideInputs(
-       Iterable<? extends PCollectionView<?>> sideInputs) {
-     Preconditions.checkState(fn instanceof RequiresContextInternal);
-     return new Globally<InputT, OutputT>(name, fn, insertDefault, fanout,
-         ImmutableList.<PCollectionView<?>>copyOf(sideInputs));
-   }
-
-   /**
-    * Keys every element with {@code null}, combines per key, drops the key again and,
-    * when defaults are enabled, adds the default value for an empty result.
-    *
-    * @throws IllegalStateException if defaults are enabled and the output windowing is not
-    *     compatible with {@link GlobalWindows}
-    */
-   @Override
-   public PCollection<OutputT> apply(PCollection<InputT> input) {
-     PCollection<KV<Void, InputT>> withKeys = input
-         .apply(WithKeys.<Void, InputT>of((Void) null))
-         .setCoder(KvCoder.of(VoidCoder.of(), input.getCoder()));
-
-     Combine.PerKey<Void, InputT, OutputT> combine =
-         Combine.<Void, InputT, OutputT>fewKeys(fn.asKeyedFn());
-     if (!sideInputs.isEmpty()) {
-       combine = combine.withSideInputs(sideInputs);
-     }
-
-     PCollection<KV<Void, OutputT>> combined;
-     if (fanout >= 2) {
-       // NOTE(review): PerKey.withHotKeyFanout builds its result from name, fn and the
-       // fanout function only, so side inputs set above do not reach the fanout path.
-       combined = withKeys.apply(combine.withHotKeyFanout(fanout));
-     } else {
-       combined = withKeys.apply(combine);
-     }
-
-     PCollection<OutputT> output = combined.apply(Values.<OutputT>create());
-
-     if (insertDefault) {
-       if (!output.getWindowingStrategy().getWindowFn().isCompatible(new GlobalWindows())) {
-         throw new IllegalStateException(fn.getIncompatibleGlobalWindowErrorMessage());
-       }
-       return insertDefaultValueIfEmpty(output);
-     } else {
-       return output;
-     }
-   }
-
-   /**
-    * Returns {@code maybeEmpty} flattened with a collection that holds
-    * {@code fn.defaultValue()} only if {@code maybeEmpty}, read as a side input, has no
-    * elements.
-    */
-   private PCollection<OutputT> insertDefaultValueIfEmpty(PCollection<OutputT> maybeEmpty) {
-     final PCollectionView<Iterable<OutputT>> maybeEmptyView = maybeEmpty.apply(
-         View.<OutputT>asIterable());
-
-     final OutputT defaultValue = fn.defaultValue();
-     // A single null element drives one DoFn invocation that inspects the side input.
-     PCollection<OutputT> defaultIfEmpty = maybeEmpty.getPipeline()
-         .apply("CreateVoid", Create.of((Void) null).withCoder(VoidCoder.of()))
-         .apply(ParDo.named("ProduceDefault").withSideInputs(maybeEmptyView).of(
-             new DoFn<Void, OutputT>() {
-               @Override
-               public void processElement(DoFn<Void, OutputT>.ProcessContext c) {
-                 Iterator<OutputT> combined = c.sideInput(maybeEmptyView).iterator();
-                 if (!combined.hasNext()) {
-                   c.output(defaultValue);
-                 }
-               }
-             }))
-         .setCoder(maybeEmpty.getCoder())
-         .setWindowingStrategyInternal(maybeEmpty.getWindowingStrategy());
-
-     return PCollectionList.of(maybeEmpty).and(defaultIfEmpty)
-         .apply(Flatten.<OutputT>pCollections());
-   }
- }
-
- /**
- * {@code Combine.GloballyAsSingletonView<InputT, OutputT>} takes a {@code PCollection<InputT>}
- * and returns a {@code PCollectionView<OutputT>} whose elements are the result of
- * combining all the elements in each window of the input {@code PCollection},
- * using a specified {@link CombineFn CombineFn<InputT, AccumT, OutputT>}.
- * It is common for {@code InputT == OutputT}, but not required. Common combining
- * functions include sums, mins, maxes, and averages of numbers,
- * conjunctions and disjunctions of booleans, statistical
- * aggregations, etc.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<Integer> pc = ...;
- * PCollectionView<Integer> sumView = pc.apply(
- * Combine.globally(new Sum.SumIntegerFn()).asSingletonView());
- * } </pre>
- *
- * <p>Combining can happen in parallel, with different subsets of the
- * input {@code PCollection} being combined separately, and their
- * intermediate results combined further, in an arbitrary tree
- * reduction pattern, until a single result value is produced.
- *
- * <p>If a value is requested from the view for a window that is not present
- * and {@code insertDefault} is true, the result of calling the {@code CombineFn}
- * on empty input will be returned. If {@code insertDefault} is false, an
- * exception will be thrown instead.
- *
- * <p>By default, the {@code Coder} of the output {@code PValue<OutputT>}
- * is inferred from the concrete type of the
- * {@code CombineFn<InputT, AccumT, OutputT>}'s output type {@code OutputT}.
- *
- * <p>See also {@link #perKey}/{@link PerKey Combine.PerKey} and
- * {@link #groupedValues}/{@link GroupedValues Combine.GroupedValues}, which
- * are useful for combining values associated with each key in
- * a {@code PCollection} of {@code KV}s.
- *
- * @param <InputT> type of input values
- * @param <OutputT> type of output values
- */
- public static class GloballyAsSingletonView<InputT, OutputT>
-     extends PTransform<PCollection<InputT>, PCollectionView<OutputT>> {
-
-   /** Combining function used for the global combine and for the view's default value. */
-   private final GlobalCombineFn<? super InputT, ?, OutputT> fn;
-
-   /** Whether the resulting singleton view is given {@code fn.defaultValue()} as default. */
-   private final boolean insertDefault;
-
-   /** Fanout passed through to the global combine. */
-   private final int fanout;
-
-   private GloballyAsSingletonView(
-       GlobalCombineFn<? super InputT, ?, OutputT> fn, boolean insertDefault, int fanout) {
-     this.fn = fn;
-     this.insertDefault = insertDefault;
-     this.fanout = fanout;
-   }
-
-   /**
-    * Combines the input globally without defaults and with the configured fanout, then
-    * exposes the result as a singleton view, with a default value when requested.
-    */
-   @Override
-   public PCollectionView<OutputT> apply(PCollection<InputT> input) {
-     PCollection<OutputT> combined = input.apply(
-         Combine.<InputT, OutputT>globally(fn).withoutDefaults().withFanout(fanout));
-     if (!insertDefault) {
-       return combined.apply(View.<OutputT>asSingleton());
-     }
-     return combined.apply(View.<OutputT>asSingleton().withDefaultValue(fn.defaultValue()));
-   }
-
-   /** Returns the fanout passed to the global combine. */
-   public int getFanout() {
-     return fanout;
-   }
-
-   /** Returns whether the view is created with a default value. */
-   public boolean getInsertDefault() {
-     return insertDefault;
-   }
-
-   /** Returns the combining function used by this transform. */
-   public GlobalCombineFn<? super InputT, ?, OutputT> getCombineFn() {
-     return fn;
-   }
- }
-
- /**
- * Converts a {@link SerializableFunction} from {@code Iterable<V>}s
- * to {@code V}s into a simple {@link CombineFn} over {@code V}s.
- *
- * <p>Used in the implementation of convenience methods like
- * {@link #globally(SerializableFunction)},
- * {@link #perKey(SerializableFunction)}, and
- * {@link #groupedValues(SerializableFunction)}.
- */
- public static class IterableCombineFn<V> extends CombineFn<V, List<V>, V> {
- /**
- * Returns a {@code CombineFn} that uses the given
- * {@code SerializableFunction} to combine values.
- */
- public static <V> IterableCombineFn<V> of(
- SerializableFunction<Iterable<V>, V> combiner) {
- return of(combiner, DEFAULT_BUFFER_SIZE);
- }
-
- /**
- * Returns a {@code CombineFn} that uses the given
- * {@code SerializableFunction} to combine values,
- * attempting to buffer at least {@code bufferSize}
- * values between invocations.
- */
- public static <V> IterableCombineFn<V> of(
- SerializableFunction<Iterable<V>, V> combiner, int bufferSize) {
- return new IterableCombineFn<>(combiner, bufferSize);
- }
-
- private static final int DEFAULT_BUFFER_SIZE = 20;
-
- /** The combiner function. */
- private final SerializableFunction<Iterable<V>, V> combiner;
-
- /**
- * The number of values to accumulate before invoking the combiner
- * function to combine them.
- */
- private final int bufferSize;
-
- private IterableCombineFn(
- SerializableFunction<Iterable<V>, V> combiner, int bufferSize) {
- this.combiner = combiner;
- this.bufferSize = bufferSize;
- }
-
- @Override
- public List<V> createAccumulator() {
- return new ArrayList<>();
- }
-
- @Override
- public List<V> addInput(List<V> accumulator, V input) {
- accumulator.add(input);
- if (accumulator.size() > bufferSize) {
- return mergeToSingleton(accumulator);
- } else {
- return accumulator;
- }
- }
-
- @Override
- public List<V> mergeAccumulators(Iterable<List<V>> accumulators) {
- return mergeToSingleton(Iterables.concat(accumulators));
- }
-
- @Override
- public V extractOutput(List<V> accumulator) {
- return combiner.apply(accumulator);
- }
-
- @Override
- public List<V> compact(List<V> accumulator) {
- return accumulator.size() > 1 ? mergeToSingleton(accumulator) : accumulator;
- }
-
- private List<V> mergeToSingleton(Iterable<V> values) {
- List<V> singleton = new ArrayList<>();
- singleton.add(combiner.apply(values));
- return singleton;
- }
- }
-
- /**
- * Converts a {@link SerializableFunction} from {@code Iterable<V>}s
- * to {@code V}s into a simple {@link CombineFn} over {@code V}s.
- *
- * @deprecated Use {@link IterableCombineFn} or the more space efficient
- * {@link BinaryCombineFn} instead (which avoids buffering values).
- */
- @Deprecated
- public static class SimpleCombineFn<V> extends IterableCombineFn<V> {
-
-   /** Builds the function with the default buffer size of {@link IterableCombineFn}. */
-   protected SimpleCombineFn(SerializableFunction<Iterable<V>, V> combiner) {
-     super(combiner, IterableCombineFn.DEFAULT_BUFFER_SIZE);
-   }
-
-   /**
-    * Creates a {@code CombineFn} that combines values with the given
-    * {@code SerializableFunction}.
-    */
-   @Deprecated
-   public static <V> SimpleCombineFn<V> of(
-       SerializableFunction<Iterable<V>, V> combiner) {
-     return new SimpleCombineFn<V>(combiner);
-   }
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * {@code PerKey<K, InputT, OutputT>} takes a
- * {@code PCollection<KV<K, InputT>>}, groups it by key, applies a
- * combining function to the {@code InputT} values associated with each
- * key to produce a combined {@code OutputT} value, and returns a
- * {@code PCollection<KV<K, OutputT>>} representing a map from each
- * distinct key of the input {@code PCollection} to the corresponding
- * combined value. {@code InputT} and {@code OutputT} are often the same.
- *
- * <p>This is a concise shorthand for an application of
- * {@link GroupByKey} followed by an application of
- * {@link GroupedValues Combine.GroupedValues}. See those
- * operations for more details on how keys are compared for equality
- * and on the default {@code Coder} for the output.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<KV<String, Double>> salesRecords = ...;
- * PCollection<KV<String, Double>> totalSalesPerPerson =
- * salesRecords.apply(Combine.<String, Double>perKey(
- * new Sum.SumDoubleFn()));
- * } </pre>
- *
- * <p>Each output element is in the window by which its corresponding input
- * was grouped, and has the timestamp of the end of that window. The output
- * {@code PCollection} has the same
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * as the input.
- *
- * @param <K> the type of the keys of the input and output
- * {@code PCollection}s
- * @param <InputT> the type of the values of the input {@code PCollection}
- * @param <OutputT> the type of the values of the output {@code PCollection}
- */
- public static class PerKey<K, InputT, OutputT>
-     extends PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, OutputT>>> {
-
-   /** Combining function applied to the grouped values of each key. */
-   private final transient PerKeyCombineFn<? super K, ? super InputT, ?, OutputT> fn;
-
-   /** Passed to {@code GroupByKey.create} when the grouping step is built. */
-   private final boolean fewKeys;
-
-   /** Side inputs forwarded to the grouped-values combine; empty unless set explicitly. */
-   private final List<PCollectionView<?>> sideInputs;
-
-   private PerKey(
-       PerKeyCombineFn<? super K, ? super InputT, ?, OutputT> fn, boolean fewKeys) {
-     this.fn = fn;
-     this.fewKeys = fewKeys;
-     this.sideInputs = ImmutableList.of();
-   }
-
-   private PerKey(String name,
-       PerKeyCombineFn<? super K, ? super InputT, ?, OutputT> fn,
-       boolean fewKeys, List<PCollectionView<?>> sideInputs) {
-     super(name);
-     this.fn = fn;
-     this.fewKeys = fewKeys;
-     this.sideInputs = sideInputs;
-   }
-
-   private PerKey(
-       String name, PerKeyCombineFn<? super K, ? super InputT, ?, OutputT> fn,
-       boolean fewKeys) {
-     super(name);
-     this.fn = fn;
-     this.fewKeys = fewKeys;
-     this.sideInputs = ImmutableList.of();
-   }
-
-   /**
-    * Return a new {@code PerKey} transform that's like this transform but with the
-    * specified name. Does not modify this transform.
-    *
-    * <p>Side inputs configured through {@link #withSideInputs} are carried over.
-    */
-   public PerKey<K, InputT, OutputT> named(String name) {
-     return new PerKey<K, InputT, OutputT>(name, fn, fewKeys, sideInputs);
-   }
-
-   /**
-    * Returns a {@link PTransform} identical to this, but with the specified side inputs to use
-    * in {@link KeyedCombineFnWithContext}.
-    *
-    * @throws IllegalStateException if the combining function is not a
-    *     {@code RequiresContextInternal}
-    */
-   public PerKey<K, InputT, OutputT> withSideInputs(
-       Iterable<? extends PCollectionView<?>> sideInputs) {
-     Preconditions.checkState(fn instanceof RequiresContextInternal);
-     return new PerKey<K, InputT, OutputT>(name, fn, fewKeys,
-         ImmutableList.<PCollectionView<?>>copyOf(sideInputs));
-   }
-
-   /**
-    * If a single key has disproportionately many values, it may become a
-    * bottleneck, especially in streaming mode. This returns a new per-key
-    * combining transform that inserts an intermediate node to combine "hot"
-    * keys partially before performing the full combine.
-    *
-    * <p>NOTE(review): the returned transform is built from the name, the combining
-    * function and {@code hotKeyFanout} only; side inputs of this transform are not
-    * passed along.
-    *
-    * @param hotKeyFanout a function from keys to an integer N, where the key
-    * will be spread among N intermediate nodes for partial combining.
-    * If N is less than or equal to 1, this key will not be sent through an
-    * intermediate node.
-    */
-   public PerKeyWithHotKeyFanout<K, InputT, OutputT> withHotKeyFanout(
-       SerializableFunction<? super K, Integer> hotKeyFanout) {
-     return new PerKeyWithHotKeyFanout<K, InputT, OutputT>(name, fn, hotKeyFanout);
-   }
-
-   /**
-    * Like {@link #withHotKeyFanout(SerializableFunction)}, but returning the given
-    * constant value for every key.
-    */
-   public PerKeyWithHotKeyFanout<K, InputT, OutputT> withHotKeyFanout(final int hotKeyFanout) {
-     return new PerKeyWithHotKeyFanout<K, InputT, OutputT>(name, fn,
-         new SerializableFunction<K, Integer>() {
-           @Override
-           public Integer apply(K unused) {
-             return hotKeyFanout;
-           }
-         });
-   }
-
-   /**
-    * Returns the {@link PerKeyCombineFn} used by this Combine operation.
-    */
-   public PerKeyCombineFn<? super K, ? super InputT, ?, OutputT> getFn() {
-     return fn;
-   }
-
-   /**
-    * Returns the side inputs used by this Combine operation.
-    */
-   public List<PCollectionView<?>> getSideInputs() {
-     return sideInputs;
-   }
-
-   /** Groups the input by key and then combines the grouped values of each key. */
-   @Override
-   public PCollection<KV<K, OutputT>> apply(PCollection<KV<K, InputT>> input) {
-     return input
-         .apply(GroupByKey.<K, InputT>create(fewKeys))
-         .apply(Combine.<K, InputT, OutputT>groupedValues(fn).withSideInputs(sideInputs));
-   }
- }
-
- /**
- * Like {@link PerKey}, but sharding the combining of hot keys.
- */
- public static class PerKeyWithHotKeyFanout<K, InputT, OutputT>
-     extends PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, OutputT>>> {
-
-   /** Combining function that is split into a pre-combine and a post-combine. */
-   private final transient PerKeyCombineFn<? super K, ? super InputT, ?, OutputT> fn;
-
-   /** Maps a key to the number of intermediate shards its values are spread over. */
-   private final SerializableFunction<? super K, Integer> hotKeyFanout;
-
-   private PerKeyWithHotKeyFanout(String name,
-       PerKeyCombineFn<? super K, ? super InputT, ?, OutputT> fn,
-       SerializableFunction<? super K, Integer> hotKeyFanout) {
-     super(name);
-     this.fn = fn;
-     this.hotKeyFanout = hotKeyFanout;
-   }
-
-   @Override
-   public PCollection<KV<K, OutputT>> apply(PCollection<KV<K, InputT>> input) {
-     return applyHelper(input);
-   }
-
-   /**
-    * Builds the two-level combine: hot keys are tagged with a nonce and pre-combined per
-    * (key, nonce), cold keys are passed through, and both paths are flattened and
-    * combined per original key.
-    *
-    * @throws IllegalStateException if the input coder is not a {@code KvCoder} or the
-    *     accumulator coder cannot be determined
-    */
-   private <AccumT> PCollection<KV<K, OutputT>> applyHelper(PCollection<KV<K, InputT>> input) {
-
-     // Name the accumulator type.
-     @SuppressWarnings("unchecked")
-     final PerKeyCombineFn<K, InputT, AccumT, OutputT> typedFn =
-         (PerKeyCombineFn<K, InputT, AccumT, OutputT>) this.fn;
-
-     if (!(input.getCoder() instanceof KvCoder)) {
-       throw new IllegalStateException(
-           "Expected input coder to be KvCoder, but was " + input.getCoder());
-     }
-
-     @SuppressWarnings("unchecked")
-     final KvCoder<K, InputT> inputCoder = (KvCoder<K, InputT>) input.getCoder();
-     final Coder<AccumT> accumCoder;
-
-     try {
-       accumCoder = typedFn.getAccumulatorCoder(
-           input.getPipeline().getCoderRegistry(),
-           inputCoder.getKeyCoder(), inputCoder.getValueCoder());
-     } catch (CannotProvideCoderException e) {
-       throw new IllegalStateException("Unable to determine accumulator coder.", e);
-     }
-     Coder<InputOrAccum<InputT, AccumT>> inputOrAccumCoder =
-         new InputOrAccum.InputOrAccumCoder<InputT, AccumT>(
-             inputCoder.getValueCoder(), accumCoder);
-
-     // A CombineFn's mergeAccumulator can be applied in a tree-like fashion.
-     // Here we shard the key using an integer nonce, combine on that partial
-     // set of values, then drop the nonce and do a final combine of the
-     // aggregates. We do this by splitting the original CombineFn into two,
-     // one that does addInput + merge and another that does merge + extract.
-     PerKeyCombineFn<KV<K, Integer>, InputT, AccumT, AccumT> hotPreCombine;
-     PerKeyCombineFn<K, InputOrAccum<InputT, AccumT>, AccumT, OutputT> postCombine;
-     if (!(typedFn instanceof RequiresContextInternal)) {
-       final KeyedCombineFn<K, InputT, AccumT, OutputT> keyedFn =
-           (KeyedCombineFn<K, InputT, AccumT, OutputT>) typedFn;
-       hotPreCombine =
-           new KeyedCombineFn<KV<K, Integer>, InputT, AccumT, AccumT>() {
-             @Override
-             public AccumT createAccumulator(KV<K, Integer> key) {
-               return keyedFn.createAccumulator(key.getKey());
-             }
-             @Override
-             public AccumT addInput(KV<K, Integer> key, AccumT accumulator, InputT value) {
-               return keyedFn.addInput(key.getKey(), accumulator, value);
-             }
-             @Override
-             public AccumT mergeAccumulators(
-                 KV<K, Integer> key, Iterable<AccumT> accumulators) {
-               return keyedFn.mergeAccumulators(key.getKey(), accumulators);
-             }
-             @Override
-             public AccumT compact(KV<K, Integer> key, AccumT accumulator) {
-               return keyedFn.compact(key.getKey(), accumulator);
-             }
-             @Override
-             public AccumT extractOutput(KV<K, Integer> key, AccumT accumulator) {
-               // The pre-combine hands the raw accumulator on to the post-combine.
-               return accumulator;
-             }
-             @Override
-             @SuppressWarnings("unchecked")
-             public Coder<AccumT> getAccumulatorCoder(
-                 CoderRegistry registry, Coder<KV<K, Integer>> keyCoder, Coder<InputT> inputCoder)
-                 throws CannotProvideCoderException {
-               return accumCoder;
-             }
-           };
-       postCombine =
-           new KeyedCombineFn<K, InputOrAccum<InputT, AccumT>, AccumT, OutputT>() {
-             @Override
-             public AccumT createAccumulator(K key) {
-               return keyedFn.createAccumulator(key);
-             }
-             @Override
-             public AccumT addInput(
-                 K key, AccumT accumulator, InputOrAccum<InputT, AccumT> value) {
-               // Cold-path elements carry a raw input, hot-path elements an accumulator.
-               if (value.accum == null) {
-                 return keyedFn.addInput(key, accumulator, value.input);
-               } else {
-                 return keyedFn.mergeAccumulators(key, ImmutableList.of(accumulator, value.accum));
-               }
-             }
-             @Override
-             public AccumT mergeAccumulators(K key, Iterable<AccumT> accumulators) {
-               return keyedFn.mergeAccumulators(key, accumulators);
-             }
-             @Override
-             public AccumT compact(K key, AccumT accumulator) {
-               return keyedFn.compact(key, accumulator);
-             }
-             @Override
-             public OutputT extractOutput(K key, AccumT accumulator) {
-               return keyedFn.extractOutput(key, accumulator);
-             }
-             @Override
-             public Coder<OutputT> getDefaultOutputCoder(
-                 CoderRegistry registry,
-                 Coder<K> keyCoder,
-                 Coder<InputOrAccum<InputT, AccumT>> accumulatorCoder)
-                 throws CannotProvideCoderException {
-               // Uses the enclosing method's KvCoder, i.e. the coder of the original values.
-               return keyedFn.getDefaultOutputCoder(
-                   registry, keyCoder, inputCoder.getValueCoder());
-             }
-
-             @Override
-             public Coder<AccumT> getAccumulatorCoder(CoderRegistry registry, Coder<K> keyCoder,
-                 Coder<InputOrAccum<InputT, AccumT>> inputCoder)
-                 throws CannotProvideCoderException {
-               return accumCoder;
-             }
-           };
-     } else {
-       final KeyedCombineFnWithContext<K, InputT, AccumT, OutputT> keyedFnWithContext =
-           (KeyedCombineFnWithContext<K, InputT, AccumT, OutputT>) typedFn;
-       hotPreCombine =
-           new KeyedCombineFnWithContext<KV<K, Integer>, InputT, AccumT, AccumT>() {
-             @Override
-             public AccumT createAccumulator(KV<K, Integer> key, Context c) {
-               return keyedFnWithContext.createAccumulator(key.getKey(), c);
-             }
-
-             @Override
-             public AccumT addInput(
-                 KV<K, Integer> key, AccumT accumulator, InputT value, Context c) {
-               return keyedFnWithContext.addInput(key.getKey(), accumulator, value, c);
-             }
-
-             @Override
-             public AccumT mergeAccumulators(
-                 KV<K, Integer> key, Iterable<AccumT> accumulators, Context c) {
-               return keyedFnWithContext.mergeAccumulators(key.getKey(), accumulators, c);
-             }
-
-             @Override
-             public AccumT compact(KV<K, Integer> key, AccumT accumulator, Context c) {
-               return keyedFnWithContext.compact(key.getKey(), accumulator, c);
-             }
-
-             @Override
-             public AccumT extractOutput(KV<K, Integer> key, AccumT accumulator, Context c) {
-               // The pre-combine hands the raw accumulator on to the post-combine.
-               return accumulator;
-             }
-
-             @Override
-             @SuppressWarnings("unchecked")
-             public Coder<AccumT> getAccumulatorCoder(
-                 CoderRegistry registry, Coder<KV<K, Integer>> keyCoder, Coder<InputT> inputCoder)
-                 throws CannotProvideCoderException {
-               return accumCoder;
-             }
-           };
-       postCombine =
-           new KeyedCombineFnWithContext<K, InputOrAccum<InputT, AccumT>, AccumT, OutputT>() {
-             @Override
-             public AccumT createAccumulator(K key, Context c) {
-               return keyedFnWithContext.createAccumulator(key, c);
-             }
-             @Override
-             public AccumT addInput(
-                 K key, AccumT accumulator, InputOrAccum<InputT, AccumT> value, Context c) {
-               // Cold-path elements carry a raw input, hot-path elements an accumulator.
-               if (value.accum == null) {
-                 return keyedFnWithContext.addInput(key, accumulator, value.input, c);
-               } else {
-                 return keyedFnWithContext.mergeAccumulators(
-                     key, ImmutableList.of(accumulator, value.accum), c);
-               }
-             }
-             @Override
-             public AccumT mergeAccumulators(K key, Iterable<AccumT> accumulators, Context c) {
-               return keyedFnWithContext.mergeAccumulators(key, accumulators, c);
-             }
-             @Override
-             public AccumT compact(K key, AccumT accumulator, Context c) {
-               return keyedFnWithContext.compact(key, accumulator, c);
-             }
-             @Override
-             public OutputT extractOutput(K key, AccumT accumulator, Context c) {
-               return keyedFnWithContext.extractOutput(key, accumulator, c);
-             }
-             @Override
-             public Coder<OutputT> getDefaultOutputCoder(
-                 CoderRegistry registry,
-                 Coder<K> keyCoder,
-                 Coder<InputOrAccum<InputT, AccumT>> accumulatorCoder)
-                 throws CannotProvideCoderException {
-               // Uses the enclosing method's KvCoder, i.e. the coder of the original values.
-               return keyedFnWithContext.getDefaultOutputCoder(
-                   registry, keyCoder, inputCoder.getValueCoder());
-             }
-
-             @Override
-             public Coder<AccumT> getAccumulatorCoder(CoderRegistry registry, Coder<K> keyCoder,
-                 Coder<InputOrAccum<InputT, AccumT>> inputCoder)
-                 throws CannotProvideCoderException {
-               return accumCoder;
-             }
-           };
-     }
-
-     // Use the provided hotKeyFanout fn to split into "hot" and "cold" keys,
-     // augmenting the hot keys with a nonce.
-     final TupleTag<KV<KV<K, Integer>, InputT>> hot = new TupleTag<>();
-     final TupleTag<KV<K, InputT>> cold = new TupleTag<>();
-     PCollectionTuple split = input.apply(
-         ParDo.named("AddNonce").of(
-             new DoFn<KV<K, InputT>, KV<K, InputT>>() {
-               transient int counter;
-               @Override
-               public void startBundle(Context c) {
-                 counter = ThreadLocalRandom.current().nextInt(
-                     Integer.MAX_VALUE);
-               }
-
-               @Override
-               public void processElement(ProcessContext c) {
-                 KV<K, InputT> kv = c.element();
-                 int spread = Math.max(1, hotKeyFanout.apply(kv.getKey()));
-                 if (spread <= 1) {
-                   c.output(kv);
-                 } else {
-                   // The counter starts at a random value and may wrap past
-                   // Integer.MAX_VALUE within a bundle. Clearing the sign bit keeps
-                   // the nonce in [0, spread) so a key never uses more than
-                   // 'spread' shards.
-                   int nonce = (counter++ & Integer.MAX_VALUE) % spread;
-                   c.sideOutput(hot, KV.of(KV.of(kv.getKey(), nonce), kv.getValue()));
-                 }
-               }
-             })
-         .withOutputTags(cold, TupleTagList.of(hot)));
-
-     // The first level of combine should never use accumulating mode.
-     WindowingStrategy<?, ?> preCombineStrategy = input.getWindowingStrategy();
-     if (preCombineStrategy.getMode()
-         == WindowingStrategy.AccumulationMode.ACCUMULATING_FIRED_PANES) {
-       preCombineStrategy = preCombineStrategy.withMode(
-           WindowingStrategy.AccumulationMode.DISCARDING_FIRED_PANES);
-     }
-
-     // Combine the hot and cold keys separately.
-     PCollection<KV<K, InputOrAccum<InputT, AccumT>>> precombinedHot = split
-         .get(hot)
-         .setCoder(KvCoder.of(KvCoder.of(inputCoder.getKeyCoder(), VarIntCoder.of()),
-             inputCoder.getValueCoder()))
-         .setWindowingStrategyInternal(preCombineStrategy)
-         .apply("PreCombineHot", Combine.perKey(hotPreCombine))
-         .apply(ParDo.named("StripNonce").of(
-             new DoFn<KV<KV<K, Integer>, AccumT>,
-                 KV<K, InputOrAccum<InputT, AccumT>>>() {
-               @Override
-               public void processElement(ProcessContext c) {
-                 c.output(KV.of(
-                     c.element().getKey().getKey(),
-                     InputOrAccum.<InputT, AccumT>accum(c.element().getValue())));
-               }
-             }))
-         .setCoder(KvCoder.of(inputCoder.getKeyCoder(), inputOrAccumCoder))
-         .apply(Window.<KV<K, InputOrAccum<InputT, AccumT>>>remerge())
-         .setWindowingStrategyInternal(input.getWindowingStrategy());
-     PCollection<KV<K, InputOrAccum<InputT, AccumT>>> preprocessedCold = split
-         .get(cold)
-         .setCoder(inputCoder)
-         .apply(ParDo.named("PrepareCold").of(
-             new DoFn<KV<K, InputT>, KV<K, InputOrAccum<InputT, AccumT>>>() {
-               @Override
-               public void processElement(ProcessContext c) {
-                 c.output(KV.of(c.element().getKey(),
-                     InputOrAccum.<InputT, AccumT>input(c.element().getValue())));
-               }
-             }))
-         .setCoder(KvCoder.of(inputCoder.getKeyCoder(), inputOrAccumCoder));
-
-     // Combine the union of the pre-processed hot and cold key results.
-     return PCollectionList.of(precombinedHot).and(preprocessedCold)
-         .apply(Flatten.<KV<K, InputOrAccum<InputT, AccumT>>>pCollections())
-         .apply("PostCombine", Combine.perKey(postCombine));
-   }
-
-   /**
-    * Used to store either an input or accumulator value, for flattening
-    * the hot and cold key paths.
-    */
-   private static class InputOrAccum<InputT, AccumT> {
-     public final InputT input;
-     public final AccumT accum;
-
-     private InputOrAccum(InputT input, AccumT aggr) {
-       this.input = input;
-       this.accum = aggr;
-     }
-
-     /** Wraps a raw input value; the accumulator slot is left {@code null}. */
-     public static <InputT, AccumT> InputOrAccum<InputT, AccumT> input(InputT input) {
-       return new InputOrAccum<InputT, AccumT>(input, null);
-     }
-
-     /** Wraps an accumulator; the input slot is left {@code null}. */
-     public static <InputT, AccumT> InputOrAccum<InputT, AccumT> accum(AccumT aggr) {
-       return new InputOrAccum<InputT, AccumT>(null, aggr);
-     }
-
-     /**
-      * Coder that writes one tag byte (0 for an input, 1 for an accumulator) followed
-      * by the value encoded with the matching component coder.
-      */
-     private static class InputOrAccumCoder<InputT, AccumT>
-         extends StandardCoder<InputOrAccum<InputT, AccumT>> {
-
-       private final Coder<InputT> inputCoder;
-       private final Coder<AccumT> accumCoder;
-
-       public InputOrAccumCoder(Coder<InputT> inputCoder, Coder<AccumT> accumCoder) {
-         this.inputCoder = inputCoder;
-         this.accumCoder = accumCoder;
-       }
-
-       @JsonCreator
-       @SuppressWarnings({"rawtypes", "unchecked"})
-       public static <InputT, AccumT> InputOrAccumCoder<InputT, AccumT> of(
-           @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
-           List<Coder<?>> elementCoders) {
-         return new InputOrAccumCoder(elementCoders.get(0), elementCoders.get(1));
-       }
-
-       @Override
-       public void encode(
-           InputOrAccum<InputT, AccumT> value, OutputStream outStream, Coder.Context context)
-           throws CoderException, IOException {
-         // NOTE(review): a wrapped null input takes the accumulator branch here, while
-         // the post-combine decides on 'accum == null' -- confirm null inputs cannot occur.
-         if (value.input != null) {
-           outStream.write(0);
-           inputCoder.encode(value.input, outStream, context);
-         } else {
-           outStream.write(1);
-           accumCoder.encode(value.accum, outStream, context);
-         }
-       }
-
-       @Override
-       public InputOrAccum<InputT, AccumT> decode(InputStream inStream, Coder.Context context)
-           throws CoderException, IOException {
-         // Tag 0 marks an input; any other tag value is read as an accumulator.
-         if (inStream.read() == 0) {
-           return InputOrAccum.<InputT, AccumT>input(inputCoder.decode(inStream, context));
-         } else {
-           return InputOrAccum.<InputT, AccumT>accum(accumCoder.decode(inStream, context));
-         }
-       }
-
-       @Override
-       public List<? extends Coder<?>> getCoderArguments() {
-         return ImmutableList.of(inputCoder, accumCoder);
-       }
-
-       @Override
-       public void verifyDeterministic() throws Coder.NonDeterministicException {
-         inputCoder.verifyDeterministic();
-         accumCoder.verifyDeterministic();
-       }
-     }
-   }
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * {@code GroupedValues<K, InputT, OutputT>} takes a
- * {@code PCollection<KV<K, Iterable<InputT>>>}, such as the result of
- * {@link GroupByKey}, applies a specified
- * {@link KeyedCombineFn KeyedCombineFn<K, InputT, AccumT, OutputT>}
- * to each of the input {@code KV<K, Iterable<InputT>>} elements to
- * produce a combined output {@code KV<K, OutputT>} element, and returns a
- * {@code PCollection<KV<K, OutputT>>} containing all the combined output
- * elements. It is common for {@code InputT == OutputT}, but not required.
- * Common combining functions include sums, mins, maxes, and averages
- * of numbers, conjunctions and disjunctions of booleans, statistical
- * aggregations, etc.
- *
- * <p>Example of use:
- * <pre> {@code
- * PCollection<KV<String, Integer>> pc = ...;
- * PCollection<KV<String, Iterable<Integer>>> groupedByKey = pc.apply(
- * GroupByKey.<String, Integer>create());
- * PCollection<KV<String, Integer>> sumByKey = groupedByKey.apply(
- * Combine.<String, Integer>groupedValues(
- * new Sum.SumIntegerFn()));
- * } </pre>
- *
- * <p>See also {@link #perKey}/{@link PerKey Combine.PerKey}, which
- * captures the common pattern of "combining by key" in a
- * single easy-to-use {@code PTransform}.
- *
- * <p>Combining for different keys can happen in parallel. Moreover,
- * combining of the {@code Iterable<InputT>} values associated with a single
- * key can happen in parallel, with different subsets of the values
- * being combined separately, and their intermediate results combined
- * further, in an arbitrary tree reduction pattern, until a single
- * result value is produced for each key.
- *
- * <p>By default, the {@code Coder} of the keys of the output
- * {@code PCollection<KV<K, OutputT>>} is that of the keys of the input
- * {@code PCollection<KV<K, InputT>>}, and the {@code Coder} of the values
- * of the output {@code PCollection<KV<K, OutputT>>} is inferred from the
- * concrete type of the {@code KeyedCombineFn<K, InputT, AccumT, OutputT>}'s output
- * type {@code OutputT}.
- *
- * <p>Each output element has the same timestamp and is in the same window
- * as its corresponding input element, and the output
- * {@code PCollection} has the same
- * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
- * associated with it as the input.
- *
- * <p>See also {@link #globally}/{@link Globally Combine.Globally}, which
- * combines all the values in a {@code PCollection} into a
- * single value in a {@code PCollection}.
- *
- * @param <K> type of input and output keys
- * @param <InputT> type of input values
- * @param <OutputT> type of output values
- */
- public static class GroupedValues<K, InputT, OutputT>
- extends PTransform
- <PCollection<? extends KV<K, ? extends Iterable<InputT>>>,
- PCollection<KV<K, OutputT>>> {
-
- private final PerKeyCombineFn<? super K, ? super InputT, ?, OutputT> fn;
- private final List<PCollectionView<?>> sideInputs;
-
- private GroupedValues(PerKeyCombineFn<? super K, ? super InputT, ?, OutputT> fn) {
- this.fn = SerializableUtils.clone(fn);
- this.sideInputs = ImmutableList.<PCollectionView<?>>of();
- }
-
- private GroupedValues(
- PerKeyCombineFn<? super K, ? super InputT, ?, OutputT> fn,
- List<PCollectionView<?>> sideInputs) {
- this.fn = SerializableUtils.clone(fn);
- this.sideInputs = sideInputs;
- }
-
- public GroupedValues<K, InputT, OutputT> withSideInputs(
- Iterable<? extends PCollectionView<?>> sideInputs) {
- return new GroupedValues<>(fn, ImmutableList.<PCollectionView<?>>copyOf(sideInputs));
- }
-
- /**
- * Returns the KeyedCombineFn used by this Combine operation.
- */
- public PerKeyCombineFn<? super K, ? super InputT, ?, OutputT> getFn() {
- return fn;
- }
-
- public List<PCollectionView<?>> getSideInputs() {
- return sideInputs;
- }
-
- @Override
- public PCollection<KV<K, OutputT>> apply(
- PCollection<? extends KV<K, ? extends Iterable<InputT>>> input) {
-
- final PerKeyCombineFnRunner<? super K, ? super InputT, ?, OutputT> combineFnRunner =
- PerKeyCombineFnRunners.create(fn);
- PCollection<KV<K, OutputT>> output = input.apply(ParDo.of(
- new DoFn<KV<K, ? extends Iterable<InputT>>, KV<K, OutputT>>() {
- @Override
- public void processElement(ProcessContext c) {
- K key = c.element().getKey();
-
- c.output(KV.of(key, combineFnRunner.apply(key, c.element().getValue(), c)));
- }
- }).withSideInputs(sideInputs));
-
- try {
- Coder<KV<K, OutputT>> outputCoder = getDefaultOutputCoder(input);
- output.setCoder(outputCoder);
- } catch (CannotProvideCoderException exc) {
- // let coder inference happen later, if it can
- }
-
- return output;
- }
-
- /**
- * Returns the {@link CombineFn} bound to its coders.
- *
- * <p>For internal use.
- */
- public AppliedCombineFn<? super K, ? super InputT, ?, OutputT> getAppliedFn(
- CoderRegistry registry, Coder<? extends KV<K, ? extends Iterable<InputT>>> inputCoder,
- WindowingStrategy<?, ?> windowingStrategy) {
- KvCoder<K, InputT> kvCoder = getKvCoder(inputCoder);
- return AppliedCombineFn.withInputCoder(
- fn, registry, kvCoder, sideInputs, windowingStrategy);
- }
-
- private KvCoder<K, InputT> getKvCoder(
- Coder<? extends KV<K, ? extends Iterable<InputT>>> inputCoder) {
- if (!(inputCoder instanceof KvCoder)) {
- throw new IllegalStateException(
- "Combine.GroupedValues requires its input to use KvCoder");
- }
- @SuppressWarnings({"unchecked", "rawtypes"})
- KvCoder<K, ? extends Iterable<InputT>> kvCoder = (KvCoder) inputCoder;
- Coder<K> keyCoder = kvCoder.getKeyCoder();
- Coder<? extends Iterable<InputT>> kvValueCoder = kvCoder.getValueCoder();
- if (!(kvValueCoder instanceof IterableCoder)) {
- throw new IllegalStateException(
- "Combine.GroupedValues requires its input values to use "
- + "IterableCoder");
- }
- @SuppressWarnings("unchecked")
- IterableCoder<InputT> inputValuesCoder = (IterableCoder<InputT>) kvValueCoder;
- Coder<InputT> inputValueCoder = inputValuesCoder.getElemCoder();
- return KvCoder.of(keyCoder, inputValueCoder);
- }
-
- @Override
- public Coder<KV<K, OutputT>> getDefaultOutputCoder(
- PCollection<? extends KV<K, ? extends Iterable<InputT>>> input)
- throws CannotProvideCoderException {
- KvCoder<K, InputT> kvCoder = getKvCoder(input.getCoder());
- @SuppressWarnings("unchecked")
- Coder<OutputT> outputValueCoder =
- ((PerKeyCombineFn<K, InputT, ?, OutputT>) fn)
- .getDefaultOutputCoder(
- input.getPipeline().getCoderRegistry(),
- kvCoder.getKeyCoder(), kvCoder.getValueCoder());
- return KvCoder.of(kvCoder.getKeyCoder(), outputValueCoder);
- }
- }
-}
[07/67] [partial] incubator-beam git commit: Directory reorganization
Posted by dh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/UnownedInputStream.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/UnownedInputStream.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/UnownedInputStream.java
deleted file mode 100644
index 3d80230..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/UnownedInputStream.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.common.base.MoreObjects;
-
-import java.io.FilterInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- * An {@link InputStream} wrapper which protects against the user attempting to modify
- * the underlying stream by closing it or using mark.
- */
-public class UnownedInputStream extends FilterInputStream {
- public UnownedInputStream(InputStream delegate) {
- super(delegate);
- }
-
- @Override
- public void close() throws IOException {
- throw new UnsupportedOperationException("Caller does not own the underlying input stream "
- + " and should not call close().");
- }
-
- @Override
- public boolean equals(Object obj) {
- return obj instanceof UnownedInputStream
- && ((UnownedInputStream) obj).in.equals(in);
- }
-
- @Override
- public int hashCode() {
- return in.hashCode();
- }
-
- @SuppressWarnings("UnsynchronizedOverridesSynchronized")
- @Override
- public void mark(int readlimit) {
- throw new UnsupportedOperationException("Caller does not own the underlying input stream "
- + " and should not call mark().");
- }
-
- @Override
- public boolean markSupported() {
- return false;
- }
-
- @SuppressWarnings("UnsynchronizedOverridesSynchronized")
- @Override
- public void reset() throws IOException {
- throw new UnsupportedOperationException("Caller does not own the underlying input stream "
- + " and should not call reset().");
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(UnownedInputStream.class).add("in", in).toString();
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/UnownedOutputStream.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/UnownedOutputStream.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/UnownedOutputStream.java
deleted file mode 100644
index 29187a1..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/UnownedOutputStream.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.common.base.MoreObjects;
-
-import java.io.FilterOutputStream;
-import java.io.IOException;
-import java.io.OutputStream;
-
-/**
- * A {@link OutputStream} wrapper which protects against the user attempting to modify
- * the underlying stream by closing it.
- */
-public class UnownedOutputStream extends FilterOutputStream {
- public UnownedOutputStream(OutputStream delegate) {
- super(delegate);
- }
-
- @Override
- public void close() throws IOException {
- throw new UnsupportedOperationException("Caller does not own the underlying output stream "
- + " and should not call close().");
- }
-
- @Override
- public boolean equals(Object obj) {
- return obj instanceof UnownedOutputStream
- && ((UnownedOutputStream) obj).out.equals(out);
- }
-
- @Override
- public int hashCode() {
- return out.hashCode();
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(UnownedOutputStream.class).add("out", out).toString();
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/UploadIdResponseInterceptor.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/UploadIdResponseInterceptor.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/UploadIdResponseInterceptor.java
deleted file mode 100644
index da597e6..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/UploadIdResponseInterceptor.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.api.client.http.GenericUrl;
-import com.google.api.client.http.HttpResponse;
-import com.google.api.client.http.HttpResponseInterceptor;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-
-/**
- * Implements a response interceptor that logs the upload id if the upload
- * id header exists and it is the first request (does not have upload_id parameter in the request).
- * Only logs if debug level is enabled.
- */
-public class UploadIdResponseInterceptor implements HttpResponseInterceptor {
-
- private static final Logger LOG = LoggerFactory.getLogger(UploadIdResponseInterceptor.class);
- private static final String UPLOAD_ID_PARAM = "upload_id";
- private static final String UPLOAD_TYPE_PARAM = "uploadType";
- private static final String UPLOAD_HEADER = "X-GUploader-UploadID";
-
- @Override
- public void interceptResponse(HttpResponse response) throws IOException {
- if (!LOG.isDebugEnabled()) {
- return;
- }
- String uploadId = response.getHeaders().getFirstHeaderStringValue(UPLOAD_HEADER);
- if (uploadId == null) {
- return;
- }
-
- GenericUrl url = response.getRequest().getUrl();
- // The check for no upload id limits the output to one log line per upload.
- // The check for upload type makes sure this is an upload and not a read.
- if (url.get(UPLOAD_ID_PARAM) == null && url.get(UPLOAD_TYPE_PARAM) != null) {
- LOG.debug(
- "Upload ID for url {} on worker {} is {}",
- url,
- System.getProperty("worker_id"),
- uploadId);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/UserCodeException.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/UserCodeException.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/UserCodeException.java
deleted file mode 100644
index 9b9c7a5..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/UserCodeException.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import java.util.Arrays;
-import java.util.Objects;
-
-/**
- * An exception that was thrown in user-code. Sets the stack trace
- * from the first time execution enters user code down through the
- * rest of the user's stack frames until the exception is
- * reached.
- */
-public class UserCodeException extends RuntimeException {
-
- public static UserCodeException wrap(Throwable t) {
- if (t instanceof UserCodeException) {
- return (UserCodeException) t;
- }
-
- return new UserCodeException(t);
- }
-
- public static RuntimeException wrapIf(boolean condition, Throwable t) {
- if (condition) {
- return wrap(t);
- }
-
- if (t instanceof RuntimeException) {
- return (RuntimeException) t;
- }
-
- return new RuntimeException(t);
- }
-
- private UserCodeException(Throwable t) {
- super(t);
- truncateStackTrace(t);
- }
-
- /**
- * Truncates the {@code Throwable}'s stack trace to contain only user code,
- * removing all frames below.
- *
- * <p>This is to remove infrastructure noise below user code entry point. We do this
- * by finding common stack frames between the throwable's captured stack and that
- * of the current thread.
- */
- private void truncateStackTrace(Throwable t) {
-
- StackTraceElement[] currentStack = Thread.currentThread().getStackTrace();
- StackTraceElement[] throwableStack = t.getStackTrace();
-
- int currentStackSize = currentStack.length;
- int throwableStackSize = throwableStack.length;
-
- int commonFrames = 0;
- while (framesEqual(currentStack[currentStackSize - commonFrames - 1],
- throwableStack[throwableStackSize - commonFrames - 1])) {
- commonFrames++;
- if (commonFrames >= Math.min(currentStackSize, throwableStackSize)) {
- break;
- }
- }
-
- StackTraceElement[] truncatedStack = Arrays.copyOfRange(throwableStack, 0,
- throwableStackSize - commonFrames);
- t.setStackTrace(truncatedStack);
- }
-
- /**
- * Checks if two frames are equal; frames are considered equal if they point to the same method.
- */
- private boolean framesEqual(StackTraceElement frame1, StackTraceElement frame2) {
- boolean areEqual = Objects.equals(frame1.getClassName(), frame2.getClassName());
- areEqual &= Objects.equals(frame1.getMethodName(), frame2.getMethodName());
-
- return areEqual;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ValueWithRecordId.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ValueWithRecordId.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ValueWithRecordId.java
deleted file mode 100644
index ac1f2eb..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/ValueWithRecordId.java
+++ /dev/null
@@ -1,154 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.coders.ByteArrayCoder;
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.StandardCoder;
-import com.google.cloud.dataflow.sdk.transforms.DoFn;
-import com.google.cloud.dataflow.sdk.transforms.PTransform;
-import com.google.cloud.dataflow.sdk.transforms.ParDo;
-import com.google.cloud.dataflow.sdk.values.PCollection;
-import com.google.common.base.MoreObjects;
-import com.google.common.base.Preconditions;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Objects;
-
-/**
- * Immutable struct containing a value as well as a unique id identifying the value.
- *
- * @param <ValueT> the underlying value type
- */
-public class ValueWithRecordId<ValueT> {
- private final ValueT value;
- private final byte[] id;
-
- public ValueWithRecordId(ValueT value, byte[] id) {
- this.value = value;
- this.id = id;
- }
-
- public ValueT getValue() {
- return value;
- }
-
- public byte[] getId() {
- return id;
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(this)
- .add("id", id)
- .add("value", value)
- .toString();
- }
-
- @Override
- public boolean equals(Object other) {
- if (this == other) {
- return true;
- }
- if (!(other instanceof ValueWithRecordId)) {
- return false;
- }
- ValueWithRecordId<?> otherRecord = (ValueWithRecordId<?>) other;
- return Objects.deepEquals(id, otherRecord.id)
- && Objects.deepEquals(value, otherRecord.value);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(Arrays.hashCode(id), value);
- }
-
- /**
- * A {@link Coder} for {@code ValueWithRecordId}, using a wrapped value {@code Coder}.
- */
- public static class ValueWithRecordIdCoder<ValueT>
- extends StandardCoder<ValueWithRecordId<ValueT>> {
- public static <ValueT> ValueWithRecordIdCoder<ValueT> of(Coder<ValueT> valueCoder) {
- return new ValueWithRecordIdCoder<>(valueCoder);
- }
-
- @JsonCreator
- public static <ValueT> ValueWithRecordIdCoder<ValueT> of(
- @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
- List<Coder<ValueT>> components) {
- Preconditions.checkArgument(components.size() == 1,
- "Expecting 1 component, got " + components.size());
- return of(components.get(0));
- }
-
- protected ValueWithRecordIdCoder(Coder<ValueT> valueCoder) {
- this.valueCoder = valueCoder;
- this.idCoder = ByteArrayCoder.of();
- }
-
- @Override
- public List<? extends Coder<?>> getCoderArguments() {
- return Arrays.asList(valueCoder);
- }
-
- @Override
- public void encode(ValueWithRecordId<ValueT> value, OutputStream outStream, Context context)
- throws IOException {
- valueCoder.encode(value.value, outStream, context.nested());
- idCoder.encode(value.id, outStream, context);
- }
-
- @Override
- public ValueWithRecordId<ValueT> decode(InputStream inStream, Context context)
- throws IOException {
- return new ValueWithRecordId<ValueT>(
- valueCoder.decode(inStream, context.nested()),
- idCoder.decode(inStream, context));
- }
-
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- valueCoder.verifyDeterministic();
- }
-
- public Coder<ValueT> getValueCoder() {
- return valueCoder;
- }
-
- Coder<ValueT> valueCoder;
- ByteArrayCoder idCoder;
- }
-
- public static <T>
- PTransform<PCollection<? extends ValueWithRecordId<T>>, PCollection<T>> stripIds() {
- return ParDo.named("StripIds")
- .of(
- new DoFn<ValueWithRecordId<T>, T>() {
- @Override
- public void processElement(ProcessContext c) {
- c.output(c.element().getValue());
- }
- });
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Values.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Values.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Values.java
deleted file mode 100644
index d4440e7..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Values.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import java.util.Map;
-
-import javax.annotation.Nullable;
-
-/**
- * A collection of static methods for manipulating value representations
- * transferred via the Dataflow API.
- */
-public final class Values {
- private Values() {} // Non-instantiable
-
- public static Boolean asBoolean(Object value) throws ClassCastException {
- @Nullable Boolean knownResult = checkKnownValue(CloudKnownType.BOOLEAN, value, Boolean.class);
- if (knownResult != null) {
- return knownResult;
- }
- return Boolean.class.cast(value);
- }
-
- public static Double asDouble(Object value) throws ClassCastException {
- @Nullable Double knownResult = checkKnownValue(CloudKnownType.FLOAT, value, Double.class);
- if (knownResult != null) {
- return knownResult;
- }
- if (value instanceof Double) {
- return (Double) value;
- }
- return ((Float) value).doubleValue();
- }
-
- public static Long asLong(Object value) throws ClassCastException {
- @Nullable Long knownResult = checkKnownValue(CloudKnownType.INTEGER, value, Long.class);
- if (knownResult != null) {
- return knownResult;
- }
- if (value instanceof Long) {
- return (Long) value;
- }
- return ((Integer) value).longValue();
- }
-
- public static String asString(Object value) throws ClassCastException {
- @Nullable String knownResult = checkKnownValue(CloudKnownType.TEXT, value, String.class);
- if (knownResult != null) {
- return knownResult;
- }
- return String.class.cast(value);
- }
-
- @Nullable
- private static <T> T checkKnownValue(CloudKnownType type, Object value, Class<T> clazz) {
- if (!(value instanceof Map)) {
- return null;
- }
- Map<String, Object> map = (Map<String, Object>) value;
- @Nullable String typeName = (String) map.get(PropertyNames.OBJECT_TYPE_NAME);
- if (typeName == null) {
- return null;
- }
- @Nullable CloudKnownType knownType = CloudKnownType.forUri(typeName);
- if (knownType == null || knownType != type) {
- return null;
- }
- @Nullable Object scalar = map.get(PropertyNames.SCALAR_FIELD_NAME);
- if (scalar == null) {
- return null;
- }
- return knownType.parse(scalar, clazz);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/VarInt.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/VarInt.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/VarInt.java
deleted file mode 100644
index af03911..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/VarInt.java
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/**
- * Variable-length encoding for integers.
- *
- * <p>Handles, in a common encoding format, signed bytes, shorts, ints, and longs.
- * Takes between 1 and 10 bytes.
- * Less efficient than BigEndian{Int,Long} coder for negative or large numbers.
- * All negative ints are encoded using 5 bytes; negative longs take 10 bytes.
- */
-public class VarInt {
-
- private static long convertIntToLongNoSignExtend(int v) {
- return v & 0xFFFFFFFFL;
- }
-
- /**
- * Encodes the given value onto the stream.
- */
- public static void encode(int v, OutputStream stream) throws IOException {
- encode(convertIntToLongNoSignExtend(v), stream);
- }
-
- /**
- * Encodes the given value onto the stream.
- */
- public static void encode(long v, OutputStream stream) throws IOException {
- do {
- // Encode next 7 bits + terminator bit
- long bits = v & 0x7F;
- v >>>= 7;
- byte b = (byte) (bits | ((v != 0) ? 0x80 : 0));
- stream.write(b);
- } while (v != 0);
- }
-
- /**
- * Decodes an integer value from the given stream.
- */
- public static int decodeInt(InputStream stream) throws IOException {
- long r = decodeLong(stream);
- if (r < 0 || r >= 1L << 32) {
- throw new IOException("varint overflow " + r);
- }
- return (int) r;
- }
-
- /**
- * Decodes a long value from the given stream.
- */
- public static long decodeLong(InputStream stream) throws IOException {
- long result = 0;
- int shift = 0;
- int b;
- do {
- // Get 7 bits from next byte
- b = stream.read();
- if (b < 0) {
- if (shift == 0) {
- throw new EOFException();
- } else {
- throw new IOException("varint not terminated");
- }
- }
- long bits = b & 0x7F;
- if (shift >= 64 || (shift == 63 && bits > 1)) {
- // Out of range
- throw new IOException("varint too long");
- }
- result |= bits << shift;
- shift += 7;
- } while ((b & 0x80) != 0);
- return result;
- }
-
- /**
- * Returns the length of the encoding of the given value (in bytes).
- */
- public static int getLength(int v) {
- return getLength(convertIntToLongNoSignExtend(v));
- }
-
- /**
- * Returns the length of the encoding of the given value (in bytes).
- */
- public static int getLength(long v) {
- int result = 0;
- do {
- result++;
- v >>>= 7;
- } while (v != 0);
- return result;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WatermarkHold.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WatermarkHold.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WatermarkHold.java
deleted file mode 100644
index d537ddb..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WatermarkHold.java
+++ /dev/null
@@ -1,450 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.OutputTimeFn;
-import com.google.cloud.dataflow.sdk.transforms.windowing.OutputTimeFns;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window.ClosingBehavior;
-import com.google.cloud.dataflow.sdk.util.state.MergingStateAccessor;
-import com.google.cloud.dataflow.sdk.util.state.ReadableState;
-import com.google.cloud.dataflow.sdk.util.state.StateMerging;
-import com.google.cloud.dataflow.sdk.util.state.StateTag;
-import com.google.cloud.dataflow.sdk.util.state.StateTags;
-import com.google.cloud.dataflow.sdk.util.state.WatermarkHoldState;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Preconditions;
-
-import org.joda.time.Duration;
-import org.joda.time.Instant;
-
-import java.io.Serializable;
-
-import javax.annotation.Nullable;
-
-/**
- * Implements the logic to hold the output watermark for a computation back
- * until it has seen all the elements it needs based on the input watermark for the
- * computation.
- *
- * <p>The backend ensures the output watermark can never progress beyond the
- * input watermark for a computation. GroupAlsoByWindows computations may add a 'hold'
- * to the output watermark in order to prevent it progressing beyond a time within a window.
- * The hold will be 'cleared' when the associated pane is emitted.
- *
- * <p>This class is only intended for use by {@link ReduceFnRunner}. The two evolve together and
- * will likely break any other uses.
- *
- * @param <W> The kind of {@link BoundedWindow} the hold is for.
- */
-class WatermarkHold<W extends BoundedWindow> implements Serializable {
- /**
- * Return tag for state containing the output watermark hold
- * used for elements.
- */
- public static <W extends BoundedWindow>
- StateTag<Object, WatermarkHoldState<W>> watermarkHoldTagForOutputTimeFn(
- OutputTimeFn<? super W> outputTimeFn) {
- return StateTags.<Object, WatermarkHoldState<W>>makeSystemTagInternal(
- StateTags.<W>watermarkStateInternal("hold", outputTimeFn));
- }
-
- /**
- * Tag for state containing end-of-window and garbage collection output watermark holds.
- * (We can't piggy-back on the data hold state since the outputTimeFn may be
- * {@link OutputTimeFns#outputAtLatestInputTimestamp()}, in which case every pane
- * would take the end-of-window time as its element time.)
- */
- @VisibleForTesting
- public static final StateTag<Object, WatermarkHoldState<BoundedWindow>> EXTRA_HOLD_TAG =
- StateTags.makeSystemTagInternal(StateTags.watermarkStateInternal(
- "extra", OutputTimeFns.outputAtEarliestInputTimestamp()));
-
- private final TimerInternals timerInternals;
- private final WindowingStrategy<?, W> windowingStrategy;
- private final StateTag<Object, WatermarkHoldState<W>> elementHoldTag;
-
- public WatermarkHold(TimerInternals timerInternals, WindowingStrategy<?, W> windowingStrategy) {
- this.timerInternals = timerInternals;
- this.windowingStrategy = windowingStrategy;
- this.elementHoldTag = watermarkHoldTagForOutputTimeFn(windowingStrategy.getOutputTimeFn());
- }
-
- /**
- * Add a hold to prevent the output watermark progressing beyond the (possibly adjusted) timestamp
- * of the element in {@code context}. We allow the actual hold time to be shifted later by
- * {@link OutputTimeFn#assignOutputTime}, but no further than the end of the window. The hold will
- * remain until cleared by {@link #extractAndRelease}. Return the timestamp at which the hold
- * was placed, or {@literal null} if no hold was placed.
- *
- * <p>In the following we'll write {@code E} to represent an element's timestamp after passing
- * through the window strategy's output time function, {@code IWM} for the local input watermark,
- * {@code OWM} for the local output watermark, and {@code GCWM} for the garbage collection
- * watermark (which is at {@code IWM - getAllowedLateness}). Time progresses from left to right,
- * and we write {@code [ ... ]} to denote a bounded window with implied lower bound.
- *
- * <p>Note that the GCWM will be the same as the IWM if {@code getAllowedLateness}
- * is {@code ZERO}.
- *
- * <p>Here are the cases we need to handle. They are conceptually considered in the
- * sequence written since if getAllowedLateness is ZERO the GCWM is the same as the IWM.
- * <ol>
- * <li>(Normal)
- * <pre>
- * |
- * [ | E ]
- * |
- * IWM
- * </pre>
- * This is, hopefully, the common and happy case. The element is locally on-time and can
- * definitely make it to an {@code ON_TIME} pane which we can still set an end-of-window timer
- * for. We place an element hold at E, which may contribute to the {@code ON_TIME} pane's
- * timestamp (depending on the output time function). Thus the OWM will not proceed past E
- * until the next pane fires.
- *
- * <li>(Discard - no target window)
- * <pre>
- * | |
- * [ E ] | |
- * | |
- * GCWM <-getAllowedLateness-> IWM
- * </pre>
- * The element is very locally late. The window has been garbage collected, thus there
- * is no target pane E could be assigned to. We discard E.
- *
- * <li>(Unobservably late)
- * <pre>
- * | |
- * [ | E | ]
- * | |
- * OWM IWM
- * </pre>
- * The element is locally late, however we can still treat this case as for 'Normal' above
- * since the IWM has not yet passed the end of the window and the element is ahead of the
- * OWM. In effect, we get to 'launder' the locally late element and consider it as locally
- * on-time because no downstream computation can observe the difference.
- *
- * <li>(Maybe late 1)
- * <pre>
- * | |
- * [ | E ] |
- * | |
- * OWM IWM
- * </pre>
- * The end-of-window timer may have already fired for this window, and thus an {@code ON_TIME}
- * pane may have already been emitted. However, if timer firings have been delayed then it
- * is possible the {@code ON_TIME} pane has not yet been emitted. We can't place an element
- * hold since we can't be sure if it will be cleared promptly. Thus this element *may* find
- * its way into an {@code ON_TIME} pane, but if so it will *not* contribute to that pane's
- * timestamp. We may however set a garbage collection hold if required.
- *
- * <li>(Maybe late 2)
- * <pre>
- * | |
- * [ E | | ]
- * | |
- * OWM IWM
- * </pre>
- * The end-of-window timer has not yet fired, so this element may still appear in an
- * {@code ON_TIME} pane. However the element is too late to contribute to the output
- * watermark hold, and thus won't contribute to the pane's timestamp. We can still place an
- * end-of-window hold.
- *
- * <li>(Maybe late 3)
- * <pre>
- * | |
- * [ E | ] |
- * | |
- * OWM IWM
- * </pre>
- * As for the (Maybe late 2) case, however we don't even know if the end-of-window timer
- * has already fired, or it is about to fire. We can place only the garbage collection hold,
- * if required.
- *
- * <li>(Definitely late)
- * <pre>
- * | |
- * [ E ] | |
- * | |
- * OWM IWM
- * </pre>
- * The element is definitely too late to make an {@code ON_TIME} pane. We are too late to
- * place an end-of-window hold. We can still place a garbage collection hold if required.
- *
- * </ol>
- */
- @Nullable
- public Instant addHolds(ReduceFn<?, ?, ?, W>.ProcessValueContext context) {
- Instant hold = addElementHold(context);
- if (hold == null) {
- hold = addEndOfWindowOrGarbageCollectionHolds(context);
- }
- return hold;
- }
-
- /**
- * Return {@code timestamp}, possibly shifted forward in time according to the window
- * strategy's output time function.
- */
- private Instant shift(Instant timestamp, W window) {
- Instant shifted = windowingStrategy.getOutputTimeFn().assignOutputTime(timestamp, window);
- if (shifted.isBefore(timestamp)) {
- throw new IllegalStateException(
- String.format("OutputTimeFn moved element from %s to earlier time %s for window %s",
- timestamp, shifted, window));
- }
- if (!timestamp.isAfter(window.maxTimestamp()) && shifted.isAfter(window.maxTimestamp())) {
- throw new IllegalStateException(
- String.format("OutputTimeFn moved element from %s to %s which is beyond end of window %s",
- timestamp, shifted, window));
- }
-
- return shifted;
- }
-
- /**
- * Add an element hold if possible. Return instant at which hold was added, or {@literal null}
- * if no hold was added.
- */
- @Nullable
- private Instant addElementHold(ReduceFn<?, ?, ?, W>.ProcessValueContext context) {
- // Give the window function a chance to move the hold timestamp forward to encourage progress.
- // (A later hold implies less impediment to the output watermark making progress, which in
- // turn encourages end-of-window triggers to fire earlier in following computations.)
- Instant elementHold = shift(context.timestamp(), context.window());
-
- Instant outputWM = timerInternals.currentOutputWatermarkTime();
- Instant inputWM = timerInternals.currentInputWatermarkTime();
-
- // Only add the hold if we can be sure:
- // - the backend will be able to respect it
- // (ie the hold is at or ahead of the output watermark), AND
- // - a timer will be set to clear it by the end of window
- // (ie the end of window is at or ahead of the input watermark).
- String which;
- boolean tooLate;
- // TODO: These case labels could be tightened.
- // See the case analysis in addHolds above for the motivation.
- if (outputWM != null && elementHold.isBefore(outputWM)) {
- which = "too late to effect output watermark";
- tooLate = true;
- } else if (inputWM != null && context.window().maxTimestamp().isBefore(inputWM)) {
- which = "too late for end-of-window timer";
- tooLate = true;
- } else {
- which = "on time";
- tooLate = false;
- context.state().access(elementHoldTag).add(elementHold);
- }
- WindowTracing.trace(
- "WatermarkHold.addHolds: element hold at {} is {} for "
- + "key:{}; window:{}; inputWatermark:{}; outputWatermark:{}",
- elementHold, which, context.key(), context.window(), inputWM,
- outputWM);
-
- return tooLate ? null : elementHold;
- }
-
- /**
- * Add an end-of-window hold or, if too late for that, a garbage collection hold (if required).
- * Return the {@link Instant} at which hold was added, or {@literal null} if no hold was added.
- *
- * <p>The end-of-window hold guarantees that an empty {@code ON_TIME} pane can be given
- * a timestamp which will not be considered beyond allowed lateness by any downstream computation.
- */
- @Nullable
- private Instant addEndOfWindowOrGarbageCollectionHolds(ReduceFn<?, ?, ?, W>.Context context) {
- Instant hold = addEndOfWindowHold(context);
- if (hold == null) {
- hold = addGarbageCollectionHold(context);
- }
- return hold;
- }
-
- /**
- * Add an end-of-window hold. Return the {@link Instant} at which hold was added,
- * or {@literal null} if no hold was added.
- *
- * <p>The end-of-window hold guarantees that any empty {@code ON_TIME} pane can be given
- * a timestamp which will not be considered beyond allowed lateness by any downstream computation.
- */
- @Nullable
- private Instant addEndOfWindowHold(ReduceFn<?, ?, ?, W>.Context context) {
- // Only add an end-of-window hold if we can be sure a timer will be set to clear it
- // by the end of window (ie the end of window is at or ahead of the input watermark).
- Instant outputWM = timerInternals.currentOutputWatermarkTime();
- Instant inputWM = timerInternals.currentInputWatermarkTime();
- String which;
- boolean tooLate;
- Instant eowHold = context.window().maxTimestamp();
- if (inputWM != null && eowHold.isBefore(inputWM)) {
- which = "too late for end-of-window timer";
- tooLate = true;
- } else {
- which = "on time";
- tooLate = false;
- Preconditions.checkState(outputWM == null || !eowHold.isBefore(outputWM),
- "End-of-window hold %s cannot be before output watermark %s", eowHold, outputWM);
- context.state().access(EXTRA_HOLD_TAG).add(eowHold);
- }
- WindowTracing.trace(
- "WatermarkHold.addEndOfWindowHold: end-of-window hold at {} is {} for "
- + "key:{}; window:{}; inputWatermark:{}; outputWatermark:{}",
- eowHold, which, context.key(), context.window(), inputWM,
- outputWM);
-
- return tooLate ? null : eowHold;
- }
-
- /**
- * Add a garbage collection hold, if required. Return the {@link Instant} at which hold was added,
- * or {@literal null} if no hold was added.
- *
- * <p>The garbage collection hold guarantees that any empty final pane can be given
- * a timestamp which will not be considered beyond allowed lateness by any downstream
- * computation. If we are sure no empty final panes can be emitted then there's no need
- * for an additional hold.
- */
- @Nullable
- private Instant addGarbageCollectionHold(ReduceFn<?, ?, ?, W>.Context context) {
- // Only add a garbage collection hold if we may need to emit an empty pane
- // at garbage collection time, and garbage collection time is strictly after the
- // end of window. (All non-empty panes will have holds at their output
- // time derived from their incoming elements and no additional hold is required.)
- if (context.windowingStrategy().getClosingBehavior() == ClosingBehavior.FIRE_ALWAYS
- && windowingStrategy.getAllowedLateness().isLongerThan(Duration.ZERO)) {
- Instant gcHold = context.window().maxTimestamp().plus(windowingStrategy.getAllowedLateness());
- Instant outputWM = timerInternals.currentOutputWatermarkTime();
- Instant inputWM = timerInternals.currentInputWatermarkTime();
- WindowTracing.trace(
- "WatermarkHold.addGarbageCollectionHold: garbage collection at {} hold for "
- + "key:{}; window:{}; inputWatermark:{}; outputWatermark:{}",
- gcHold, context.key(), context.window(), inputWM, outputWM);
- Preconditions.checkState(inputWM == null || !gcHold.isBefore(inputWM),
- "Garbage collection hold %s cannot be before input watermark %s", gcHold, inputWM);
- context.state().access(EXTRA_HOLD_TAG).add(gcHold);
- return gcHold;
- } else {
- return null;
- }
- }
-
- /**
- * Prefetch watermark holds in preparation for merging.
- */
- public void prefetchOnMerge(MergingStateAccessor<?, W> state) {
- StateMerging.prefetchWatermarks(state, elementHoldTag);
- }
-
- /**
- * Updates the watermark hold when windows merge if it is possible the merged value does
- * not equal all of the existing holds. For example, if the new window implies a later
- * watermark hold, then earlier holds may be released.
- */
- public void onMerge(ReduceFn<?, ?, ?, W>.OnMergeContext context) {
- WindowTracing.debug("onMerge: for key:{}; window:{}; inputWatermark:{}; outputWatermark:{}",
- context.key(), context.window(), timerInternals.currentInputWatermarkTime(),
- timerInternals.currentOutputWatermarkTime());
- StateMerging.mergeWatermarks(context.state(), elementHoldTag, context.window());
- // If we had a cheap way to determine if we have an element hold then we could
- // avoid adding an unnecessary end-of-window or garbage collection hold.
- // Simply reading the above merged watermark would impose an additional read for the
- // common case that the active window has just one underlying state address window and
- // the hold depends on the min of the element timestamps.
- StateMerging.clear(context.state(), EXTRA_HOLD_TAG);
- addEndOfWindowOrGarbageCollectionHolds(context);
- }
-
- /**
- * Return (a future for) the earliest hold for {@code context}. Clear all the holds after
- * reading, but add/restore an end-of-window or garbage collection hold if required.
- *
- * <p>The returned timestamp is the output timestamp according to the {@link OutputTimeFn}
- * from the windowing strategy of this {@link WatermarkHold}, combined across all the non-late
- * elements in the current pane. If there is no such value the timestamp is the end
- * of the window.
- */
- public ReadableState<Instant> extractAndRelease(
- final ReduceFn<?, ?, ?, W>.Context context, final boolean isFinished) {
- WindowTracing.debug(
- "extractAndRelease: for key:{}; window:{}; inputWatermark:{}; outputWatermark:{}",
- context.key(), context.window(), timerInternals.currentInputWatermarkTime(),
- timerInternals.currentOutputWatermarkTime());
- final WatermarkHoldState<W> elementHoldState = context.state().access(elementHoldTag);
- final WatermarkHoldState<BoundedWindow> extraHoldState = context.state().access(EXTRA_HOLD_TAG);
- return new ReadableState<Instant>() {
- @Override
- public ReadableState<Instant> readLater() {
- elementHoldState.readLater();
- extraHoldState.readLater();
- return this;
- }
-
- @Override
- public Instant read() {
- // Read both the element and extra holds.
- Instant elementHold = elementHoldState.read();
- Instant extraHold = extraHoldState.read();
- Instant hold;
- // Find the minimum, accounting for null.
- if (elementHold == null) {
- hold = extraHold;
- } else if (extraHold == null) {
- hold = elementHold;
- } else if (elementHold.isBefore(extraHold)) {
- hold = elementHold;
- } else {
- hold = extraHold;
- }
- if (hold == null || hold.isAfter(context.window().maxTimestamp())) {
- // If no hold (eg because all elements came in behind the output watermark), or
- // the hold was for garbage collection, take the end of window as the result.
- WindowTracing.debug(
- "WatermarkHold.extractAndRelease.read: clipping from {} to end of window "
- + "for key:{}; window:{}",
- hold, context.key(), context.window());
- hold = context.window().maxTimestamp();
- }
- WindowTracing.debug("WatermarkHold.extractAndRelease.read: clearing for key:{}; window:{}",
- context.key(), context.window());
-
- // Clear the underlying state to allow the output watermark to progress.
- elementHoldState.clear();
- extraHoldState.clear();
-
- if (!isFinished) {
- // Only need to leave behind an end-of-window or garbage collection hold
- // if future elements will be processed.
- addEndOfWindowOrGarbageCollectionHolds(context);
- }
-
- return hold;
- }
- };
- }
-
- /**
- * Clear any remaining holds.
- */
- public void clearHolds(ReduceFn<?, ?, ?, W>.Context context) {
- WindowTracing.debug(
- "WatermarkHold.clearHolds: For key:{}; window:{}; inputWatermark:{}; outputWatermark:{}",
- context.key(), context.window(), timerInternals.currentInputWatermarkTime(),
- timerInternals.currentOutputWatermarkTime());
- context.state().access(elementHoldTag).clear();
- context.state().access(EXTRA_HOLD_TAG).clear();
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Weighted.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Weighted.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Weighted.java
deleted file mode 100644
index c31ad7f..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/Weighted.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-/**
- * Interface representing an object that has a weight, in unspecified units.
- */
-public interface Weighted {
- /**
- * Returns the weight of the object.
- */
- long getWeight();
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WeightedValue.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WeightedValue.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WeightedValue.java
deleted file mode 100644
index 4a6e840..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WeightedValue.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-/**
- * A {@code T} with an accompanying weight. Units are unspecified.
- *
- * @param <T> the underlying type of object
- */
-public final class WeightedValue<T> implements Weighted {
-
- private final T value;
- private final long weight;
-
- private WeightedValue(T value, long weight) {
- this.value = value;
- this.weight = weight;
- }
-
- public static <T> WeightedValue<T> of(T value, long weight) {
- return new WeightedValue<>(value, weight);
- }
-
- public long getWeight() {
- return weight;
- }
-
- public T getValue() {
- return value;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WindowTracing.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WindowTracing.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WindowTracing.java
deleted file mode 100644
index 6ae2f42..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WindowTracing.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Logging for window operations. Generally only feasible to enable on hand-picked pipelines.
- */
-public final class WindowTracing {
- private static final Logger LOG = LoggerFactory.getLogger(WindowTracing.class);
-
- public static void debug(String format, Object... args) {
- LOG.debug(format, args);
- }
-
- @SuppressWarnings("unused")
- public static void trace(String format, Object... args) {
- LOG.trace(format, args);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WindowedValue.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WindowedValue.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WindowedValue.java
deleted file mode 100644
index 1e944e2..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WindowedValue.java
+++ /dev/null
@@ -1,720 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- ******************************************************************************/
-
-package com.google.cloud.dataflow.sdk.util;
-
-import static com.google.cloud.dataflow.sdk.util.Structs.addBoolean;
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.coders.CoderException;
-import com.google.cloud.dataflow.sdk.coders.CollectionCoder;
-import com.google.cloud.dataflow.sdk.coders.InstantCoder;
-import com.google.cloud.dataflow.sdk.coders.StandardCoder;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo.PaneInfoCoder;
-import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObserver;
-import com.google.common.base.MoreObjects;
-import com.google.common.base.Preconditions;
-
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonProperty;
-
-import org.joda.time.Instant;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.LinkedHashSet;
-import java.util.List;
-import java.util.Objects;
-import java.util.Set;
-
-/**
- * An immutable triple of value, timestamp, and windows.
- *
- * @param <T> the type of the value
- */
-public abstract class WindowedValue<T> {
-
- protected final T value;
- protected final PaneInfo pane;
-
- /**
- * Returns a {@code WindowedValue} with the given value, timestamp,
- * and windows.
- */
- public static <T> WindowedValue<T> of(
- T value,
- Instant timestamp,
- Collection<? extends BoundedWindow> windows,
- PaneInfo pane) {
- Preconditions.checkNotNull(pane);
-
- if (windows.size() == 0 && BoundedWindow.TIMESTAMP_MIN_VALUE.equals(timestamp)) {
- return valueInEmptyWindows(value, pane);
- } else if (windows.size() == 1) {
- return of(value, timestamp, windows.iterator().next(), pane);
- } else {
- return new TimestampedValueInMultipleWindows<>(value, timestamp, windows, pane);
- }
- }
-
- /**
- * Returns a {@code WindowedValue} with the given value, timestamp, and window.
- */
- public static <T> WindowedValue<T> of(
- T value,
- Instant timestamp,
- BoundedWindow window,
- PaneInfo pane) {
- Preconditions.checkNotNull(pane);
-
- boolean isGlobal = GlobalWindow.INSTANCE.equals(window);
- if (isGlobal && BoundedWindow.TIMESTAMP_MIN_VALUE.equals(timestamp)) {
- return valueInGlobalWindow(value, pane);
- } else if (isGlobal) {
- return new TimestampedValueInGlobalWindow<>(value, timestamp, pane);
- } else {
- return new TimestampedValueInSingleWindow<>(value, timestamp, window, pane);
- }
- }
-
- /**
- * Returns a {@code WindowedValue} with the given value in the {@link GlobalWindow} using the
- * default timestamp and pane.
- */
- public static <T> WindowedValue<T> valueInGlobalWindow(T value) {
- return new ValueInGlobalWindow<>(value, PaneInfo.NO_FIRING);
- }
-
- /**
- * Returns a {@code WindowedValue} with the given value in the {@link GlobalWindow} using the
- * default timestamp and the specified pane.
- */
- public static <T> WindowedValue<T> valueInGlobalWindow(T value, PaneInfo pane) {
- return new ValueInGlobalWindow<>(value, pane);
- }
-
- /**
- * Returns a {@code WindowedValue} with the given value and timestamp,
- * {@code GlobalWindow} and default pane.
- */
- public static <T> WindowedValue<T> timestampedValueInGlobalWindow(T value, Instant timestamp) {
- if (BoundedWindow.TIMESTAMP_MIN_VALUE.equals(timestamp)) {
- return valueInGlobalWindow(value);
- } else {
- return new TimestampedValueInGlobalWindow<>(value, timestamp, PaneInfo.NO_FIRING);
- }
- }
-
- /**
- * Returns a {@code WindowedValue} with the given value in no windows, and the default timestamp
- * and pane.
- */
- public static <T> WindowedValue<T> valueInEmptyWindows(T value) {
- return new ValueInEmptyWindows<T>(value, PaneInfo.NO_FIRING);
- }
-
- /**
- * Returns a {@code WindowedValue} with the given value in no windows, and the default timestamp
- * and the specified pane.
- */
- public static <T> WindowedValue<T> valueInEmptyWindows(T value, PaneInfo pane) {
- return new ValueInEmptyWindows<T>(value, pane);
- }
-
- private WindowedValue(T value, PaneInfo pane) {
- this.value = value;
- this.pane = checkNotNull(pane);
- }
-
- /**
- * Returns a new {@code WindowedValue} that is a copy of this one, but with a different value,
- * which may have a new type {@code NewT}.
- */
- public abstract <NewT> WindowedValue<NewT> withValue(NewT value);
-
- /**
- * Returns the value of this {@code WindowedValue}.
- */
- public T getValue() {
- return value;
- }
-
- /**
- * Returns the timestamp of this {@code WindowedValue}.
- */
- public abstract Instant getTimestamp();
-
- /**
- * Returns the windows of this {@code WindowedValue}.
- */
- public abstract Collection<? extends BoundedWindow> getWindows();
-
- /**
- * Returns the pane of this {@code WindowedValue} in its window.
- */
- public PaneInfo getPane() {
- return pane;
- }
-
- @Override
- public abstract boolean equals(Object o);
-
- @Override
- public abstract int hashCode();
-
- @Override
- public abstract String toString();
-
- private static final Collection<? extends BoundedWindow> GLOBAL_WINDOWS =
- Collections.singletonList(GlobalWindow.INSTANCE);
-
- /**
- * The abstract superclass of WindowedValue representations where
- * timestamp == MIN.
- */
- private abstract static class MinTimestampWindowedValue<T>
- extends WindowedValue<T> {
- public MinTimestampWindowedValue(T value, PaneInfo pane) {
- super(value, pane);
- }
-
- @Override
- public Instant getTimestamp() {
- return BoundedWindow.TIMESTAMP_MIN_VALUE;
- }
- }
-
- /**
- * The representation of a WindowedValue where timestamp == MIN and
- * windows == {GlobalWindow}.
- */
- private static class ValueInGlobalWindow<T>
- extends MinTimestampWindowedValue<T> {
- public ValueInGlobalWindow(T value, PaneInfo pane) {
- super(value, pane);
- }
-
- @Override
- public <NewT> WindowedValue<NewT> withValue(NewT value) {
- return new ValueInGlobalWindow<>(value, pane);
- }
-
- @Override
- public Collection<? extends BoundedWindow> getWindows() {
- return GLOBAL_WINDOWS;
- }
-
- @Override
- public boolean equals(Object o) {
- if (o instanceof ValueInGlobalWindow) {
- ValueInGlobalWindow<?> that = (ValueInGlobalWindow<?>) o;
- return Objects.equals(that.pane, this.pane)
- && Objects.equals(that.value, this.value);
- } else {
- return false;
- }
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(value, pane);
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(getClass())
- .add("value", value)
- .add("pane", pane)
- .toString();
- }
- }
-
- /**
- * The representation of a WindowedValue where timestamp == MIN and
- * windows == {}.
- */
- private static class ValueInEmptyWindows<T>
- extends MinTimestampWindowedValue<T> {
- public ValueInEmptyWindows(T value, PaneInfo pane) {
- super(value, pane);
- }
-
- @Override
- public <NewT> WindowedValue<NewT> withValue(NewT value) {
- return new ValueInEmptyWindows<>(value, pane);
- }
-
- @Override
- public Collection<? extends BoundedWindow> getWindows() {
- return Collections.emptyList();
- }
-
- @Override
- public boolean equals(Object o) {
- if (o instanceof ValueInEmptyWindows) {
- ValueInEmptyWindows<?> that = (ValueInEmptyWindows<?>) o;
- return Objects.equals(that.pane, this.pane)
- && Objects.equals(that.value, this.value);
- } else {
- return false;
- }
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(value, pane);
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(getClass())
- .add("value", value)
- .add("pane", pane)
- .toString();
- }
- }
-
- /**
- * The abstract superclass of WindowedValue representations where
- * timestamp is arbitrary.
- */
- private abstract static class TimestampedWindowedValue<T>
- extends WindowedValue<T> {
- protected final Instant timestamp;
-
- public TimestampedWindowedValue(T value,
- Instant timestamp,
- PaneInfo pane) {
- super(value, pane);
- this.timestamp = checkNotNull(timestamp);
- }
-
- @Override
- public Instant getTimestamp() {
- return timestamp;
- }
- }
-
- /**
- * The representation of a WindowedValue where timestamp {@code >}
- * MIN and windows == {GlobalWindow}.
- */
- private static class TimestampedValueInGlobalWindow<T>
- extends TimestampedWindowedValue<T> {
- public TimestampedValueInGlobalWindow(T value,
- Instant timestamp,
- PaneInfo pane) {
- super(value, timestamp, pane);
- }
-
- @Override
- public <NewT> WindowedValue<NewT> withValue(NewT value) {
- return new TimestampedValueInGlobalWindow<>(value, timestamp, pane);
- }
-
- @Override
- public Collection<? extends BoundedWindow> getWindows() {
- return GLOBAL_WINDOWS;
- }
-
- @Override
- public boolean equals(Object o) {
- if (o instanceof TimestampedValueInGlobalWindow) {
- TimestampedValueInGlobalWindow<?> that =
- (TimestampedValueInGlobalWindow<?>) o;
- return this.timestamp.isEqual(that.timestamp) // don't compare chronology objects
- && Objects.equals(that.pane, this.pane)
- && Objects.equals(that.value, this.value);
- } else {
- return false;
- }
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(value, pane, timestamp.getMillis());
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(getClass())
- .add("value", value)
- .add("timestamp", timestamp)
- .add("pane", pane)
- .toString();
- }
- }
-
- /**
- * The representation of a WindowedValue where timestamp is arbitrary and
- * windows == a single non-Global window.
- */
- private static class TimestampedValueInSingleWindow<T>
- extends TimestampedWindowedValue<T> {
- private final BoundedWindow window;
-
- public TimestampedValueInSingleWindow(T value,
- Instant timestamp,
- BoundedWindow window,
- PaneInfo pane) {
- super(value, timestamp, pane);
- this.window = checkNotNull(window);
- }
-
- @Override
- public <NewT> WindowedValue<NewT> withValue(NewT value) {
- return new TimestampedValueInSingleWindow<>(value, timestamp, window, pane);
- }
-
- @Override
- public Collection<? extends BoundedWindow> getWindows() {
- return Collections.singletonList(window);
- }
-
- @Override
- public boolean equals(Object o) {
- if (o instanceof TimestampedValueInSingleWindow) {
- TimestampedValueInSingleWindow<?> that =
- (TimestampedValueInSingleWindow<?>) o;
- return Objects.equals(that.value, this.value)
- && this.timestamp.isEqual(that.timestamp) // don't compare chronology objects
- && Objects.equals(that.pane, this.pane)
- && Objects.equals(that.window, this.window);
- } else {
- return false;
- }
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(value, timestamp.getMillis(), pane, window);
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(getClass())
- .add("value", value)
- .add("timestamp", timestamp)
- .add("window", window)
- .add("pane", pane)
- .toString();
- }
- }
-
- /**
- * The representation of a WindowedValue, excluding the special
- * cases captured above.
- */
- private static class TimestampedValueInMultipleWindows<T>
- extends TimestampedWindowedValue<T> {
- private Collection<? extends BoundedWindow> windows;
-
- public TimestampedValueInMultipleWindows(
- T value,
- Instant timestamp,
- Collection<? extends BoundedWindow> windows,
- PaneInfo pane) {
- super(value, timestamp, pane);
- this.windows = checkNotNull(windows);
- }
-
- @Override
- public <NewT> WindowedValue<NewT> withValue(NewT value) {
- return new TimestampedValueInMultipleWindows<>(value, timestamp, windows, pane);
- }
-
- @Override
- public Collection<? extends BoundedWindow> getWindows() {
- return windows;
- }
-
- @Override
- public boolean equals(Object o) {
- if (o instanceof TimestampedValueInMultipleWindows) {
- TimestampedValueInMultipleWindows<?> that =
- (TimestampedValueInMultipleWindows<?>) o;
- if (this.timestamp.isEqual(that.timestamp) // don't compare chronology objects
- && Objects.equals(that.value, this.value)
- && Objects.equals(that.pane, this.pane)) {
- ensureWindowsAreASet();
- that.ensureWindowsAreASet();
- return that.windows.equals(this.windows);
- }
- }
- return false;
- }
-
- @Override
- public int hashCode() {
- ensureWindowsAreASet();
- return Objects.hash(value, timestamp.getMillis(), pane, windows);
- }
-
- @Override
- public String toString() {
- return MoreObjects.toStringHelper(getClass())
- .add("value", value)
- .add("timestamp", timestamp)
- .add("windows", windows)
- .add("pane", pane)
- .toString();
- }
-
- private void ensureWindowsAreASet() {
- if (!(windows instanceof Set)) {
- windows = new LinkedHashSet<>(windows);
- }
- }
- }
-
-
- /////////////////////////////////////////////////////////////////////////////
-
- /**
- * Returns a {@link FullWindowedValueCoder} for {@code WindowedValue<T>} built from
- * the given {@code valueCoder} and {@code windowCoder}.
- */
- public static <T> FullWindowedValueCoder<T> getFullCoder(
- Coder<T> valueCoder,
- Coder<? extends BoundedWindow> windowCoder) {
- FullWindowedValueCoder<T> fullCoder = FullWindowedValueCoder.of(valueCoder, windowCoder);
- return fullCoder;
- }
-
- /**
- * Returns a {@link ValueOnlyWindowedValueCoder} wrapping the given {@code valueCoder}.
- */
- public static <T> ValueOnlyWindowedValueCoder<T> getValueOnlyCoder(Coder<T> valueCoder) {
- ValueOnlyWindowedValueCoder<T> valueOnlyCoder = ValueOnlyWindowedValueCoder.of(valueCoder);
- return valueOnlyCoder;
- }
-
- /**
- * Abstract class for {@code WindowedValue} coder.
- */
- public abstract static class WindowedValueCoder<T>
- extends StandardCoder<WindowedValue<T>> {
- final Coder<T> valueCoder;
-
- WindowedValueCoder(Coder<T> valueCoder) {
- this.valueCoder = checkNotNull(valueCoder);
- }
-
- /**
- * Returns the value coder.
- */
- public Coder<T> getValueCoder() {
- return valueCoder;
- }
-
- /**
- * Returns a new {@code WindowedValueCoder} that is a copy of this one,
- * but with a different value coder.
- */
- public abstract <NewT> WindowedValueCoder<NewT> withValueCoder(Coder<NewT> valueCoder);
- }
-
- /**
- * Coder for {@code WindowedValue} that encodes every component, in this order: the
- * value, the timestamp, the windows and the pane.
- *
- * <p>The value, timestamp and windows are written in the nested context. The pane is
- * the final component and is written in the context passed by the caller.
- * {@link #decode} and {@link #registerByteSizeObserver} mirror that layout.
- */
- public static class FullWindowedValueCoder<T> extends WindowedValueCoder<T> {
- private final Coder<? extends BoundedWindow> windowCoder;
- // Precompute and cache the coder for a list of windows.
- private final Coder<Collection<? extends BoundedWindow>> windowsCoder;
-
- /**
- * Returns a {@code FullWindowedValueCoder} using the given value and window coders.
- */
- public static <T> FullWindowedValueCoder<T> of(
- Coder<T> valueCoder,
- Coder<? extends BoundedWindow> windowCoder) {
- return new FullWindowedValueCoder<>(valueCoder, windowCoder);
- }
-
- /**
- * JSON factory: expects exactly two component coders, the value coder followed by
- * the window coder.
- *
- * @throws IllegalArgumentException if {@code components} does not have exactly 2 entries
- */
- @JsonCreator
- public static FullWindowedValueCoder<?> of(
- @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
- List<Coder<?>> components) {
- checkArgument(components.size() == 2,
- "Expecting 2 components, got " + components.size());
- @SuppressWarnings("unchecked")
- Coder<? extends BoundedWindow> window = (Coder<? extends BoundedWindow>) components.get(1);
- return of(components.get(0), window);
- }
-
- FullWindowedValueCoder(Coder<T> valueCoder,
- Coder<? extends BoundedWindow> windowCoder) {
- super(valueCoder);
- this.windowCoder = checkNotNull(windowCoder);
- // It's not possible to statically type-check correct use of the
- // windowCoder (we have to ensure externally that we only get
- // windows of the class handled by windowCoder), so type
- // windowsCoder in a way that makes encode() and decode() work
- // right, and cast the window type away here.
- @SuppressWarnings({"unchecked", "rawtypes"})
- Coder<Collection<? extends BoundedWindow>> collectionCoder =
- (Coder) CollectionCoder.of(this.windowCoder);
- this.windowsCoder = collectionCoder;
- }
-
- /**
- * Returns the coder for a single window.
- */
- public Coder<? extends BoundedWindow> getWindowCoder() {
- return windowCoder;
- }
-
- /**
- * Returns the cached coder for the whole collection of windows.
- */
- public Coder<Collection<? extends BoundedWindow>> getWindowsCoder() {
- return windowsCoder;
- }
-
- @Override
- public <NewT> WindowedValueCoder<NewT> withValueCoder(Coder<NewT> valueCoder) {
- return new FullWindowedValueCoder<>(valueCoder, windowCoder);
- }
-
- @Override
- public void encode(WindowedValue<T> windowedElem,
- OutputStream outStream,
- Context context)
- throws CoderException, IOException {
- Context nestedContext = context.nested();
- valueCoder.encode(windowedElem.getValue(), outStream, nestedContext);
- InstantCoder.of().encode(
- windowedElem.getTimestamp(), outStream, nestedContext);
- windowsCoder.encode(windowedElem.getWindows(), outStream, nestedContext);
- // The pane is the last component, so it is written in the caller's context.
- PaneInfoCoder.INSTANCE.encode(windowedElem.getPane(), outStream, context);
- }
-
- @Override
- public WindowedValue<T> decode(InputStream inStream, Context context)
- throws CoderException, IOException {
- Context nestedContext = context.nested();
- T value = valueCoder.decode(inStream, nestedContext);
- Instant timestamp = InstantCoder.of().decode(inStream, nestedContext);
- Collection<? extends BoundedWindow> windows =
- windowsCoder.decode(inStream, nestedContext);
- // Read the pane in the same (caller's) context that encode() wrote it in.
- PaneInfo pane = PaneInfoCoder.INSTANCE.decode(inStream, context);
- return WindowedValue.of(value, timestamp, windows, pane);
- }
-
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- verifyDeterministic(
- "FullWindowedValueCoder requires a deterministic valueCoder",
- valueCoder);
- verifyDeterministic(
- "FullWindowedValueCoder requires a deterministic windowCoder",
- windowCoder);
- }
-
- /**
- * Reports the size of each encoded component to {@code observer}, using the same
- * per-component contexts as {@link #encode} and including the pane.
- */
- @Override
- public void registerByteSizeObserver(WindowedValue<T> value,
- ElementByteSizeObserver observer,
- Context context) throws Exception {
- Context nestedContext = context.nested();
- valueCoder.registerByteSizeObserver(value.getValue(), observer, nestedContext);
- InstantCoder.of().registerByteSizeObserver(value.getTimestamp(), observer, nestedContext);
- windowsCoder.registerByteSizeObserver(value.getWindows(), observer, nestedContext);
- PaneInfoCoder.INSTANCE.registerByteSizeObserver(value.getPane(), observer, context);
- }
-
- @Override
- public CloudObject asCloudObject() {
- CloudObject result = super.asCloudObject();
- addBoolean(result, PropertyNames.IS_WRAPPER, true);
- return result;
- }
-
- // NOTE(review): returns null rather than a list, while getComponents() lists both
- // coders; presumably deliberate because the window coder is not a type argument of
- // WindowedValue<T> -- confirm against StandardCoder's contract.
- @Override
- public List<? extends Coder<?>> getCoderArguments() {
- return null;
- }
-
- @Override
- public List<? extends Coder<?>> getComponents() {
- return Arrays.<Coder<?>>asList(valueCoder, windowCoder);
- }
- }
-
- /**
- * Coder for {@code WindowedValue}.
- *
- * <p>A {@code ValueOnlyWindowedValueCoder} only encodes and decodes the value. It drops
- * timestamp and windows for encoding, and uses defaults timestamp, and windows for decoding.
- */
- public static class ValueOnlyWindowedValueCoder<T> extends WindowedValueCoder<T> {
- public static <T> ValueOnlyWindowedValueCoder<T> of(
- Coder<T> valueCoder) {
- return new ValueOnlyWindowedValueCoder<>(valueCoder);
- }
-
- @JsonCreator
- public static ValueOnlyWindowedValueCoder<?> of(
- @JsonProperty(PropertyNames.COMPONENT_ENCODINGS)
- List<Coder<?>> components) {
- checkArgument(components.size() == 1, "Expecting 1 component, got " + components.size());
- return of(components.get(0));
- }
-
- ValueOnlyWindowedValueCoder(Coder<T> valueCoder) {
- super(valueCoder);
- }
-
- @Override
- public <NewT> WindowedValueCoder<NewT> withValueCoder(Coder<NewT> valueCoder) {
- return new ValueOnlyWindowedValueCoder<>(valueCoder);
- }
-
- @Override
- public void encode(WindowedValue<T> windowedElem, OutputStream outStream, Context context)
- throws CoderException, IOException {
- valueCoder.encode(windowedElem.getValue(), outStream, context);
- }
-
- @Override
- public WindowedValue<T> decode(InputStream inStream, Context context)
- throws CoderException, IOException {
- T value = valueCoder.decode(inStream, context);
- return WindowedValue.valueInGlobalWindow(value);
- }
-
- @Override
- public void verifyDeterministic() throws NonDeterministicException {
- verifyDeterministic(
- "ValueOnlyWindowedValueCoder requires a deterministic valueCoder",
- valueCoder);
- }
-
- @Override
- public void registerByteSizeObserver(
- WindowedValue<T> value, ElementByteSizeObserver observer, Context context)
- throws Exception {
- valueCoder.registerByteSizeObserver(value.getValue(), observer, context);
- }
-
- @Override
- public CloudObject asCloudObject() {
- CloudObject result = super.asCloudObject();
- addBoolean(result, PropertyNames.IS_WRAPPER, true);
- return result;
- }
-
- @Override
- public List<? extends Coder<?>> getCoderArguments() {
- return Arrays.<Coder<?>>asList(valueCoder);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WindowingInternals.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WindowingInternals.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WindowingInternals.java
deleted file mode 100644
index 12fcd53..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WindowingInternals.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.coders.Coder;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo;
-import com.google.cloud.dataflow.sdk.util.state.StateInternals;
-import com.google.cloud.dataflow.sdk.values.PCollectionView;
-import com.google.cloud.dataflow.sdk.values.TupleTag;
-
-import org.joda.time.Instant;
-
-import java.io.IOException;
-import java.util.Collection;
-
-/**
- * Interface that may be required by some (internal) {@code DoFn}s to implement windowing. It should
- * not be necessary for general user code to interact with this at all.
- *
- * <p>This interface should be provided by runner implementors to support windowing on their runner.
- *
- * @param <InputT> input type
- * @param <OutputT> output type
- */
-public interface WindowingInternals<InputT, OutputT> {
-
- /**
- * Returns the state internals; this access is unsupported. The key type is unknown (hence the
- * wildcard type argument), so it is up to the caller to use the correct type of key.
- */
- StateInternals<?> stateInternals();
-
- /**
- * Output the value at the specified timestamp in the listed windows.
- *
- * @param output the value to output
- * @param timestamp the timestamp to output the value at
- * @param windows the windows to output the value in
- * @param pane the pane to attach to the output value
- */
- void outputWindowedValue(OutputT output, Instant timestamp,
- Collection<? extends BoundedWindow> windows, PaneInfo pane);
-
- /**
- * Return the timer manager provided by the underlying system, or null if Timers need
- * to be emulated. Callers must therefore handle a {@code null} result.
- */
- TimerInternals timerInternals();
-
- /**
- * Access the windows the element is being processed in without "exploding" it.
- */
- Collection<? extends BoundedWindow> windows();
-
- /**
- * Access the pane of the current window(s).
- */
- PaneInfo pane();
-
- /**
- * Write the given {@link PCollectionView} data to a location accessible by other workers.
- *
- * @param tag the tag associated with the data being written
- * @param data the windowed values to write
- * @param elemCoder the coder for elements of type {@code T}
- * @throws IOException if the data cannot be written
- */
- <T> void writePCollectionViewData(
- TupleTag<?> tag,
- Iterable<WindowedValue<T>> data,
- Coder<T> elemCoder) throws IOException;
-
- /**
- * Return the value of the side input for the window of a main input element.
- *
- * @param view the side input view to read
- * @param mainInputWindow the window of the main input element
- */
- <T> T sideInput(PCollectionView<T> view, BoundedWindow mainInputWindow);
-}
http://git-wip-us.apache.org/repos/asf/incubator-beam/blob/7bef2b7e/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WindowingStrategy.java
----------------------------------------------------------------------
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WindowingStrategy.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WindowingStrategy.java
deleted file mode 100644
index c167b8c..0000000
--- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/WindowingStrategy.java
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- * Copyright (C) 2015 Google Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-
-package com.google.cloud.dataflow.sdk.util;
-
-import com.google.cloud.dataflow.sdk.annotations.Experimental;
-import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.DefaultTrigger;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindow;
-import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
-import com.google.cloud.dataflow.sdk.transforms.windowing.OutputTimeFn;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Trigger;
-import com.google.cloud.dataflow.sdk.transforms.windowing.Window.ClosingBehavior;
-import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
-import com.google.common.base.MoreObjects;
-
-import org.joda.time.Duration;
-
-import java.io.Serializable;
-import java.util.Objects;
-
-/**
- * Describes how a specific collection of values is windowed: the {@link WindowFn} that assigns
- * elements to windows and the {@link Trigger} that controls when output is produced for each
- * window, together with the accumulation mode, allowed lateness, closing behavior and
- * {@link OutputTimeFn}.
- *
- * <p>All fields are final; every {@code withXxx} method returns a modified copy. For the
- * trigger, mode, allowed lateness and output time function the strategy also records whether
- * the value was explicitly specified or is merely the default.
- *
- * @param <T> type of elements being windowed
- * @param <W> {@link BoundedWindow} subclass used to represent the
- * windows used by this {@code WindowingStrategy}
- */
-public class WindowingStrategy<T, W extends BoundedWindow> implements Serializable {
-
- /**
- * The accumulation modes that can be used with windowing.
- */
- public enum AccumulationMode {
- DISCARDING_FIRED_PANES,
- ACCUMULATING_FIRED_PANES;
- }
-
- private static final Duration DEFAULT_ALLOWED_LATENESS = Duration.ZERO;
- private static final WindowingStrategy<Object, GlobalWindow> DEFAULT = of(new GlobalWindows());
-
- private final WindowFn<T, W> windowFn;
- private final OutputTimeFn<? super W> outputTimeFn;
- private final ExecutableTrigger<W> trigger;
- private final AccumulationMode mode;
- private final Duration allowedLateness;
- private final ClosingBehavior closingBehavior;
- private final boolean triggerSpecified;
- private final boolean modeSpecified;
- private final boolean allowedLatenessSpecified;
- private final boolean outputTimeFnSpecified;
-
- /**
- * Each {@code xxxSpecified} flag states whether the value right before it was explicitly
- * specified rather than defaulted.
- */
- private WindowingStrategy(
- WindowFn<T, W> windowFn,
- ExecutableTrigger<W> trigger, boolean triggerSpecified,
- AccumulationMode mode, boolean modeSpecified,
- Duration allowedLateness, boolean allowedLatenessSpecified,
- OutputTimeFn<? super W> outputTimeFn, boolean outputTimeFnSpecified,
- ClosingBehavior closingBehavior) {
- // Values first ...
- this.windowFn = windowFn;
- this.trigger = trigger;
- this.mode = mode;
- this.allowedLateness = allowedLateness;
- this.outputTimeFn = outputTimeFn;
- this.closingBehavior = closingBehavior;
- // ... then the matching "was it specified" flags.
- this.triggerSpecified = triggerSpecified;
- this.modeSpecified = modeSpecified;
- this.allowedLatenessSpecified = allowedLatenessSpecified;
- this.outputTimeFnSpecified = outputTimeFnSpecified;
- }
-
- /**
- * Return a fully specified, default windowing strategy.
- */
- public static WindowingStrategy<Object, GlobalWindow> globalDefault() {
- return DEFAULT;
- }
-
- /**
- * Returns a strategy for {@code windowFn} with every other setting defaulted and marked as
- * not specified: the default trigger, discarding fired panes, zero allowed lateness, the
- * window function's own output time function and {@code FIRE_IF_NON_EMPTY} closing behavior.
- */
- public static <T, W extends BoundedWindow> WindowingStrategy<T, W> of(
- WindowFn<T, W> windowFn) {
- return new WindowingStrategy<>(
- windowFn,
- ExecutableTrigger.create(DefaultTrigger.<W>of()), false,
- AccumulationMode.DISCARDING_FIRED_PANES, false,
- DEFAULT_ALLOWED_LATENESS, false,
- windowFn.getOutputTimeFn(), false,
- ClosingBehavior.FIRE_IF_NON_EMPTY);
- }
-
- public WindowFn<T, W> getWindowFn() {
- return this.windowFn;
- }
-
- public ExecutableTrigger<W> getTrigger() {
- return this.trigger;
- }
-
- public boolean isTriggerSpecified() {
- return this.triggerSpecified;
- }
-
- public Duration getAllowedLateness() {
- return this.allowedLateness;
- }
-
- public boolean isAllowedLatenessSpecified() {
- return this.allowedLatenessSpecified;
- }
-
- public AccumulationMode getMode() {
- return this.mode;
- }
-
- public boolean isModeSpecified() {
- return this.modeSpecified;
- }
-
- public ClosingBehavior getClosingBehavior() {
- return this.closingBehavior;
- }
-
- public OutputTimeFn<? super W> getOutputTimeFn() {
- return this.outputTimeFn;
- }
-
- public boolean isOutputTimeFnSpecified() {
- return this.outputTimeFnSpecified;
- }
-
- /**
- * Returns a {@link WindowingStrategy} identical to {@code this} but with the trigger set to
- * {@code wildcardTrigger} and marked as specified.
- */
- public WindowingStrategy<T, W> withTrigger(Trigger<?> wildcardTrigger) {
- @SuppressWarnings("unchecked")
- Trigger<W> castTrigger = (Trigger<W>) wildcardTrigger;
- ExecutableTrigger<W> executableTrigger = ExecutableTrigger.create(castTrigger);
- return new WindowingStrategy<>(
- windowFn,
- executableTrigger, true,
- mode, modeSpecified,
- allowedLateness, allowedLatenessSpecified,
- outputTimeFn, outputTimeFnSpecified,
- closingBehavior);
- }
-
- /**
- * Returns a {@link WindowingStrategy} identical to {@code this} but with the accumulation mode
- * set to {@code mode} and marked as specified.
- */
- public WindowingStrategy<T, W> withMode(AccumulationMode mode) {
- return new WindowingStrategy<>(
- windowFn,
- trigger, triggerSpecified,
- mode, true,
- allowedLateness, allowedLatenessSpecified,
- outputTimeFn, outputTimeFnSpecified,
- closingBehavior);
- }
-
- /**
- * Returns a {@link WindowingStrategy} identical to {@code this} but with the window function
- * set to {@code wildcardWindowFn}. Unless an output time function was explicitly specified,
- * the new window function's own output time function is adopted as well.
- */
- public WindowingStrategy<T, W> withWindowFn(WindowFn<?, ?> wildcardWindowFn) {
- @SuppressWarnings("unchecked")
- WindowFn<T, W> castWindowFn = (WindowFn<T, W>) wildcardWindowFn;
-
- // The onus of type correctness falls on the caller.
- @SuppressWarnings("unchecked")
- OutputTimeFn<? super W> effectiveOutputTimeFn = (OutputTimeFn<? super W>)
- (outputTimeFnSpecified ? outputTimeFn : castWindowFn.getOutputTimeFn());
-
- return new WindowingStrategy<>(
- castWindowFn,
- trigger, triggerSpecified,
- mode, modeSpecified,
- allowedLateness, allowedLatenessSpecified,
- effectiveOutputTimeFn, outputTimeFnSpecified,
- closingBehavior);
- }
-
- /**
- * Returns a {@link WindowingStrategy} identical to {@code this} but with the allowed lateness
- * set to {@code allowedLateness} and marked as specified.
- */
- public WindowingStrategy<T, W> withAllowedLateness(Duration allowedLateness) {
- return new WindowingStrategy<>(
- windowFn,
- trigger, triggerSpecified,
- mode, modeSpecified,
- allowedLateness, true,
- outputTimeFn, outputTimeFnSpecified,
- closingBehavior);
- }
-
- /**
- * Returns a {@link WindowingStrategy} identical to {@code this} but with the closing behavior
- * set to {@code closingBehavior}.
- */
- public WindowingStrategy<T, W> withClosingBehavior(ClosingBehavior closingBehavior) {
- return new WindowingStrategy<>(
- windowFn,
- trigger, triggerSpecified,
- mode, modeSpecified,
- allowedLateness, allowedLatenessSpecified,
- outputTimeFn, outputTimeFnSpecified,
- closingBehavior);
- }
-
- /**
- * Returns a {@link WindowingStrategy} identical to {@code this} but with the output time
- * function set to {@code outputTimeFn} and marked as specified.
- */
- @Experimental(Experimental.Kind.OUTPUT_TIME)
- public WindowingStrategy<T, W> withOutputTimeFn(OutputTimeFn<?> outputTimeFn) {
-
- // The onus of type correctness falls on the caller.
- @SuppressWarnings("unchecked")
- OutputTimeFn<? super W> castOutputTimeFn = (OutputTimeFn<? super W>) outputTimeFn;
-
- return new WindowingStrategy<>(
- windowFn,
- trigger, triggerSpecified,
- mode, modeSpecified,
- allowedLateness, allowedLatenessSpecified,
- castOutputTimeFn, true,
- closingBehavior);
- }
-
- @Override
- public String toString() {
- MoreObjects.ToStringHelper description = MoreObjects.toStringHelper(this);
- description.add("windowFn", windowFn);
- description.add("allowedLateness", allowedLateness);
- description.add("trigger", trigger);
- description.add("accumulationMode", mode);
- description.add("outputTimeFn", outputTimeFn);
- return description.toString();
- }
-
- // NOTE(review): outputTimeFn and outputTimeFnSpecified take no part in equals() or
- // hashCode(); confirm whether that is intentional before relying on equality to
- // distinguish strategies that differ only in their output time function.
- @Override
- public boolean equals(Object object) {
- if (!(object instanceof WindowingStrategy)) {
- return false;
- }
- WindowingStrategy<?, ?> that = (WindowingStrategy<?, ?>) object;
- boolean sameSpecifiedFlags =
- isTriggerSpecified() == that.isTriggerSpecified()
- && isAllowedLatenessSpecified() == that.isAllowedLatenessSpecified()
- && isModeSpecified() == that.isModeSpecified();
- if (!sameSpecifiedFlags) {
- return false;
- }
- return getMode().equals(that.getMode())
- && getAllowedLateness().equals(that.getAllowedLateness())
- && getClosingBehavior().equals(that.getClosingBehavior())
- && getTrigger().equals(that.getTrigger())
- && getWindowFn().equals(that.getWindowFn());
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(
- triggerSpecified, allowedLatenessSpecified, modeSpecified,
- windowFn, trigger, mode, allowedLateness, closingBehavior);
- }
-}