You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@crunch.apache.org by gr...@apache.org on 2016/05/02 17:44:42 UTC
crunch git commit: CRUNCH-607 Allow collection reuse in MemPipeline
Repository: crunch
Updated Branches:
refs/heads/master 49a64ab16 -> 49e457559
CRUNCH-607 Allow collection reuse in MemPipeline
Prevent SingleUseIterable from throwing an IllegalStateException
when PGroupedCollections are legally reused with the
MemPipeline.
This simply prevents materializing the transformed contents of
a MemCollection until it is iterated over.
Project: http://git-wip-us.apache.org/repos/asf/crunch/repo
Commit: http://git-wip-us.apache.org/repos/asf/crunch/commit/49e45755
Tree: http://git-wip-us.apache.org/repos/asf/crunch/tree/49e45755
Diff: http://git-wip-us.apache.org/repos/asf/crunch/diff/49e45755
Branch: refs/heads/master
Commit: 49e4575595e4667a7d2aeef7d4e0aaeace0f59c3
Parents: 49a64ab
Author: Gabriel Reid <gr...@apache.org>
Authored: Mon May 2 17:31:20 2016 +0200
Committer: Gabriel Reid <gr...@apache.org>
Committed: Mon May 2 17:31:20 2016 +0200
----------------------------------------------------------------------
.../crunch/impl/mem/collect/MemCollection.java | 11 +--
.../java/org/apache/crunch/WriteModeTest.java | 6 +-
.../mem/MemPipelinePCollectionReuseTest.java | 74 ++++++++++++++++++++
3 files changed, 83 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/crunch/blob/49e45755/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/MemCollection.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/MemCollection.java b/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/MemCollection.java
index 087a31d..f032d18 100644
--- a/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/MemCollection.java
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/MemCollection.java
@@ -25,6 +25,7 @@ import java.lang.reflect.Method;
import java.util.Collection;
import java.util.Set;
+import com.google.common.collect.Iterables;
import javassist.util.proxy.MethodFilter;
import javassist.util.proxy.MethodHandler;
import javassist.util.proxy.ProxyFactory;
@@ -67,7 +68,7 @@ import com.google.common.collect.ImmutableSet;
public class MemCollection<S> implements PCollection<S> {
- private final Collection<S> collect;
+ private final Iterable<S> collect;
private final PType<S> ptype;
private String name;
@@ -80,7 +81,7 @@ public class MemCollection<S> implements PCollection<S> {
}
public MemCollection(Iterable<S> collect, PType<S> ptype, String name) {
- this.collect = ImmutableList.copyOf(collect);
+ this.collect = collect;
this.ptype = ptype;
this.name = name;
}
@@ -244,11 +245,11 @@ public class MemCollection<S> implements PCollection<S> {
@Override
public ReadableData<S> asReadable(boolean materialize) {
- return new MemReadableData<S>(collect);
+ return new MemReadableData<S>(ImmutableList.copyOf(collect));
}
public Collection<S> getCollection() {
- return collect;
+ return ImmutableList.copyOf(collect);
}
@Override
@@ -266,7 +267,7 @@ public class MemCollection<S> implements PCollection<S> {
@Override
public long getSize() {
- return collect.isEmpty() ? 0 : 1; // getSize is only used for pipeline optimization in MR
+ return Iterables.isEmpty(collect) ? 0 : 1; // getSize is only used for pipeline optimization in MR
}
@Override
http://git-wip-us.apache.org/repos/asf/crunch/blob/49e45755/crunch-core/src/test/java/org/apache/crunch/WriteModeTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/WriteModeTest.java b/crunch-core/src/test/java/org/apache/crunch/WriteModeTest.java
index e99ac7b..977b14d 100644
--- a/crunch-core/src/test/java/org/apache/crunch/WriteModeTest.java
+++ b/crunch-core/src/test/java/org/apache/crunch/WriteModeTest.java
@@ -51,7 +51,7 @@ public class WriteModeTest {
public void testOverwrite() throws Exception {
Path p = run(WriteMode.OVERWRITE, true);
PCollection<String> lines = MemPipeline.getInstance().readTextFile(p.toString());
- assertEquals(ImmutableList.of("some", "string", "values"), lines.materialize());
+ assertEquals(ImmutableList.of("some", "string", "values"), ImmutableList.copyOf(lines.materialize()));
}
@Test(expected=CrunchRuntimeException.class)
@@ -64,7 +64,7 @@ public class WriteModeTest {
Path p = run(WriteMode.APPEND, true);
PCollection<String> lines = MemPipeline.getInstance().readTextFile(p.toString());
assertEquals(ImmutableList.of("some", "string", "values", "some", "string", "values"),
- lines.materialize());
+ ImmutableList.copyOf(lines.materialize()));
}
@Test
@@ -72,7 +72,7 @@ public class WriteModeTest {
Path p = run(WriteMode.APPEND, false);
PCollection<String> lines = MemPipeline.getInstance().readTextFile(p.toString());
assertEquals(ImmutableList.of("some", "string", "values", "some", "string", "values"),
- lines.materialize());
+ ImmutableList.copyOf(lines.materialize()));
}
Path run(WriteMode writeMode, boolean doRun) throws Exception {
http://git-wip-us.apache.org/repos/asf/crunch/blob/49e45755/crunch-core/src/test/java/org/apache/crunch/impl/mem/MemPipelinePCollectionReuseTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/impl/mem/MemPipelinePCollectionReuseTest.java b/crunch-core/src/test/java/org/apache/crunch/impl/mem/MemPipelinePCollectionReuseTest.java
new file mode 100644
index 0000000..fa63287
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/impl/mem/MemPipelinePCollectionReuseTest.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.crunch.impl.mem;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.Set;
+
+import com.google.common.collect.ImmutableSet;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PGroupedTable;
+import org.apache.crunch.PTable;
+import org.apache.crunch.fn.IdentityFn;
+import org.apache.crunch.types.avro.Avros;
+import org.junit.Test;
+
+public class MemPipelinePCollectionReuseTest {
+
+ /**
+ * Specific test for the situation outlined in CRUNCH-607, which was that deriving two PCollections from the same
+ * PGroupedTable would throw an IllegalStateException from SingleUseIterable. This test simply ensures that the
+ * problem does not recur.
+ */
+ @Test
+ public void testGroupedCollectionReuse() {
+
+ PCollection<String> stringValues = MemPipeline.typedCollectionOf(Avros.strings(), "one", "two", "three");
+
+ PGroupedTable<String, String> groupedTable =
+ stringValues.by(IdentityFn.<String>getInstance(), Avros.strings()).groupByKey();
+
+ // Here we re-use the grouped table twice, meaning its internal iterators will need to be iterated multiple times
+ PTable<String, Integer> stringLengthTable =
+ groupedTable.mapValues(new MaxStringLengthFn(), Avros.ints());
+
+ // Prior to CRUNCH-607, this would fail with an IllegalStateException from SingleUseIterable
+ Set<String> keys = ImmutableSet.copyOf(groupedTable.ungroup().join(stringLengthTable).keys().materialize());
+
+ assertEquals(
+ ImmutableSet.of("one", "two", "three"),
+ keys);
+ }
+
+
+ public static class MaxStringLengthFn extends MapFn<Iterable<String>, Integer> {
+ @Override
+ public Integer map(Iterable<String> input) {
+ int maxLength = Integer.MIN_VALUE;
+ for (String inputString : input) {
+ maxLength = Math.max(maxLength, inputString.length());
+ }
+ return maxLength;
+ }
+ }
+
+
+}